From 8e651f223aeca9a472864db786fec1fb22e78fd0 Mon Sep 17 00:00:00 2001 From: Antoine Bon Date: Fri, 12 Dec 2025 16:15:06 +0100 Subject: [PATCH 1/5] wip --- mostlyai/engine/_tabular/probability.py | 5 +++- tests/end_to_end/test_tabular_interface.py | 34 ++++++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/mostlyai/engine/_tabular/probability.py b/mostlyai/engine/_tabular/probability.py index 74a1a094..18db5a30 100644 --- a/mostlyai/engine/_tabular/probability.py +++ b/mostlyai/engine/_tabular/probability.py @@ -245,6 +245,7 @@ def _generate_marginal_probs( tgt_stats: dict, seed_columns: list[str], device: torch.device, + n_samples: int, ctx_data: pd.DataFrame | None = None, ctx_stats: dict | None = None, fixed_probs: dict | None = None, @@ -259,6 +260,7 @@ def _generate_marginal_probs( tgt_stats: Target statistics seed_columns: Seed column names in original format, in correct order device: Device for computation + n_samples: Number of samples to generate probabilities for ctx_data: Optional context data ctx_stats: Optional context statistics (required if ctx_data provided) fixed_probs: Optional fixed probabilities for rare token handling @@ -266,7 +268,6 @@ def _generate_marginal_probs( Returns: DataFrame of shape (n_samples, cardinality) with probabilities and column names """ - n_samples = len(seed_encoded) target_stats = tgt_stats["columns"][target_column] # Build fixed_values dict from seed_encoded @@ -407,6 +408,7 @@ def predict_proba( tgt_stats=tgt_stats, seed_columns=seed_columns, device=device, + n_samples=n_samples, ctx_data=ctx_data, ctx_stats=ctx_stats, fixed_probs=fixed_probs, @@ -490,6 +492,7 @@ def predict_proba( tgt_stats=tgt_stats, seed_columns=extended_seed_columns, device=device, + n_samples=n_samples * num_prev_combos, ctx_data=batched_ctx_data, ctx_stats=ctx_stats, fixed_probs=fixed_probs, diff --git a/tests/end_to_end/test_tabular_interface.py b/tests/end_to_end/test_tabular_interface.py index 601049db..c7229e35 100644 --- a/tests/end_to_end/test_tabular_interface.py +++ b/tests/end_to_end/test_tabular_interface.py @@ -368,6 +368,40 @@ def test_predict_proba_wrong_column_order_raises(self, classification_data, tmp_ with pytest.raises(ValueError, match="(?i)column order.*does not match"): argn.predict_proba(test_X, target="target") + def test_predict_proba_with_context_only(self, tmp_path_factory): + """Test predict_proba() when X only contains join key and actual features are in context.""" + # Create data where target depends on context features + df = pd.DataFrame({ + "id": range(300), + "ctx_a": ["a1", "a2", "a3"] * 100, + "ctx_b": ["b1", "b2", "b3"] * 100, + "target": ["c1", "c2", "c3"] * 100, + }) + + # Train with context + argn = TabularARGN( + enable_flexible_generation=False, + verbose=0, + max_epochs=5, + ctx_data=df[["id", "ctx_a", "ctx_b"]], + ctx_primary_key="id", + tgt_context_key="id", + workspace_dir=tmp_path_factory.mktemp("workspace"), + ) + argn.fit(X=df[["id", "target"]]) + + # Predict probabilities with only join key in X (all features in context) + test_df = df.head(10) + proba = argn.predict_proba( + X=test_df[["id"]], ctx_data=test_df[["id", "ctx_a", "ctx_b"]], target="target" + ) + + # Verify probabilities + assert proba.shape[0] == 10 + assert proba.shape[1] >= 3 # At least 3 classes (c1, c2, c3) + # Verify probabilities sum to 1.0 for each sample + np.testing.assert_allclose(proba.sum(axis=1), 1.0, rtol=1e-5) + class TestTabularARGNRegression: """Test regression: predict numeric target.""" From 58a12cd662047549dd282ed2d693fb752c8adc24 Mon Sep 17 00:00:00 2001 From: Antoine Bon Date: Fri, 12 Dec 2025 16:15:33 +0100 Subject: [PATCH 2/5] wip --- tests/end_to_end/test_tabular_interface.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/end_to_end/test_tabular_interface.py b/tests/end_to_end/test_tabular_interface.py index c7229e35..8d461cd8 100644 --- a/tests/end_to_end/test_tabular_interface.py +++ b/tests/end_to_end/test_tabular_interface.py @@ -371,12 +371,14 @@ def test_predict_proba_wrong_column_order_raises(self, classification_data, tmp_ def test_predict_proba_with_context_only(self, tmp_path_factory): """Test predict_proba() when X only contains join key and actual features are in context.""" # Create data where target depends on context features - df = pd.DataFrame({ - "id": range(300), - "ctx_a": ["a1", "a2", "a3"] * 100, - "ctx_b": ["b1", "b2", "b3"] * 100, - "target": ["c1", "c2", "c3"] * 100, - }) + df = pd.DataFrame( + { + "id": range(300), + "ctx_a": ["a1", "a2", "a3"] * 100, + "ctx_b": ["b1", "b2", "b3"] * 100, + "target": ["c1", "c2", "c3"] * 100, + } + ) # Train with context argn = TabularARGN( @@ -392,9 +394,7 @@ def test_predict_proba_with_context_only(self, tmp_path_factory): # Predict probabilities with only join key in X (all features in context) test_df = df.head(10) - proba = argn.predict_proba( - X=test_df[["id"]], ctx_data=test_df[["id", "ctx_a", "ctx_b"]], target="target" - ) + proba = argn.predict_proba(X=test_df[["id"]], ctx_data=test_df[["id", "ctx_a", "ctx_b"]], target="target") # Verify probabilities assert proba.shape[0] == 10 From 528160029ff8925f1f16f7e43a67dc12a2afa5d6 Mon Sep 17 00:00:00 2001 From: Antoine Bon Date: Fri, 12 Dec 2025 16:37:31 +0100 Subject: [PATCH 3/5] wip --- mostlyai/engine/_tabular/probability.py | 10 +++--- tests/end_to_end/test_tabular_interface.py | 37 ++++++++++++++++++++++ uv.lock | 2 +- 3 files changed, 44 insertions(+), 5 deletions(-) diff --git a/mostlyai/engine/_tabular/probability.py b/mostlyai/engine/_tabular/probability.py index 18db5a30..a97a9f3e 100644 --- a/mostlyai/engine/_tabular/probability.py +++ b/mostlyai/engine/_tabular/probability.py @@ -453,10 +453,10 @@ def predict_proba( # Build DataFrames for each combo with actual values, then concatenate combo_dfs = [] for combo_idx, prev_combo in enumerate(prev_combos): - # Copy extended_seed for this combo - df = extended_seed.copy() + # Build data dict starting with columns from extended_seed + data = {col: extended_seed[col].values for col in extended_seed.columns} - # Add previous target columns with actual values (no dummy values) + # Add previous target columns with actual values for i in range(target_idx): prev_target_col = target_columns[i] encoded_val = prev_combo[i] @@ -468,8 +468,10 @@ def predict_proba( argn_column=prev_target_stats[ARGN_COLUMN], argn_sub_column=sub_col_key, ) - df[full_sub_col_name] = encoded_val + data[full_sub_col_name] = encoded_val + # Create DataFrame with explicit row count + df = pd.DataFrame(data, index=range(n_samples)) combo_dfs.append(df) # Concatenate all combo DataFrames into single batch diff --git a/tests/end_to_end/test_tabular_interface.py b/tests/end_to_end/test_tabular_interface.py index 8d461cd8..23150f85 100644 --- a/tests/end_to_end/test_tabular_interface.py +++ b/tests/end_to_end/test_tabular_interface.py @@ -402,6 +402,43 @@ def test_predict_proba_with_context_only(self, tmp_path_factory): # Verify probabilities sum to 1.0 for each sample np.testing.assert_allclose(proba.sum(axis=1), 1.0, rtol=1e-5) + def test_predict_proba_multi_target_with_context_only(self, tmp_path_factory): + """Test predict_proba() for multiple targets when X only contains join key.""" + # Create data where targets depend on context features + df = pd.DataFrame( + { + "id": range(300), + "ctx_a": ["a1", "a2", "a3"] * 100, + "target_b": ["b1", "b2", "b3"] * 100, + "target_c": ["c1", "c2", "c3"] * 100, + } + ) + + # Train with context + argn = TabularARGN( + enable_flexible_generation=False, + verbose=0, + max_epochs=5, + ctx_data=df[["id", "ctx_a"]], + ctx_primary_key="id", + tgt_context_key="id", + workspace_dir=tmp_path_factory.mktemp("workspace"), + ) + argn.fit(X=df[["id", "target_b", "target_c"]]) + + # Predict joint probabilities with only join key in X + test_df = df.head(10) + proba = argn.predict_proba( + X=test_df[["id"]], ctx_data=test_df[["id", "ctx_a"]], target=["target_b", "target_c"] + ) + + # Verify joint probabilities + assert proba.shape[0] == 10 + # Joint probability has product of cardinalities: 4 (b) × 4 (c) = 16 + assert proba.shape[1] == 16 + # Verify probabilities sum to 1.0 for each sample + np.testing.assert_allclose(proba.sum(axis=1), 1.0, rtol=1e-5) + class TestTabularARGNRegression: """Test regression: predict numeric target.""" diff --git a/uv.lock b/uv.lock index 5c767336..b322e48c 100644 --- a/uv.lock +++ b/uv.lock @@ -2282,7 +2282,7 @@ wheels = [ [[package]] name = "mostlyai-engine" -version = "2.3.1" +version = "2.3.3" source = { editable = "." } dependencies = [ { name = "accelerate" }, From cb964309e704d3761abed1876a36ceef0b1e2d81 Mon Sep 17 00:00:00 2001 From: Antoine Bon Date: Fri, 12 Dec 2025 16:42:03 +0100 Subject: [PATCH 4/5] wip --- tests/end_to_end/test_tabular_interface.py | 67 +++++++--------------- 1 file changed, 22 insertions(+), 45 deletions(-) diff --git a/tests/end_to_end/test_tabular_interface.py b/tests/end_to_end/test_tabular_interface.py index 23150f85..902d0f4e 100644 --- a/tests/end_to_end/test_tabular_interface.py +++ b/tests/end_to_end/test_tabular_interface.py @@ -369,75 +369,52 @@ def test_predict_proba_wrong_column_order_raises(self, classification_data, tmp_ argn.predict_proba(test_X, target="target") def test_predict_proba_with_context_only(self, tmp_path_factory): - """Test predict_proba() when X only contains join key and actual features are in context.""" - # Create data where target depends on context features + """Test predict_proba() when X only contains join key (single and multi-target).""" df = pd.DataFrame( { "id": range(300), "ctx_a": ["a1", "a2", "a3"] * 100, - "ctx_b": ["b1", "b2", "b3"] * 100, - "target": ["c1", "c2", "c3"] * 100, + "target_b": ["b1", "b2", "b3"] * 100, + "target_c": ["c1", "c2", "c3"] * 100, } ) + test_df = df.head(10) - # Train with context - argn = TabularARGN( + # Test single target + argn_single = TabularARGN( enable_flexible_generation=False, verbose=0, max_epochs=5, - ctx_data=df[["id", "ctx_a", "ctx_b"]], + ctx_data=df[["id", "ctx_a"]], ctx_primary_key="id", tgt_context_key="id", - workspace_dir=tmp_path_factory.mktemp("workspace"), + workspace_dir=tmp_path_factory.mktemp("workspace_single"), ) - argn.fit(X=df[["id", "target"]]) - - # Predict probabilities with only join key in X (all features in context) - test_df = df.head(10) - proba = argn.predict_proba(X=test_df[["id"]], ctx_data=test_df[["id", "ctx_a", "ctx_b"]], target="target") - - # Verify probabilities - assert proba.shape[0] == 10 - assert proba.shape[1] >= 3 # At least 3 classes (c1, c2, c3) - # Verify probabilities sum to 1.0 for each sample - np.testing.assert_allclose(proba.sum(axis=1), 1.0, rtol=1e-5) - - def test_predict_proba_multi_target_with_context_only(self, tmp_path_factory): - """Test predict_proba() for multiple targets when X only contains join key.""" - # Create data where targets depend on context features - df = pd.DataFrame( - { - "id": range(300), - "ctx_a": ["a1", "a2", "a3"] * 100, - "target_b": ["b1", "b2", "b3"] * 100, - "target_c": ["c1", "c2", "c3"] * 100, - } + argn_single.fit(X=df[["id", "target_b"]]) + proba_single = argn_single.predict_proba( + X=test_df[["id"]], ctx_data=test_df[["id", "ctx_a"]], target="target_b" ) + assert proba_single.shape[0] == 10 + assert proba_single.shape[1] >= 3 + np.testing.assert_allclose(proba_single.sum(axis=1), 1.0, rtol=1e-5) - # Train with context - argn = TabularARGN( + # Test multiple targets (joint probabilities) + argn_multi = TabularARGN( enable_flexible_generation=False, verbose=0, max_epochs=5, ctx_data=df[["id", "ctx_a"]], ctx_primary_key="id", tgt_context_key="id", - workspace_dir=tmp_path_factory.mktemp("workspace"), + workspace_dir=tmp_path_factory.mktemp("workspace_multi"), ) - argn.fit(X=df[["id", "target_b", "target_c"]]) - - # Predict joint probabilities with only join key in X - test_df = df.head(10) - proba = argn.predict_proba( + argn_multi.fit(X=df[["id", "target_b", "target_c"]]) + proba_multi = argn_multi.predict_proba( X=test_df[["id"]], ctx_data=test_df[["id", "ctx_a"]], target=["target_b", "target_c"] ) - - # Verify joint probabilities - assert proba.shape[0] == 10 - # Joint probability has product of cardinalities: 4 (b) × 4 (c) = 16 - assert proba.shape[1] == 16 - # Verify probabilities sum to 1.0 for each sample - np.testing.assert_allclose(proba.sum(axis=1), 1.0, rtol=1e-5) + assert proba_multi.shape[0] == 10 + assert proba_multi.shape[1] == 16 # 4 × 4 = 16 combinations + np.testing.assert_allclose(proba_multi.sum(axis=1), 1.0, rtol=1e-5) class TestTabularARGNRegression: From a336bd2e552de1989dd0901219e790b89590bcf0 Mon Sep 17 00:00:00 2001 From: Antoine Bon Date: Fri, 12 Dec 2025 17:35:15 +0100 Subject: [PATCH 5/5] wip --- tests/end_to_end/test_tabular_interface.py | 48 ---------------------- 1 file changed, 48 deletions(-) diff --git a/tests/end_to_end/test_tabular_interface.py b/tests/end_to_end/test_tabular_interface.py index 902d0f4e..601049db 100644 --- a/tests/end_to_end/test_tabular_interface.py +++ b/tests/end_to_end/test_tabular_interface.py @@ -368,54 +368,6 @@ def test_predict_proba_wrong_column_order_raises(self, classification_data, tmp_ with pytest.raises(ValueError, match="(?i)column order.*does not match"): argn.predict_proba(test_X, target="target") - def test_predict_proba_with_context_only(self, tmp_path_factory): - """Test predict_proba() when X only contains join key (single and multi-target).""" - df = pd.DataFrame( - { - "id": range(300), - "ctx_a": ["a1", "a2", "a3"] * 100, - "target_b": ["b1", "b2", "b3"] * 100, - "target_c": ["c1", "c2", "c3"] * 100, - } - ) - test_df = df.head(10) - - # Test single target - argn_single = TabularARGN( - enable_flexible_generation=False, - verbose=0, - max_epochs=5, - ctx_data=df[["id", "ctx_a"]], - ctx_primary_key="id", - tgt_context_key="id", - workspace_dir=tmp_path_factory.mktemp("workspace_single"), - ) - argn_single.fit(X=df[["id", "target_b"]]) - proba_single = argn_single.predict_proba( - X=test_df[["id"]], ctx_data=test_df[["id", "ctx_a"]], target="target_b" - ) - assert proba_single.shape[0] == 10 - assert proba_single.shape[1] >= 3 - np.testing.assert_allclose(proba_single.sum(axis=1), 1.0, rtol=1e-5) - - # Test multiple targets (joint probabilities) - argn_multi = TabularARGN( - enable_flexible_generation=False, - verbose=0, - max_epochs=5, - ctx_data=df[["id", "ctx_a"]], - ctx_primary_key="id", - tgt_context_key="id", - workspace_dir=tmp_path_factory.mktemp("workspace_multi"), - ) - argn_multi.fit(X=df[["id", "target_b", "target_c"]]) - proba_multi = argn_multi.predict_proba( - X=test_df[["id"]], ctx_data=test_df[["id", "ctx_a"]], target=["target_b", "target_c"] - ) - assert proba_multi.shape[0] == 10 - assert proba_multi.shape[1] == 16 # 4 × 4 = 16 combinations - np.testing.assert_allclose(proba_multi.sum(axis=1), 1.0, rtol=1e-5) - class TestTabularARGNRegression: """Test regression: predict numeric target."""