Synthesize FKs in parent->child order

GitOrigin-RevId: ce78201e0f6bf3f9c796c0611799e4639ec91fc3
gretelai · Oct 20, 2023 · e0a51b8 · e0a51b8
1 parent f64806b
commit e0a51b8
Show file tree

Hide file tree

Showing 2 changed files with 95 additions and 12 deletions.
diff --git a/src/gretel_trainer/relational/strategies/independent.py b/src/gretel_trainer/relational/strategies/independent.py
@@ -201,18 +201,26 @@ def _synthesize_foreign_keys(
     being referenced.
     """
     processed = {}
-    for table_name, synth_data in synth_tables.items():
-        out_df = synth_data.copy()
+    for table_name in rel_data.list_tables_parents_before_children():
+        out_df = synth_tables.get(table_name)
+        if out_df is None:
+            continue
         for foreign_key in rel_data.get_foreign_keys(table_name):
-            parent_synth_table = synth_tables.get(foreign_key.parent_table_name)
+            # We pull the parent from `processed` instead of from `synth_tables` because "this" table
+            # (`table_name` / `out_df`) may have a FK pointing to a parent column that _is itself_ a FK to some third table.
+            # We want to ensure the synthetic values we're using to populate "this" table's FK column are
+            # the final output values we've produced for its parent table.
+            # We are synthesizing foreign keys in parent->child order, so the parent table
+            # should(*) already exist in the processed dict with its final synthetic values...
+            parent_synth_table = processed.get(foreign_key.parent_table_name)
             if parent_synth_table is None:
-                # Parent table generation job may have failed and therefore not be present in synth_tables.
-                # The synthetic data for this table may still be useful, but we do not have valid synthetic
-                # primary key values to set in this table's foreign key column. Instead of introducing dangling
+                # (*)...BUT the parent table generation job may have failed, in which case the parent table is absent from both `processed` and `synth_tables`.
+                # The synthetic data for "this" table may still be useful, but we do not have any valid synthetic
+                # values from the parent to set in "this" table's foreign key column. Instead of introducing dangling
                 # pointers, set the entire column to None.
-                synth_pk_values = [None] * len(foreign_key.parent_columns)
+                synth_parent_values = [None] * len(foreign_key.parent_columns)
             else:
-                synth_pk_values = parent_synth_table[
+                synth_parent_values = parent_synth_table[
                     foreign_key.parent_columns
                 ].values.tolist()
 
@@ -222,7 +230,7 @@ def _synthesize_foreign_keys(
             )
 
             new_fk_values = _collect_values(
-                synth_pk_values, fk_frequencies, len(out_df)
+                synth_parent_values, fk_frequencies, len(out_df)
             )
 
             out_df[foreign_key.columns] = new_fk_values

diff --git a/tests/relational/test_independent_strategy.py b/tests/relational/test_independent_strategy.py
@@ -1,4 +1,3 @@
-import json
 import os
 import tempfile
 
@@ -8,6 +7,7 @@
 import pandas as pd
 import pandas.testing as pdtest
 
+from gretel_trainer.relational.core import RelationalData
 from gretel_trainer.relational.strategies.independent import IndependentStrategy
 
 
@@ -110,8 +110,7 @@ def test_post_processing_one_to_one(pets):
     }
 
     # Normally we shuffle synthesized keys for realism, but for deterministic testing we sort instead
-    with patch("random.shuffle") as shuffle:
-        shuffle = sorted
+    with patch("random.shuffle", wraps=sorted):
         processed = strategy.post_process_synthetic_results(
             raw_synth_tables, [], pets, 1
         )
@@ -169,3 +168,79 @@ def test_post_processing_foreign_keys_with_skewed_frequencies_and_different_size
     fk_value_counts = sorted(list(fk_value_counts.values()))
 
     assert fk_value_counts == [5, 5, 15, 30, 35, 60]
+
+
+# In this scenario, a table (shipping_notifications) has a FK (customer_id) pointing to
+# a column that is itself a FK but *not* a PK (orders.customer_id).
+# (No, this is not a "perfectly normalized" schema, but it can happen in the wild.)
+# We need to ensure tables have FKs synthesized in parent->child order; otherwise a child may
+# try to read a parent FK column that has not been populated yet and fail on the missing column.
+def test_post_processing_fks_to_non_pks(tmpdir):
+    rel_data = RelationalData(directory=tmpdir)
+
+    rel_data.add_table(
+        name="customers",
+        primary_key="id",
+        data=pd.DataFrame(data={"id": [1, 2], "name": ["Xavier", "Yesenia"]}),
+    )
+    rel_data.add_table(
+        name="orders",
+        primary_key="id",
+        data=pd.DataFrame(
+            data={
+                "id": [1, 2],
+                "customer_id": [1, 2],
+                "total": [42, 43],
+            }
+        ),
+    )
+    rel_data.add_table(
+        name="shipping_notifications",
+        primary_key="id",
+        data=pd.DataFrame(
+            data={
+                "id": [1, 2],
+                "order_id": [1, 2],
+                "customer_id": [1, 2],
+                "service": ["FedEx", "USPS"],
+            }
+        ),
+    )
+
+    # Add FKs. The third is the critical one: it refers to orders.customer_id, which is itself a FK and not a PK.
+    rel_data.add_foreign_key_constraint(
+        table="orders",
+        constrained_columns=["customer_id"],
+        referred_table="customers",
+        referred_columns=["id"],
+    )
+    rel_data.add_foreign_key_constraint(
+        table="shipping_notifications",
+        constrained_columns=["order_id"],
+        referred_table="orders",
+        referred_columns=["id"],
+    )
+    rel_data.add_foreign_key_constraint(
+        table="shipping_notifications",
+        constrained_columns=["customer_id"],
+        referred_table="orders",
+        referred_columns=["customer_id"],
+    )
+
+    strategy = IndependentStrategy()
+
+    # This dict is deliberately ordered child->parent, and each frame holds only non-key columns.
+    # If the strategy iterated it as given instead of processing tables in parent->child order,
+    # this setup would cause an exception.
+    raw_synth_tables = {
+        "shipping_notifications": pd.DataFrame(data={"service": ["FedEx", "USPS"]}),
+        "orders": pd.DataFrame(data={"total": [55, 56]}),
+        "customers": pd.DataFrame(data={"name": ["Alice", "Bob"]}),
+    }
+
+    processed = strategy.post_process_synthetic_results(
+        raw_synth_tables, [], rel_data, 1
+    )
+
+    for table in rel_data.list_all_tables():
+        assert set(processed[table].columns) == set(rel_data.get_table_columns(table))