Skip to content

Commit

Permalink
Synthesize FKs in parent->child order
Browse files Browse the repository at this point in the history
GitOrigin-RevId: ce78201e0f6bf3f9c796c0611799e4639ec91fc3
  • Loading branch information
mikeknep committed Oct 20, 2023
1 parent f64806b commit e0a51b8
Show file tree
Hide file tree
Showing 2 changed files with 95 additions and 12 deletions.
26 changes: 17 additions & 9 deletions src/gretel_trainer/relational/strategies/independent.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,18 +201,26 @@ def _synthesize_foreign_keys(
being referenced.
"""
processed = {}
for table_name, synth_data in synth_tables.items():
out_df = synth_data.copy()
for table_name in rel_data.list_tables_parents_before_children():
out_df = synth_tables.get(table_name)
if out_df is None:
continue
for foreign_key in rel_data.get_foreign_keys(table_name):
parent_synth_table = synth_tables.get(foreign_key.parent_table_name)
# We pull the parent from `processed` instead of from `synth_tables` because "this" table
# (`table_name` / `out_df`) may have a FK pointing to a parent column that _is itself_ a FK to some third table.
# We want to ensure the synthetic values we're using to populate "this" table's FK column are
# the final output values we've produced for its parent table.
# We are synthesizing foreign keys in parent->child order, so the parent table
# should(*) already exist in the processed dict with its final synthetic values...
parent_synth_table = processed.get(foreign_key.parent_table_name)
if parent_synth_table is None:
# Parent table generation job may have failed and therefore not be present in synth_tables.
# The synthetic data for this table may still be useful, but we do not have valid synthetic
# primary key values to set in this table's foreign key column. Instead of introducing dangling
# (*)...BUT the parent table generation job may have failed and therefore not be present in either `processed` or `synth_tables`.
# The synthetic data for "this" table may still be useful, but we do not have valid/any synthetic
# values from the parent to set in "this" table's foreign key column. Instead of introducing dangling
# pointers, set the entire column to None.
synth_pk_values = [None] * len(foreign_key.parent_columns)
synth_parent_values = [None] * len(foreign_key.parent_columns)
else:
synth_pk_values = parent_synth_table[
synth_parent_values = parent_synth_table[
foreign_key.parent_columns
].values.tolist()

Expand All @@ -222,7 +230,7 @@ def _synthesize_foreign_keys(
)

new_fk_values = _collect_values(
synth_pk_values, fk_frequencies, len(out_df)
synth_parent_values, fk_frequencies, len(out_df)
)

out_df[foreign_key.columns] = new_fk_values
Expand Down
81 changes: 78 additions & 3 deletions tests/relational/test_independent_strategy.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import json
import os
import tempfile

Expand All @@ -8,6 +7,7 @@
import pandas as pd
import pandas.testing as pdtest

from gretel_trainer.relational.core import RelationalData
from gretel_trainer.relational.strategies.independent import IndependentStrategy


Expand Down Expand Up @@ -110,8 +110,7 @@ def test_post_processing_one_to_one(pets):
}

# Normally we shuffle synthesized keys for realism, but for deterministic testing we sort instead
with patch("random.shuffle") as shuffle:
shuffle = sorted
with patch("random.shuffle", wraps=sorted):
processed = strategy.post_process_synthetic_results(
raw_synth_tables, [], pets, 1
)
Expand Down Expand Up @@ -169,3 +168,79 @@ def test_post_processing_foreign_keys_with_skewed_frequencies_and_different_size
fk_value_counts = sorted(list(fk_value_counts.values()))

assert fk_value_counts == [5, 5, 15, 30, 35, 60]


# In this scenario, a table (shipping_notifications) has a FK (customer_id) pointing to
# a column that is itself a FK but *not* a PK (orders.customer_id).
# (No, this is not a "perfectly normalized" schema, but it can happen in the wild.)
# We need to ensure tables have FKs synthesized in parent->child order to avoid blowing up
# due to missing columns.
def test_post_processing_fks_to_non_pks(tmpdir):
    """Check post-processing when a FK references a non-PK column that is itself a FK.

    The schema is customers <- orders <- shipping_notifications, where
    shipping_notifications.customer_id refers to orders.customer_id (a
    foreign key column on orders, not its primary key). After
    post-processing, every table must expose exactly the columns declared
    for it in the relational data.
    """
    rel_data = RelationalData(directory=tmpdir)

    # Source tables, registered in parent->child order; each uses "id" as PK.
    source_frames = {
        "customers": pd.DataFrame(
            data={"id": [1, 2], "name": ["Xavier", "Yesenia"]}
        ),
        "orders": pd.DataFrame(
            data={
                "id": [1, 2],
                "customer_id": [1, 2],
                "total": [42, 43],
            }
        ),
        "shipping_notifications": pd.DataFrame(
            data={
                "id": [1, 2],
                "order_id": [1, 2],
                "customer_id": [1, 2],
                "service": ["FedEx", "USPS"],
            }
        ),
    }
    for table_name, frame in source_frames.items():
        rel_data.add_table(name=table_name, primary_key="id", data=frame)

    # (child table, child column, parent table, parent column).
    # The last entry is the critical one: its target column is a FK, not a PK.
    fk_specs = [
        ("orders", "customer_id", "customers", "id"),
        ("shipping_notifications", "order_id", "orders", "id"),
        ("shipping_notifications", "customer_id", "orders", "customer_id"),
    ]
    for child, child_column, parent, parent_column in fk_specs:
        rel_data.add_foreign_key_constraint(
            table=child,
            constrained_columns=[child_column],
            referred_table=parent,
            referred_columns=[parent_column],
        )

    strategy = IndependentStrategy()

    # Deliberately ordered child->parent: the strategy is expected to process
    # tables in parent->child order regardless of this dict's ordering;
    # without that, this setup is expected to raise.
    raw_synth_tables = {
        "shipping_notifications": pd.DataFrame(data={"service": ["FedEx", "USPS"]}),
        "orders": pd.DataFrame(data={"total": [55, 56]}),
        "customers": pd.DataFrame(data={"name": ["Alice", "Bob"]}),
    }

    processed = strategy.post_process_synthetic_results(
        raw_synth_tables, [], rel_data, 1
    )

    for table in rel_data.list_all_tables():
        actual_columns = set(processed[table].columns)
        expected_columns = set(rel_data.get_table_columns(table))
        assert actual_columns == expected_columns

0 comments on commit e0a51b8

Please sign in to comment.