Skip to content

Commit 0b47c1b

Browse files
authored
Merge pull request #306 from posit-dev/feat-tbl-match
feat: add the `tbl_match()` validation method
2 parents 53b198f + 01a8633 commit 0b47c1b

File tree

7 files changed

+2560
-3
lines changed

7 files changed

+2560
-3
lines changed

docs/_quarto.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ quartodoc:
170170
- name: Validate.col_schema_match
171171
- name: Validate.row_count_match
172172
- name: Validate.col_count_match
173+
- name: Validate.tbl_match
173174
- name: Validate.conjointly
174175
- name: Validate.specially
175176
- name: Validate.prompt

pointblank/_constants.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
"col_schema_match": "col_schema_match",
4949
"row_count_match": "row_count_match",
5050
"col_count_match": "col_count_match",
51+
"tbl_match": "tbl_match",
5152
"conjointly": "conjointly",
5253
"specially": "specially",
5354
}
@@ -513,6 +514,25 @@
513514
<path d="M11.5931863,12.5146694 C11.3836625,12.5146694 10.212234,12.5646694 10.212234,13.8480027 L10.212234,53.181336 C10.212234,54.4646694 11.3836625,54.5146694 11.5931863,54.5146694 L14.1646149,54.5146694 L14.1646149,12.5146694 L11.5931863,12.5146694 Z M20.1721771,12.5146694 L20.1721771,54.5146694 L16.2522908,54.5146694 L16.2522908,54.5146694 L16.2522908,12.5146694 L16.2522908,12.5146694 L20.1721771,12.5146694 Z M24.8656149,12.5150904 C25.1448786,12.521763 26.212234,12.6230027 26.212234,13.8480027 L26.212234,13.8480027 L26.212234,53.181336 C26.212234,54.4646694 25.0408054,54.5146694 24.8312816,54.5146694 L24.8312816,54.5146694 L22.259853,54.5146694 L22.259853,12.5146694 Z" id="rows_one" fill="#000000" fill-rule="nonzero" transform="translate(18.212234, 33.514669) rotate(-180.000000) translate(-18.212234, -33.514669) "></path>
514515
</g>
515516
</g>
517+
</svg>""",
518+
"tbl_match": """<?xml version="1.0" encoding="UTF-8"?>
519+
<svg width="67px" height="67px" viewBox="0 0 67 67" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
520+
<title>tbl_match</title>
521+
<g id="All-Icons" stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
522+
<g id="tbl_match" transform="translate(0.000000, 0.758621)">
523+
<path d="M56.712234,1.01466935 C59.1975153,1.01466935 61.4475153,2.02202867 63.076195,3.65070832 C64.7048747,5.27938798 65.712234,7.52938798 65.712234,10.0146694 L65.712234,10.0146694 L65.712234,65.0146694 L10.712234,65.0146694 C8.22695259,65.0146694 5.97695259,64.00731 4.34827294,62.3786304 C2.71959328,60.7499507 1.71223397,58.4999507 1.71223397,56.0146694 L1.71223397,56.0146694 L1.71223397,10.0146694 C1.71223397,7.52938798 2.71959328,5.27938798 4.34827294,3.65070832 C5.97695259,2.02202867 8.22695259,1.01466935 10.712234,1.01466935 L10.712234,1.01466935 Z" id="rectangle" stroke="#000000" stroke-width="2" fill="#FFFFFF"></path>
524+
<g id="equal" transform="translate(46.026611, 20.710122) rotate(-90.000000) translate(-46.026611, -20.710122) translate(42.526611, 16.210122)" stroke="#000000" stroke-linecap="square">
525+
<line x1="2.21223397" y1="0.514669353" x2="2.21223397" y2="7.58573716" id="Line"></line>
526+
<line x1="5.21223397" y1="0.514669353" x2="5.21223397" y2="7.58573716" id="Line"></line>
527+
</g>
528+
<g id="equal" transform="translate(21.397857, 45.319217) rotate(-90.000000) translate(-21.397857, -45.319217) translate(17.897857, 40.819217)" stroke="#000000" stroke-linecap="square">
529+
<line x1="2.21223397" y1="0.514669353" x2="2.21223397" y2="7.58573716" id="Line"></line>
530+
<line x1="5.21223397" y1="0.514669353" x2="5.21223397" y2="7.58573716" id="Line"></line>
531+
</g>
532+
<path d="M21.3882419,7.77869783 C21.3584298,7.77935177 21.328704,7.78216367 21.2992996,7.78711914 L9.09014824,7.78711914 C8.75029431,7.78715312 8.47479677,8.06265067 8.47476279,8.4025046 L8.47476279,16.2991498 C8.46377874,16.3656061 8.46377874,16.4334149 8.47476279,16.4998713 L8.47476279,24.9145461 C8.46377874,24.9810025 8.46377874,25.0488112 8.47476279,25.1152676 L8.47476279,33.0179226 C8.47479677,33.3577766 8.75029431,33.6332741 9.09014824,33.6333081 L21.2944916,33.6333081 C21.3609479,33.6442921 21.4287567,33.6442921 21.4952131,33.6333081 L33.7055663,33.6333081 C34.0454202,33.6332741 34.3209178,33.3577766 34.3209517,33.0179226 L34.3209517,25.1212775 C34.3319358,25.0548211 34.3319358,24.9870123 34.3209517,24.920556 L34.3209517,16.5058811 C34.3319358,16.4394248 34.3319358,16.371616 34.3209517,16.3051596 L34.3209517,8.4025046 C34.3209178,8.06265067 34.0454202,7.78715312 33.7055663,7.78711914 L21.4928094,7.78711914 C21.4582565,7.78134369 21.4232736,7.77869783 21.3882419,7.77869783 Z M9.70553369,9.01789005 L20.7824718,9.01789005 L20.7824718,15.78713 L9.70553369,15.78713 L9.70553369,9.01789005 Z M22.0132427,9.01789005 L33.0901808,9.01789005 L33.0901808,15.78713 L22.0132427,15.78713 L22.0132427,9.01789005 Z M9.70553369,17.0179009 L20.7824718,17.0179009 L20.7824718,24.4025263 L9.70553369,24.4025263 L9.70553369,17.0179009 Z M22.0132427,17.0179009 L33.0901808,17.0179009 L33.0901808,24.4025263 L22.0132427,24.4025263 L22.0132427,17.0179009 Z M9.70553369,25.6332972 L20.7824718,25.6332972 L20.7824718,32.4025372 L9.70553369,32.4025372 L9.70553369,25.6332972 Z M22.0132427,25.6332972 L33.0901808,25.6332972 L33.0901808,32.4025372 L22.0132427,32.4025372 L22.0132427,25.6332972 Z" id="table" fill="#000000" fill-rule="nonzero"></path>
533+
<path d="M46.0169953,32.3877926 C45.9871832,32.3884465 45.9574575,32.3912584 45.928053,32.3962139 L33.7189016,32.3962139 C33.3790477,32.3962479 33.1035502,32.6717454 33.1035162,33.0115993 L33.1035162,40.9082445 C33.0925322,40.9747009 33.0925322,41.0425097 33.1035162,41.108966 L33.1035162,49.5236408 C33.0925322,49.5900972 33.0925322,49.657906 33.1035162,49.7243624 L33.1035162,57.6270174 C33.1035502,57.9668713 33.3790477,58.2423689 33.7189016,58.2424028 L45.923245,58.2424028 C45.9897014,58.2533869 46.0575101,58.2533869 46.1239665,58.2424028 L58.3343197,58.2424028 C58.6741736,58.2423689 58.9496712,57.9668713 58.9497051,57.6270174 L58.9497051,49.7303722 C58.9606892,49.6639158 58.9606892,49.5961071 58.9497051,49.5296507 L58.9497051,41.1149759 C58.9606892,41.0485195 58.9606892,40.9807107 58.9497051,40.9142544 L58.9497051,33.0115993 C58.9496712,32.6717454 58.6741736,32.3962479 58.3343197,32.3962139 L46.1215628,32.3962139 C46.0870099,32.3904384 46.052027,32.3877926 46.0169953,32.3877926 Z M34.3342871,33.6269848 L45.4112252,33.6269848 L45.4112252,40.3962248 L34.3342871,40.3962248 L34.3342871,33.6269848 Z M46.6419961,33.6269848 L57.7189342,33.6269848 L57.7189342,40.3962248 L46.6419961,40.3962248 L46.6419961,33.6269848 Z M34.3342871,41.6269957 L45.4112252,41.6269957 L45.4112252,49.0116211 L34.3342871,49.0116211 L34.3342871,41.6269957 Z M46.6419961,41.6269957 L57.7189342,41.6269957 L57.7189342,49.0116211 L46.6419961,49.0116211 L46.6419961,41.6269957 Z M34.3342871,50.242392 L45.4112252,50.242392 L45.4112252,57.0116319 L34.3342871,57.0116319 L34.3342871,50.242392 Z M46.6419961,50.242392 L57.7189342,50.242392 L57.7189342,57.0116319 L46.6419961,57.0116319 L46.6419961,50.242392 Z" id="table" fill="#000000" fill-rule="nonzero"></path>
534+
</g>
535+
</g>
516536
</svg>""",
517537
"col_vals_expr": """<?xml version="1.0" encoding="UTF-8"?>
518538
<svg width="67px" height="66px" viewBox="0 0 67 66" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">

pointblank/_interrogation.py

Lines changed: 308 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -757,6 +757,311 @@ def col_count_match(data_tbl: FrameT, count, inverse: bool) -> bool:
757757
return get_column_count(data=data_tbl) != count
758758

759759

760+
def _coerce_to_common_backend(data_tbl: FrameT, tbl_compare: FrameT) -> tuple[FrameT, FrameT]:
    """
    Bring two tables onto one backend when their detected backends differ.

    The backend of each table is looked up with `_get_tbl_type()`. When both
    lookups agree, the inputs are handed back untouched. Otherwise any table
    whose backend label is one of the database labels listed below is
    materialized in memory, and finally a Polars/Pandas mismatch is resolved by
    converting the comparison table to the data table's library.

    Every conversion is best-effort: if a conversion call raises, the table is
    left as it was and processing continues, so the returned pair is not
    guaranteed to share a backend.

    Parameters
    ----------
    data_tbl
        The primary table. It is only converted when its own backend label is
        one of the database labels.
    tbl_compare
        The comparison table, converted toward the data table's backend.

    Returns
    -------
    tuple[FrameT, FrameT]
        `(data_tbl, tbl_compare)` after any conversions that succeeded.
    """

    def _materialize(table, current_label, polars_first):
        # Try each materialization call in turn and report the backend label the
        # result is treated as. If every attempt raises, the table and its label
        # are returned unchanged.
        attempts = [("to_polars", "polars")] if polars_first else []
        attempts += [("execute", "pandas"), ("to_pandas", "pandas")]

        for method_name, new_label in attempts:
            try:
                return getattr(table, method_name)(), new_label
            except Exception:
                continue

        return table, current_label

    primary_kind = _get_tbl_type(data_tbl)
    other_kind = _get_tbl_type(tbl_compare)

    # Same backend on both sides: nothing to coerce
    if primary_kind == other_kind:
        return data_tbl, tbl_compare

    # Backend labels treated as database (Ibis) tables needing materialization
    # NOTE(review): confirm these labels match what `_get_tbl_type()` actually
    # returns (e.g., "postgres" vs. "postgresql")
    db_kinds = {"duckdb", "sqlite", "postgres", "mysql", "snowflake", "bigquery"}

    # Comparison table first: go straight to Polars only when the data table is
    # Polars; in every other case aim for Pandas
    if other_kind in db_kinds:
        tbl_compare, other_kind = _materialize(
            tbl_compare, other_kind, polars_first=(primary_kind == "polars")
        )

    # A database-backed data table always tries Polars before Pandas
    if primary_kind in db_kinds:
        data_tbl, primary_kind = _materialize(data_tbl, primary_kind, polars_first=True)

    # Resolve a remaining Polars/Pandas mismatch in favor of the data table
    if primary_kind == "polars" and other_kind == "pandas":
        try:
            import polars as pl

            tbl_compare = pl.from_pandas(tbl_compare)
        except Exception:
            # Conversion failed: keep the comparison table as it is
            pass

    elif primary_kind == "pandas" and other_kind == "polars":
        try:
            tbl_compare = tbl_compare.to_pandas()
        except Exception:
            # Conversion failed: keep the comparison table as it is
            pass

    return data_tbl, tbl_compare
861+
862+
863+
def tbl_match(data_tbl: FrameT, tbl_compare: FrameT) -> bool:
    """
    Check if two tables match exactly in schema, row count, and data.

    The comparison runs progressively stricter checks and stops at the first
    one that fails:

    1. Column count match
    2. Row count match
    3. Schema match (case-insensitive column names, any order)
    4. Schema match (case-insensitive column names, correct order)
    5. Schema match (case-sensitive column names, correct order)
    6. Data match: compares values column-by-column, cell-by-cell

    Before any check, both tables are passed through
    `_coerce_to_common_backend()` so that tables from different backends
    (e.g., one Polars and one Pandas) can be compared. In the data stage, `None`
    and NaN are both treated as missing values and a missing value only matches
    another missing value.

    Parameters
    ----------
    data_tbl
        The target table to validate.
    tbl_compare
        The comparison table to validate against.

    Returns
    -------
    bool
        `True` if the tables match completely, `False` otherwise.
    """
    from pointblank.schema import Schema, _check_schema_match
    from pointblank.validate import get_column_count, get_row_count

    # Coerce to common backend if needed
    data_tbl, tbl_compare = _coerce_to_common_backend(data_tbl, tbl_compare)

    # Convert both tables to narwhals for compatibility
    tbl = _convert_to_narwhals(df=data_tbl)
    tbl_cmp = _convert_to_narwhals(df=tbl_compare)

    # Stage 1: Check column count (least stringent)
    if get_column_count(data=data_tbl) != get_column_count(data=tbl_compare):
        return False

    # Stage 2: Check row count
    row_count = get_row_count(data=data_tbl)

    if row_count != get_row_count(data=tbl_compare):
        return False

    # Stages 3-5: Schema checks with increasing strictness, expressed as
    # (in_order, case_sensitive_colnames) pairs; dtype matching stays lenient
    schema = Schema(tbl=tbl_compare)

    for in_order, case_sensitive_colnames in ((False, False), (True, False), (True, True)):
        schema_matching = _check_schema_match(
            data_tbl=data_tbl,
            schema=schema,
            complete=True,
            in_order=in_order,
            case_sensitive_colnames=case_sensitive_colnames,
            case_sensitive_dtypes=False,
            full_match_dtypes=False,
        )

        if not schema_matching:
            return False

    # Stage 6: Check for exact data by cell across matched columns (most stringent)
    # Two zero-row tables with matching schemas have no cells to compare
    if row_count == 0:
        return True

    for col_name in tbl.columns:
        values_1 = _tbl_match_column_values(tbl.select(col_name), col_name)
        values_2 = _tbl_match_column_values(tbl_cmp.select(col_name), col_name)

        if len(values_1) != len(values_2):
            return False

        if not all(_tbl_match_values_equal(v1, v2) for v1, v2 in zip(values_1, values_2)):
            return False

    return True


def _tbl_match_column_values(frame, col_name):
    """Return the values of the single column `col_name` in a narwhals frame as a list."""
    # Lazy frames have to be collected before their values can be read
    if hasattr(frame, "collect"):
        frame = frame.collect()

    native = frame.to_native()
    column = native[col_name]

    # The extraction method is chosen from the attributes of this table's own
    # native object, so each table is handled by what it supports
    if hasattr(native, "to_list"):
        return column.to_list()

    if hasattr(native, "tolist"):
        return column.tolist()

    if hasattr(native, "collect"):
        return column.to_pandas().tolist()

    # Fallback: plain iteration over the column
    return list(column)


def _tbl_match_is_null(value) -> bool:
    """Return `True` if `value` is `None` or a NaN that `math.isnan()` recognizes."""
    import math

    if value is None:
        return True

    try:
        return bool(math.isnan(value))
    except (TypeError, ValueError, OverflowError):
        # Non-numeric values (TypeError/ValueError) and ints too large to be
        # converted to a float (OverflowError) are not NaN
        return False


def _tbl_match_values_equal(v1, v2) -> bool:
    """Return `True` if two cell values match, treating `None` and NaN as equal nulls."""
    v1_is_null = _tbl_match_is_null(v1)
    v2_is_null = _tbl_match_is_null(v2)

    # Two nulls match; a null never matches a non-null value
    if v1_is_null or v2_is_null:
        return v1_is_null and v2_is_null

    try:
        return not (v1 != v2)
    except (TypeError, ValueError):
        # The truth value of the comparison was ambiguous or unsupported
        # (e.g., array-like cells); fall through to an element-wise check
        pass

    try:
        if isinstance(v1, list) and isinstance(v2, list):
            return v1 == v2

        comparison = v1 == v2

        # Array-like comparison results are reduced with `.all()`; anything else
        # is interpreted as a plain truth value
        if hasattr(comparison, "all"):
            return bool(comparison.all())

        return bool(comparison)
    except Exception:
        # Values that cannot be compared at all are treated as a mismatch
        return False
1063+
1064+
7601065
def conjointly_validation(data_tbl: FrameT, expressions, threshold: int, tbl_type: str = "local"):
7611066
"""
7621067
Perform conjoint validation using multiple expressions.
@@ -1629,7 +1934,7 @@ def interrogate_outside(
16291934
pb_is_good_4=nw.lit(na_pass), # Pass if any Null in lb, val, or ub
16301935
)
16311936

1632-
# Note: Logic is inverted for "outside" - when inclusive[0] is True,
1937+
# Note: Logic is inverted for "outside"; when inclusive[0] is True,
16331938
# we want values < low_val (not <= low_val) to be "outside"
16341939
if inclusive[0]:
16351940
result_tbl = result_tbl.with_columns(pb_is_good_5=nw.col(column) < low_val)
@@ -1852,7 +2157,7 @@ def interrogate_within_spec(tbl: FrameT, column: str, values: dict, na_pass: boo
18522157
if hasattr(native_tbl, "execute"):
18532158
native_tbl = native_tbl.execute()
18542159

1855-
# Add validation column - convert native table to Series, then back through Narwhals
2160+
# Add validation column: convert native table to Series, then back through Narwhals
18562161
if is_polars_dataframe(native_tbl):
18572162
import polars as pl
18582163

@@ -2095,7 +2400,7 @@ def interrogate_credit_card_db(
20952400
# Get the column as an Ibis expression
20962401
col_expr = native_tbl[column]
20972402

2098-
# Step 1: Clean the input - remove spaces and hyphens
2403+
# Step 1: Clean the input by removing spaces and hyphens
20992404
# First check format: only digits, spaces, and hyphens allowed
21002405
valid_chars = col_expr.re_search(r"^[0-9\s\-]+$").notnull()
21012406

pointblank/_utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -683,6 +683,7 @@ def _get_api_text() -> str:
683683
"Validate.col_schema_match",
684684
"Validate.row_count_match",
685685
"Validate.col_count_match",
686+
"Validate.tbl_match",
686687
"Validate.conjointly",
687688
"Validate.specially",
688689
"Validate.prompt",

0 commit comments

Comments
 (0)