@@ -757,6 +757,311 @@ def col_count_match(data_tbl: FrameT, count, inverse: bool) -> bool:
         return get_column_count(data=data_tbl) != count
 
 
+def _coerce_to_common_backend(data_tbl: FrameT, tbl_compare: FrameT) -> tuple[FrameT, FrameT]:
+    """
+    Coerce two tables to the same backend if they differ.
+
+    If the tables to compare have different backends (e.g., one is Polars and one is Pandas),
+    this function will convert the comparison table to match the data table's backend.
+    This ensures consistent dtype handling during comparison.
+
+    Parameters
+    ----------
+    data_tbl
+        The primary table (backend is preserved).
+    tbl_compare
+        The comparison table (may be converted to match data_tbl's backend).
+
+    Returns
+    -------
+    tuple[FrameT, FrameT]
+        Both tables, with tbl_compare potentially converted to data_tbl's backend.
+    """
+    # Get backend types for both tables
+    data_backend = _get_tbl_type(data_tbl)
+    compare_backend = _get_tbl_type(tbl_compare)
+
+    # If backends match, no conversion needed
+    if data_backend == compare_backend:
+        return data_tbl, tbl_compare
+
+    # Define database backends (Ibis tables that need materialization)
+    database_backends = {"duckdb", "sqlite", "postgres", "mysql", "snowflake", "bigquery"}
+
+    #
+    # If backends differ, convert tbl_compare to match data_tbl's backend
+    #
+
+    # Handle Ibis/database tables: materialize them to match the target backend
+    if compare_backend in database_backends:
+        # Materialize to Polars if data table is Polars, otherwise Pandas
+        if data_backend == "polars":
+            try:
+                tbl_compare = tbl_compare.to_polars()
+                compare_backend = "polars"
+            except Exception:
+                # Fallback: materialize to Pandas, then convert to Polars
+                try:
+                    tbl_compare = tbl_compare.execute()
+                    compare_backend = "pandas"
+                except Exception:
+                    try:
+                        tbl_compare = tbl_compare.to_pandas()
+                        compare_backend = "pandas"
+                    except Exception:
+                        pass
+        else:
+            # Materialize to Pandas for Pandas or other backends
+            try:
+                tbl_compare = tbl_compare.execute()  # Returns Pandas DataFrame
+                compare_backend = "pandas"
+            except Exception:
+                try:
+                    tbl_compare = tbl_compare.to_pandas()
+                    compare_backend = "pandas"
+                except Exception:
+                    pass
+
+    if data_backend in database_backends:
+        # If data table itself is a database backend, materialize to Polars
+        # (Polars is the default modern backend for optimal performance)
+        try:
+            data_tbl = data_tbl.to_polars()
+            data_backend = "polars"
+        except Exception:
+            # Fallback to Pandas if Polars conversion fails
+            try:
+                data_tbl = data_tbl.execute()
+                data_backend = "pandas"
+            except Exception:
+                try:
+                    data_tbl = data_tbl.to_pandas()
+                    data_backend = "pandas"
+                except Exception:
+                    pass
+
+    # Now handle the Polars/Pandas conversions
+    if data_backend == "polars" and compare_backend == "pandas":
+        try:
+            import polars as pl
+
+            tbl_compare = pl.from_pandas(tbl_compare)
+        except Exception:
+            # If conversion fails, return original tables
+            pass
+
+    elif data_backend == "pandas" and compare_backend == "polars":
+        try:
+            tbl_compare = tbl_compare.to_pandas()
+        except Exception:
+            # If conversion fails, return original tables
+            pass
+
+    return data_tbl, tbl_compare
+
+
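As a quick illustration of the intended coercion behavior, here is a minimal usage sketch. It assumes Polars, Pandas, and PyArrow are installed; the import path shown is an assumption for illustration only (the helper is private to this module).

import pandas as pd
import polars as pl

# Hypothetical import path, used here only for illustration
from pointblank._interrogation import _coerce_to_common_backend

data_tbl = pl.DataFrame({"x": [1, 2, 3]})     # primary table (Polars)
tbl_compare = pd.DataFrame({"x": [1, 2, 3]})  # comparison table (Pandas)

left, right = _coerce_to_common_backend(data_tbl, tbl_compare)

# The primary table keeps its backend; the comparison table is converted to Polars
assert isinstance(left, pl.DataFrame)
assert isinstance(right, pl.DataFrame)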
+def tbl_match(data_tbl: FrameT, tbl_compare: FrameT) -> bool:
+    """
+    Check if two tables match exactly in schema, row count, and data.
+
+    This function performs a comprehensive comparison between two tables,
+    checking progressively stricter conditions from least to most stringent:
+
+    1. Column count match
+    2. Row count match
+    3. Schema match (case-insensitive column names, any order)
+    4. Schema match (case-insensitive column names, correct order)
+    5. Schema match (case-sensitive column names, correct order)
+    6. Data match: compares values column-by-column
+
+    If the two tables have different backends (e.g., one is Polars and one is Pandas),
+    the comparison table will be automatically coerced to match the data table's backend
+    before comparison. This ensures consistent dtype handling.
+
+    Parameters
+    ----------
+    data_tbl
+        The target table to validate.
+    tbl_compare
+        The comparison table to validate against.
+
+    Returns
+    -------
+    bool
+        True if tables match completely, False otherwise.
+    """
+    from pointblank.schema import Schema, _check_schema_match
+    from pointblank.validate import get_column_count, get_row_count
+
+    # Coerce to common backend if needed
+    data_tbl, tbl_compare = _coerce_to_common_backend(data_tbl, tbl_compare)
+
+    # Convert both tables to narwhals for compatibility
+    tbl = _convert_to_narwhals(df=data_tbl)
+    tbl_cmp = _convert_to_narwhals(df=tbl_compare)
+
+    # Stage 1: Check column count (least stringent)
+    col_count_matching = get_column_count(data=data_tbl) == get_column_count(data=tbl_compare)
+
+    if not col_count_matching:
+        return False
+
+    # Stage 2: Check row count
+    row_count_matching = get_row_count(data=data_tbl) == get_row_count(data=tbl_compare)
+
+    if not row_count_matching:
+        return False
+
+    # Stage 3: Check schema match for case-insensitive column names, any order
+    schema = Schema(tbl=tbl_compare)
+
+    col_schema_matching_any_order = _check_schema_match(
+        data_tbl=data_tbl,
+        schema=schema,
+        complete=True,
+        in_order=False,
+        case_sensitive_colnames=False,
+        case_sensitive_dtypes=False,
+        full_match_dtypes=False,
+    )
+
+    if not col_schema_matching_any_order:
+        return False
+
+    # Stage 4: Check schema match for case-insensitive column names, correct order
+    col_schema_matching_in_order = _check_schema_match(
+        data_tbl=data_tbl,
+        schema=schema,
+        complete=True,
+        in_order=True,
+        case_sensitive_colnames=False,
+        case_sensitive_dtypes=False,
+        full_match_dtypes=False,
+    )
+
+    if not col_schema_matching_in_order:
+        return False
+
+    # Stage 5: Check schema match for case-sensitive column names, correct order
+    col_schema_matching_exact = _check_schema_match(
+        data_tbl=data_tbl,
+        schema=schema,
+        complete=True,
+        in_order=True,
+        case_sensitive_colnames=True,
+        case_sensitive_dtypes=False,
+        full_match_dtypes=False,
+    )
+
+    if not col_schema_matching_exact:
+        return False
+
+    # Stage 6: Check for exact data by cell across matched columns (most stringent)
+    # Handle edge case where both tables have zero rows (they match)
+    if get_row_count(data=data_tbl) == 0:
+        return True
+
+    column_count = get_column_count(data=data_tbl)
+
+    # Compare column-by-column
+    for i in range(column_count):
+        # Get column name
+        col_name = tbl.columns[i]
+
+        # Get column data from both tables
+        col_data_1 = tbl.select(col_name)
+        col_data_2 = tbl_cmp.select(col_name)
+
+        # Convert to native format for comparison
+        # We need to collect if lazy frames
+        if hasattr(col_data_1, "collect"):
+            col_data_1 = col_data_1.collect()
+
+        if hasattr(col_data_2, "collect"):
+            col_data_2 = col_data_2.collect()
+
+        # Convert to native and then to lists for comparison
+        col_1_native = col_data_1.to_native()
+        col_2_native = col_data_2.to_native()
+
+        # Extract values as lists for comparison
+        if hasattr(col_1_native, "to_list"):  # Polars Series
+            values_1 = col_1_native[col_name].to_list()
+            values_2 = col_2_native[col_name].to_list()
+
+        elif hasattr(col_1_native, "tolist"):  # Pandas Series/DataFrame
+            values_1 = col_1_native[col_name].tolist()
+            values_2 = col_2_native[col_name].tolist()
+
+        elif hasattr(col_1_native, "collect"):  # Ibis
+            values_1 = col_1_native[col_name].to_pandas().tolist()
+            values_2 = col_2_native[col_name].to_pandas().tolist()
+
+        else:
+            # Fallback: try direct comparison
+            values_1 = list(col_1_native[col_name])
+            values_2 = list(col_2_native[col_name])
+
+        # Compare the two lists element by element, handling NaN/None
+        if len(values_1) != len(values_2):
+            return False
+
+        # NaN detection below uses math.isnan; import once per column rather than per cell
+        import math
+
+        for v1, v2 in zip(values_1, values_2):
+            # Handle None/NaN comparisons and check both None and NaN
+            # Note: When Pandas NaN is converted to Polars, it may become None
+            v1_is_null = v1 is None
+            v2_is_null = v2 is None
+
+            # Check if v1 is NaN
+            if not v1_is_null:
+                try:
+                    if math.isnan(v1):
+                        v1_is_null = True
+                except (TypeError, ValueError):
+                    pass
+
+            # Check if v2 is NaN
+            if not v2_is_null:
+                try:
+                    if math.isnan(v2):
+                        v2_is_null = True
+                except (TypeError, ValueError):
+                    pass
+
+            # If both are null (None or NaN), they match
+            if v1_is_null and v2_is_null:
+                continue
+
+            # If only one is null, they don't match
+            if v1_is_null or v2_is_null:
+                return False
+
+            # Direct comparison: handle lists/arrays separately
+            try:
+                if v1 != v2:
+                    return False
+            except (TypeError, ValueError):
+                # If direct comparison fails (e.g., for lists/arrays), try element-wise comparison
+                try:
+                    if isinstance(v1, list) and isinstance(v2, list):
+                        if v1 != v2:
+                            return False
+                    elif hasattr(v1, "__eq__") and hasattr(v2, "__eq__"):
+                        # For array-like objects, all elements must be equal
+                        eq = v1 == v2
+                        if hasattr(eq, "all"):
+                            if not eq.all():
+                                return False
+                        elif not eq:
+                            return False
+                    else:
+                        return False
+                except Exception:
+                    return False
+
+    return True
+
+
 def conjointly_validation(data_tbl: FrameT, expressions, threshold: int, tbl_type: str = "local"):
     """
     Perform conjoint validation using multiple expressions.
@@ -1629,7 +1934,7 @@ def interrogate_outside(
         pb_is_good_4=nw.lit(na_pass),  # Pass if any Null in lb, val, or ub
     )
 
-    # Note: Logic is inverted for "outside" - when inclusive[0] is True,
+    # Note: Logic is inverted for "outside"; when inclusive[0] is True,
     # we want values < low_val (not <= low_val) to be "outside"
     if inclusive[0]:
         result_tbl = result_tbl.with_columns(pb_is_good_5=nw.col(column) < low_val)
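To make the inverted bound logic concrete, here is a standalone plain-Python sketch of the lower-bound predicate (illustrative only; the exclusive branch is inferred from the inclusive one shown above).

def outside_low(value: float, low_val: float, inclusive_low: bool) -> bool:
    # When the lower bound is inclusive for the "inside" range, a value equal to
    # the bound is inside, so "outside" requires a strictly smaller value
    return value < low_val if inclusive_low else value <= low_val

# The boundary value is only "outside" when the bound is exclusive
assert outside_low(2, 2, inclusive_low=True) is False
assert outside_low(2, 2, inclusive_low=False) is True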
@@ -1852,7 +2157,7 @@ def interrogate_within_spec(tbl: FrameT, column: str, values: dict, na_pass: boo
     if hasattr(native_tbl, "execute"):
         native_tbl = native_tbl.execute()
 
-    # Add validation column - convert native table to Series, then back through Narwhals
+    # Add validation column: convert native table to Series, then back through Narwhals
     if is_polars_dataframe(native_tbl):
         import polars as pl
 
@@ -2095,7 +2400,7 @@ def interrogate_credit_card_db(
     # Get the column as an Ibis expression
     col_expr = native_tbl[column]
 
-    # Step 1: Clean the input - remove spaces and hyphens
+    # Step 1: Clean the input and remove spaces and hyphens
     # First check format: only digits, spaces, and hyphens allowed
     valid_chars = col_expr.re_search(r"^[0-9\s\-]+$").notnull()
 
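For reference, a plain-Python sketch of the same format gate and the separator stripping that follows it; this is illustrative only and independent of the Ibis expression above.

import re

def has_valid_card_chars(value: str) -> bool:
    # Only digits, spaces, and hyphens are allowed before cleaning
    return re.search(r"^[0-9\s\-]+$", value) is not None

def strip_separators(value: str) -> str:
    # Remove spaces and hyphens so only the digits remain
    return re.sub(r"[\s\-]", "", value)

assert has_valid_card_chars("4539 1488-0343 6467")
assert not has_valid_card_chars("4539x1488")
assert strip_separators("4539 1488-0343 6467") == "4539148803436467"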