66from pandas import DataFrame
77from numpy import ones , tril , argwhere
88
9+ from src .ydata_quality .core .warnings import Priority
10+
911from ..core import QualityEngine , QualityWarning
1012from ..utils .auxiliary import infer_dtypes , standard_normalize
1113from ..utils .correlations import (chi2_collinearity , correlation_matrix ,
@@ -117,9 +119,11 @@ def _confounder_detection(self, corr_mat: DataFrame, par_corr_mat: DataFrame,
117119 mask [par_corr_mat .abs () > corr_th ] = False # Drop pairs with correlation after controlling for all other covariates
118120 confounded_pairs = [(corr_mat .index [i ], corr_mat .columns [j ]) for i , j in argwhere (mask )]
119121 if len (confounded_pairs ) > 0 :
120- self .store_warning (QualityWarning (
121- test = 'Confounded correlations' , category = 'Data Relations' , priority = 2 , data = confounded_pairs ,
122- description = f"""
122+ self .store_warning (
123+ QualityWarning (
124+ test = QualityWarning .Test .CONFOUNDED_CORRELATIONS , category = QualityWarning .Category .DATA_RELATIONS ,
125+ priority = Priority .P2 , data = confounded_pairs ,
126+ description = f"""
123127 Found { len (confounded_pairs )} independently correlated variable pairs that disappeared after controling\
124128 for the remaining variables. This is an indicator of potential confounder effects in the dataset.""" ))
125129 return confounded_pairs
@@ -138,9 +142,11 @@ def _collider_detection(self, corr_mat: DataFrame, par_corr_mat: DataFrame,
138142 mask [par_corr_mat .abs () <= corr_th ] = False # Drop pairs without correlation after controlling for all other covariates
139143 colliding_pairs = [(corr_mat .index [i ], corr_mat .columns [j ]) for i , j in argwhere (mask )]
140144 if len (colliding_pairs ) > 0 :
141- self .store_warning (QualityWarning (
142- test = 'Collider correlations' , category = 'Data Relations' , priority = 2 , data = colliding_pairs ,
143- description = f"Found { len (colliding_pairs )} independently uncorrelated variable pairs that showed \
145+ self .store_warning (
146+ QualityWarning (
147+ test = QualityWarning .Test .COLLIDER_CORRELATIONS , category = QualityWarning .Category .DATA_RELATIONS ,
148+ priority = Priority .P2 , data = colliding_pairs ,
149+ description = f"Found { len (colliding_pairs )} independently uncorrelated variable pairs that showed \
144150 correlation after controling for the remaining variables. \
145151 This is an indicator of potential colliding bias with other covariates." ))
146152 return colliding_pairs
@@ -192,18 +198,22 @@ def _high_collinearity_detection(self, df: DataFrame, dtypes: dict, label: str =
192198 ['Adjusted Chi2' ].mean ()) for c in unique_cats ]
193199 cat_coll_scores = [c [0 ] for c in sorted (cat_coll_scores , key = lambda x : x [1 ], reverse = True )]
194200 if len (inflated ) > 0 :
195- self .store_warning (QualityWarning (
196- test = 'High Collinearity - Numerical' , category = 'Data Relations' , priority = 2 , data = inflated ,
197- description = f"""Found { len (inflated )} numerical variables with high Variance Inflation Factor \
201+ self .store_warning (
202+ QualityWarning (
203+ test = QualityWarning .Test .HIGH_COLLINEARITY_NUMERICAL ,
204+ category = QualityWarning .Category .DATA_RELATIONS , priority = Priority .P2 , data = inflated ,
205+ description = f"""Found { len (inflated )} numerical variables with high Variance Inflation Factor \
198206 (VIF>{ vif_th :.1f} ). The variables listed in results are highly collinear with other variables in the dataset. \
199207 These will make model explainability harder and potentially give way to issues like overfitting.\
200208 Depending on your end goal you might want to remove the highest VIF variables.""" ))
201209 if len (cat_coll_scores ) > 0 :
202210 # TODO: Merge warning messages (make one warning for the whole test,
203211 # summarizing findings from the numerical and categorical vars)
204- self .store_warning (QualityWarning (
205- test = 'High Collinearity - Categorical' , category = 'Data Relations' , priority = 2 , data = chi2_tests ,
206- description = f"""Found { len (cat_coll_scores )} categorical variables with significant collinearity \
212+ self .store_warning (
213+ QualityWarning (
214+ test = QualityWarning .Test .HIGH_COLLINEARITY_CATEGORICAL ,
215+ category = QualityWarning .Category .DATA_RELATIONS , priority = Priority .P2 , data = chi2_tests ,
216+ description = f"""Found { len (cat_coll_scores )} categorical variables with significant collinearity \
207217 (p-value < { p_th } ). The variables listed in results are highly collinear with other variables \
208218 in the dataset and sorted descending according to propensity. These will make model explainability \
209219 harder and potentially give way to issues like overfitting.Depending on your end goal you might want \
0 commit comments