Skip to content

Commit 9582ccc

Browse files
authored
feat(warnings): convert static strings into enums (#56)
* created category enum with all the categories extracted
* created test enum with all the test strings extracted
* convert priority to enum value
* created StringEnum with `_missing_` validation
1 parent 9e1de00 commit 9582ccc

File tree

11 files changed

+182
-52
lines changed

11 files changed

+182
-52
lines changed

src/ydata_quality/bias_fairness/engine.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,8 @@
77
from pandas import DataFrame, Series
88
from dython.nominal import compute_associations
99

10+
from ..core.warnings import Priority
11+
1012
from ..core import QualityEngine, QualityWarning
1113
from ..utils.correlations import filter_associations
1214
from ..utils.modelling import (baseline_performance,
@@ -58,7 +60,8 @@ def proxy_identification(self, th=0.5):
5860
if len(corrs) > 0:
5961
self.store_warning(
6062
QualityWarning(
61-
test='Proxy Identification', category='Bias&Fairness', priority=2, data=corrs,
63+
test=QualityWarning.Test.PROXY_IDENTIFICATION,
64+
category=QualityWarning.Category.BIAS_FAIRNESS, priority=Priority.P2, data=corrs,
6265
description=f"Found {len(corrs)} feature pairs of correlation "
6366
f"to sensitive attributes with values higher than defined threshold ({th})."
6467
))
@@ -80,7 +83,9 @@ def sensitive_predictability(self, th=0.5, adjusted_metric=True):
8083
if len(high_perfs) > 0:
8184
self.store_warning(
8285
QualityWarning(
83-
test='Sensitive Attribute Predictability', category='Bias&Fairness', priority=3, data=high_perfs,
86+
test=QualityWarning.Test.SENSITIVE_ATTRIBUTE_PREDICTABILITY,
87+
category=QualityWarning.Category.BIAS_FAIRNESS,
88+
priority=Priority.P3, data=high_perfs,
8489
description=f"Found {len(high_perfs)} sensitive attribute(s) with high predictability performance"
8590
f" (greater than {th})."
8691
)
@@ -124,8 +129,9 @@ def sensitive_representativity(self, min_pct: float = 0.01):
124129
if len(low_dist) > 0:
125130
self.store_warning(
126131
QualityWarning(
127-
test='Sensitive Attribute Representativity', category='Bias&Fairness', priority=2,
128-
data=low_dist, description=f"Found {len(low_dist)} values of '{cat}' \
132+
test=QualityWarning.Test.SENSITIVE_ATTRIBUTE_REPRESENTATIVITY,
133+
category=QualityWarning.Category.BIAS_FAIRNESS, priority=Priority.P2, data=low_dist,
134+
description=f"Found {len(low_dist)} values of '{cat}' \
129135
sensitive attribute with low representativity in the dataset (below {min_pct*100:.2f}%)."
130136
)
131137
)

src/ydata_quality/core/data_quality.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -135,10 +135,14 @@ def _clean_warnings(self):
135135
self._warnings = sorted(list(set(self._warnings))) # Sort unique warnings by priority
136136

137137
def get_warnings(self,
138-
category: Optional[str] = None,
139-
test: Optional[str] = None,
138+
category: Optional[Union[QualityWarning.Category, str]] = None,
139+
test: Optional[Union[QualityWarning.Test, str]] = None,
140140
priority: Optional[Priority] = None) -> List[QualityWarning]:
141141
"Retrieves warnings filtered by their properties."
142+
143+
category = QualityWarning.Category(category) if category is not None else None
144+
test = QualityWarning.Test(test) if test is not None else None
145+
142146
self._store_warnings()
143147
self._clean_warnings()
144148
filtered = [w for w in self._warnings if w.category == category] if category else self._warnings

src/ydata_quality/core/warnings.py

Lines changed: 58 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
from pydantic import BaseModel
88

9-
from ..utils.enum import OrderedEnum
9+
from ..utils.enum import OrderedEnum, StringEnum
1010

1111

1212
# pylint: disable=too-few-public-methods
@@ -67,8 +67,62 @@ class QualityWarning(BaseModel):
6767
data: sample data
6868
"""
6969

70-
category: str
71-
test: str
70+
class Category(StringEnum):
71+
BIAS_FAIRNESS = "BIAS&FAIRNESS"
72+
DATA_EXPECTATIONS = "DATA EXPECTATIONS"
73+
DATA_RELATIONS = "DATA RELATIONS"
74+
DUPLICATES = "DUPLICATES"
75+
ERRONEOUS_DATA = "ERRONEOUS DATA"
76+
LABELS = "LABELS"
77+
MISSINGS = "MISSINGS"
78+
SAMPLING = "SAMPLING"
79+
80+
class Test(StringEnum):
81+
# BIAS&FAIRNESS
82+
PROXY_IDENTIFICATION = "PROXY IDENTIFICATION"
83+
SENSITIVE_ATTRIBUTE_PREDICTABILITY = "SENSITIVE ATTRIBUTE PREDICTABILITY"
84+
SENSITIVE_ATTRIBUTE_REPRESENTATIVITY = "SENSITIVE ATTRIBUTE REPRESENTATIVITY"
85+
86+
# DATA EXPECTATIONS
87+
COVERAGE_FRACTION = "COVERAGE FRACTION"
88+
EXPECTATION_ASSESSMENT_VALUE_BETWEEN = "EXPECTATION ASSESSMENT - VALUE BETWEEN"
89+
OVERALL_ASSESSMENT = "OVERALL ASSESSMENT"
90+
91+
# DATA RELATIONS
92+
COLLIDER_CORRELATIONS = "COLLIDER CORRELATIONS"
93+
CONFOUNDED_CORRELATIONS = "CONFOUNDED CORRELATIONS"
94+
HIGH_COLLINEARITY_CATEGORICAL = "HIGH COLLINEARITY - CATEGORICAL"
95+
HIGH_COLLINEARITY_NUMERICAL = "HIGH COLLINEARITY - NUMERICAL"
96+
97+
# DUPLICATES
98+
DUPLICATE_COLUMNS = "DUPLICATE COLUMNS"
99+
ENTITY_DUPLICATES = "ENTITY DUPLICATES"
100+
EXACT_DUPLICATES = "EXACT DUPLICATES"
101+
102+
# ERRONEOUS DATA
103+
FLATLINES = "FLATLINES"
104+
PREDEFINED_ERRONEOUS_DATA = "PREDEFINED ERRONEOUS DATA"
105+
106+
# LABELS
107+
FEW_LABELS = "FEW LABELS"
108+
MISSING_LABELS = "MISSING LABELS"
109+
ONE_REST_PERFORMANCE = "ONE VS REST PERFORMANCE"
110+
OUTLIER_DETECTION = "OUTLIER DETECTION"
111+
TEST_NORMALITY = "TEST NORMALITY"
112+
UNBALANCED_CLASSES = "UNBALANCED CLASSES"
113+
114+
# MISSINGS
115+
HIGH_MISSINGS = "HIGH MISSINGS"
116+
HIGH_MISSING_CORRELATIONS = "HIGH MISSING CORRELATIONS"
117+
MISSINGNESS_PREDICTION = "MISSINGNESS PREDICTION"
118+
119+
# SAMPLING
120+
CONCEPT_DRIFT = "CONCEPT DRIFT"
121+
SAMPLE_COVARIATE_DRIFT = "SAMPLE COVARIATE DRIFT"
122+
SAMPLE_LABEL_DRIFT = "SAMPLE LABEL DRIFT"
123+
124+
category: Category
125+
test: Test
72126
description: str
73127
priority: Priority
74128
data: Any = None
@@ -78,7 +132,7 @@ class QualityWarning(BaseModel):
78132
#########################
79133
def __str__(self):
80134
return f"{WarningStyling.PRIORITIES[self.priority.value]}*{WarningStyling.ENDC} {WarningStyling.BOLD}\
81-
[{self.category.upper()}{WarningStyling.ENDC} - {WarningStyling.UNDERLINE}{self.test.upper()}]{WarningStyling.ENDC} \
135+
[{self.category.value}{WarningStyling.ENDC} - {WarningStyling.UNDERLINE}{self.test.value}]{WarningStyling.ENDC} \
82136
{self.description}"
83137

84138
########################

src/ydata_quality/data_expectations/engine.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
from pandas import DataFrame
77
from numpy import argmin
88

9+
from ..core.warnings import Priority
10+
911
from ..core import QualityEngine, QualityWarning
1012
from ..utils.auxiliary import test_load_json_path
1113
from ..utils.logger import NAME, get_logger
@@ -55,7 +57,8 @@ def __between_value_error(self, expectation_summary: dict) -> tuple:
5557
bound of the expected range."
5658
self.store_warning(
5759
QualityWarning(
58-
test='Expectation assessment - Value Between', category='Data Expectations', priority=3,
60+
test=QualityWarning.Test.EXPECTATION_ASSESSMENT_VALUE_BETWEEN,
61+
category=QualityWarning.Category.DATA_EXPECTATIONS, priority=Priority.P3,
5962
data=(range_deviations, bound_deviations),
6063
description=f"Column {column_name} - The observed value is outside of the expected range."
6164
+ (range_deviation_string if range_deviations else "")
@@ -122,7 +125,8 @@ def _coverage_fraction(self, results_json_path: str, df: DataFrame, minimum_cove
122125
if coverage_fraction < minimum_coverage:
123126
self.store_warning(
124127
QualityWarning(
125-
test='Coverage Fraction', category='Data Expectations', priority=2,
128+
test=QualityWarning.Test.COVERAGE_FRACTION,
129+
category=QualityWarning.Category.DATA_EXPECTATIONS, priority=Priority.P2,
126130
data={'Columns not covered': df_column_set.difference(column_coverage)},
127131
description=f"The provided DataFrame has a total expectation coverage of {coverage_fraction:.0%} \
128132
of its columns, which is below the expected coverage of {minimum_coverage:.0%}."
@@ -147,7 +151,8 @@ def _overall_assessment(self, results_json_path: str, error_tol: int = 0,
147151
if results_summary['OVERALL']['expectation_count'] - results_summary['OVERALL']['total_successes'] > error_tol:
148152
self.store_warning(
149153
QualityWarning(
150-
test='Overall Assessment', category='Data Expectations', priority=2,
154+
test=QualityWarning.Test.OVERALL_ASSESSMENT,
155+
category=QualityWarning.Category.DATA_EXPECTATIONS, priority=Priority.P2,
151156
data={'Failed expectation indexes': failed_expectation_ids},
152157
description=f"{len(failed_expectation_ids)} expectations have failed, which is more than the \
153158
implied absolute threshold of {int(error_tol)} failed expectations."

src/ydata_quality/data_relations/engine.py

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66
from pandas import DataFrame
77
from numpy import ones, tril, argwhere
88

9+
from ..core.warnings import Priority
10+
911
from ..core import QualityEngine, QualityWarning
1012
from ..utils.auxiliary import infer_dtypes, standard_normalize
1113
from ..utils.correlations import (chi2_collinearity, correlation_matrix,
@@ -117,9 +119,11 @@ def _confounder_detection(self, corr_mat: DataFrame, par_corr_mat: DataFrame,
117119
mask[par_corr_mat.abs() > corr_th] = False # Drop pairs with correlation after controling all other covariates
118120
confounded_pairs = [(corr_mat.index[i], corr_mat.columns[j]) for i, j in argwhere(mask)]
119121
if len(confounded_pairs) > 0:
120-
self.store_warning(QualityWarning(
121-
test='Confounded correlations', category='Data Relations', priority=2, data=confounded_pairs,
122-
description=f"""
122+
self.store_warning(
123+
QualityWarning(
124+
test=QualityWarning.Test.CONFOUNDED_CORRELATIONS, category=QualityWarning.Category.DATA_RELATIONS,
125+
priority=Priority.P2, data=confounded_pairs,
126+
description=f"""
123127
Found {len(confounded_pairs)} independently correlated variable pairs that disappeared after controling\
124128
for the remaining variables. This is an indicator of potential confounder effects in the dataset."""))
125129
return confounded_pairs
@@ -138,9 +142,11 @@ def _collider_detection(self, corr_mat: DataFrame, par_corr_mat: DataFrame,
138142
mask[par_corr_mat.abs() <= corr_th] = False # Drop pairs with correlation after controling all other covariates
139143
colliding_pairs = [(corr_mat.index[i], corr_mat.columns[j]) for i, j in argwhere(mask)]
140144
if len(colliding_pairs) > 0:
141-
self.store_warning(QualityWarning(
142-
test='Collider correlations', category='Data Relations', priority=2, data=colliding_pairs,
143-
description=f"Found {len(colliding_pairs)} independently uncorrelated variable pairs that showed \
145+
self.store_warning(
146+
QualityWarning(
147+
test=QualityWarning.Test.COLLIDER_CORRELATIONS, category=QualityWarning.Category.DATA_RELATIONS,
148+
priority=Priority.P2, data=colliding_pairs,
149+
description=f"Found {len(colliding_pairs)} independently uncorrelated variable pairs that showed \
144150
correlation after controling for the remaining variables. \
145151
This is an indicator of potential colliding bias with other covariates."))
146152
return colliding_pairs
@@ -192,18 +198,22 @@ def _high_collinearity_detection(self, df: DataFrame, dtypes: dict, label: str =
192198
['Adjusted Chi2'].mean()) for c in unique_cats]
193199
cat_coll_scores = [c[0] for c in sorted(cat_coll_scores, key=lambda x: x[1], reverse=True)]
194200
if len(inflated) > 0:
195-
self.store_warning(QualityWarning(
196-
test='High Collinearity - Numerical', category='Data Relations', priority=2, data=inflated,
197-
description=f"""Found {len(inflated)} numerical variables with high Variance Inflation Factor \
201+
self.store_warning(
202+
QualityWarning(
203+
test=QualityWarning.Test.HIGH_COLLINEARITY_NUMERICAL,
204+
category=QualityWarning.Category.DATA_RELATIONS, priority=Priority.P2, data=inflated,
205+
description=f"""Found {len(inflated)} numerical variables with high Variance Inflation Factor \
198206
(VIF>{vif_th:.1f}). The variables listed in results are highly collinear with other variables in the dataset. \
199207
These will make model explainability harder and potentially give way to issues like overfitting.\
200208
Depending on your end goal you might want to remove the highest VIF variables."""))
201209
if len(cat_coll_scores) > 0:
202210
# TODO: Merge warning messages (make one warning for the whole test,
203211
# summarizing findings from the numerical and categorical vars)
204-
self.store_warning(QualityWarning(
205-
test='High Collinearity - Categorical', category='Data Relations', priority=2, data=chi2_tests,
206-
description=f"""Found {len(cat_coll_scores)} categorical variables with significant collinearity \
212+
self.store_warning(
213+
QualityWarning(
214+
test=QualityWarning.Test.HIGH_COLLINEARITY_CATEGORICAL,
215+
category=QualityWarning.Category.DATA_RELATIONS, priority=Priority.P2, data=chi2_tests,
216+
description=f"""Found {len(cat_coll_scores)} categorical variables with significant collinearity \
207217
(p-value < {p_th}). The variables listed in results are highly collinear with other variables \
208218
in the dataset and sorted descending according to propensity. These will make model explainability \
209219
harder and potentially give way to issues like overfitting.Depending on your end goal you might want \

src/ydata_quality/drift/engine.py

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -288,14 +288,16 @@ def sample_covariate_drift(self, p_thresh: float = 0.05) -> DataFrame:
288288
if n_drifted_feats > 0:
289289
self.store_warning(
290290
QualityWarning(
291-
test='Sample covariate drift', category='Sampling', priority=2, data=test_summary,
291+
test=QualityWarning.Test.SAMPLE_COVARIATE_DRIFT, category=QualityWarning.Category.SAMPLING,
292+
priority=2, data=test_summary,
292293
description=f"""{n_drifted_feats} features accused drift in the sample test. The covariates \
293294
of the test sample do not appear to be representative of the reference sample."""
294295
))
295296
elif n_invalid_tests > 0:
296297
self.store_warning(
297298
QualityWarning(
298-
test='Sample covariate drift', category='Sampling', priority=3, data=test_summary,
299+
test=QualityWarning.Test.SAMPLE_COVARIATE_DRIFT, category=QualityWarning.Category.SAMPLING,
300+
priority=3, data=test_summary,
299301
description=f"""There were {n_invalid_tests} invalid tests found. This is likely due to a small \
300302
test sample size. The data summary should be analyzed before considering the test conclusive."""
301303
))
@@ -323,14 +325,16 @@ def sample_label_drift(self, p_thresh: float = 0.05) -> Series:
323325
if test_summary['Verdict'] == 'Drift':
324326
self.store_warning(
325327
QualityWarning(
326-
test='Sample label drift', category='Sampling', priority=2, data=test_summary,
328+
test=QualityWarning.Test.SAMPLE_LABEL_DRIFT, category=QualityWarning.Category.SAMPLING,
329+
priority=2, data=test_summary,
327330
description=f"The label accused drift in the sample test with a p-test of {p_val:.4f}, which is \
328331
under the threshold {p_thresh:.2f}. The test sample labels do not appear to be representative of the reference sample."
329332
))
330333
elif test_summary['Verdict'] == 'Invalid test':
331334
self.store_warning(
332335
QualityWarning(
333-
test='Sample label drift', category='Sampling', priority=3, data=test_summary,
336+
test=QualityWarning.Test.SAMPLE_LABEL_DRIFT, category=QualityWarning.Category.SAMPLING,
337+
priority=3, data=test_summary,
334338
description="The test was invalid. This is likely due to a small test sample size."
335339
))
336340
else:
@@ -363,15 +367,17 @@ def sample_concept_drift(self, p_thresh: float = 0.05) -> Series:
363367
if test_summary['Verdict'] == 'Drift':
364368
self.store_warning(
365369
QualityWarning(
366-
test='Concept drift', category='Sampling', priority=2, data=test_summary,
370+
test=QualityWarning.Test.CONCEPT_DRIFT, category=QualityWarning.Category.SAMPLING,
371+
priority=2, data=test_summary,
367372
description=f"There was concept drift detected with a p-test of {p_val:.4f}, which is under the \
368373
threshold {p_thresh:.2f}. The model's predicted labels for the test sample do not appear to be representative of the \
369374
distribution of labels predicted for the reference sample."
370375
))
371376
elif test_summary['Verdict'] == 'Invalid test':
372377
self.store_warning(
373378
QualityWarning(
374-
test='Concept drift', category='Sampling', priority=3, data=test_summary,
379+
test=QualityWarning.Test.CONCEPT_DRIFT, category=QualityWarning.Category.SAMPLING,
380+
priority=3, data=test_summary,
375381
description="The test was invalid. This is likely due to a small test sample size."
376382
))
377383
else:

src/ydata_quality/duplicates/engine.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
from pandas import DataFrame
88

9+
from ..core.warnings import Priority
10+
911
from ..core import QualityEngine, QualityWarning
1012
from ..utils.auxiliary import find_duplicate_columns
1113

@@ -69,7 +71,8 @@ def exact_duplicates(self):
6971
if len(dups) > 0:
7072
self.store_warning(
7173
QualityWarning(
72-
test='Exact Duplicates', category='Duplicates', priority=2, data=dups,
74+
test=QualityWarning.Test.EXACT_DUPLICATES, category=QualityWarning.Category.DUPLICATES,
75+
priority=Priority.P2, data=dups,
7376
description=f"Found {len(dups)} instances with exact duplicate feature values."
7477
))
7578
else:
@@ -84,7 +87,7 @@ def __provided_entity_dups(self, entity: Optional[Union[str, List[str]]] = None)
8487
if len(dups) > 0: # if we have any duplicates
8588
self.store_warning(
8689
QualityWarning(
87-
test='Entity Duplicates', category='Duplicates', priority=2, data=dups,
90+
test=QualityWarning.Test.ENTITY_DUPLICATES, category=QualityWarning.Category.DUPLICATES,
priority=Priority.P2, data=dups,
8891
description=f"Found {len(dups)} duplicates after grouping by entities."
8992
))
9093
if isinstance(entity, str):
@@ -124,7 +127,8 @@ def duplicate_columns(self):
124127
if cols_with_dups > 0:
125128
self.store_warning(
126129
QualityWarning(
127-
test='Duplicate Columns', category='Duplicates', priority=1, data=dups,
130+
test=QualityWarning.Test.DUPLICATE_COLUMNS, category=QualityWarning.Category.DUPLICATES,
131+
priority=Priority.P1, data=dups,
128132
description=f"Found {cols_with_dups} columns with exactly the same feature values as other columns."
129133
)
130134
)

0 commit comments

Comments
 (0)