Merge pull request #240 from whylabs/validation-none

Add none-ness to validation failure report
whylabs · Feb 23, 2024 · c841267 · c841267
2 parents 709d50f + 5afd122
commit c841267
Show file tree

Hide file tree

Showing 6 changed files with 17 additions and 5 deletions.
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.0.71
+current_version = 0.0.73
 tag = False
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
 serialize = 

diff --git a/langkit/core/validation.py b/langkit/core/validation.py
@@ -16,6 +16,8 @@ class ValidationFailure:
     lower_threshold: Optional[float] = None
     allowed_values: Optional[List[Union[str, float, int]]] = None
     disallowed_values: Optional[List[Union[str, float, int]]] = None
+    must_be_none: Optional[bool] = None
+    must_be_non_none: Optional[bool] = None
 
 
 @dataclass(frozen=True)

diff --git a/langkit/core/workflow.py b/langkit/core/workflow.py
@@ -1,3 +1,4 @@
+import logging
 import time
 from abc import ABC, abstractmethod
 from dataclasses import dataclass
@@ -17,6 +18,8 @@
 from langkit.core.validation import ValidationResult, Validator
 from langkit.metrics.util import is_dict_with_strings
 
+logger = logging.getLogger(__name__)
+
 
 class Row(TypedDict):
     prompt: str
@@ -89,7 +92,7 @@ def __init__(
             lazy_init: If True, the metrics will not be initialized until the first call to run.
             cache_assets: If True, the assets required for the metrics will be cached during initialization.
         """
-        self.hooks = callbacks or []
+        self.callbacks = callbacks or []
         self.metrics = EvaluationConfigBuilder().add(metrics).build()
         self.validators = validators or []
         self._initialized = False
@@ -232,8 +235,11 @@ def run(self, data: Union[pd.DataFrame, Row, Dict[str, str]]) -> EvaluationResul
         all_validators_end = time.perf_counter() - all_validators_start
 
         # Post validation hook
-        for action in self.hooks:
-            action.post_validation(df.copy(), metric_results, full_df.copy(), validation_results)
+        for callback in self.callbacks:
+            try:
+                callback.post_validation(df.copy(), metric_results, full_df.copy(), validation_results)
+            except Exception as e:
+                logger.exception(f"Callback {callback} failed with exception {e}")
 
         # Performance
         run_perf = RunPerf(

diff --git a/langkit/validators/comparison.py b/langkit/validators/comparison.py
@@ -119,6 +119,7 @@ def _enforce_must_be_none(target_metric: str, value: Any, id: str) -> Sequence[V
                 metric=target_metric,
                 details=f"Value {value} is not None",
                 value=value,
+                must_be_none=True,
             )
         ]
     return []
@@ -132,6 +133,7 @@ def _enforce_must_be_non_none(target_metric: str, value: Any, id: str) -> Sequen
                 metric=target_metric,
                 details="Value is None",
                 value=value,
+                must_be_non_none=True,
             )
         ]
     return []

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "langkit"
-version = "0.0.71"
+version = "0.0.73"
 description = "A language toolkit for monitoring LLM interactions"
 authors = ["WhyLabs.ai <[email protected]>"]
 homepage = "https://docs.whylabs.ai/docs/large-language-model-monitoring"

diff --git a/tests/langkit/validators/test_comparison.py b/tests/langkit/validators/test_comparison.py
@@ -125,6 +125,7 @@ def test_must_be_none():
             metric="prompt.pii.redacted",
             details="Value My email address is <EMAIL_ADDRESS> is not None",
             value="My email address is <EMAIL_ADDRESS>",
+            must_be_none=True,
         ),
     ]
 
@@ -141,5 +142,6 @@ def test_must_be_non_none():
             metric="prompt.pii.redacted",
             details="Value is None",
             value=None,
+            must_be_non_none=True,
         ),
     ]