diff --git a/.bumpversion.cfg b/.bumpversion.cfg index fb34431..d7a6aef 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.0.28.dev13 +current_version = 0.0.28.dev14 tag = False parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\.(?P<release>[a-z]+)(?P<build>\d+))? serialize = diff --git a/langkit/core/validation.py b/langkit/core/validation.py index e18781c..68ca1d6 100644 --- a/langkit/core/validation.py +++ b/langkit/core/validation.py @@ -1,9 +1,11 @@ from abc import ABC, abstractmethod from dataclasses import dataclass, field -from typing import List, Optional, Union +from typing import List, Literal, Optional, Union import pandas as pd +ValidationFailureLevel = Literal["flag", "block"] + @dataclass(frozen=True) class ValidationFailure: @@ -19,6 +21,8 @@ class ValidationFailure: must_be_none: Optional[bool] = None must_be_non_none: Optional[bool] = None + failure_level: ValidationFailureLevel = "block" + @dataclass(frozen=True) class ValidationResult: diff --git a/langkit/validators/comparison.py b/langkit/validators/comparison.py index f203f7b..6f4aba4 100644 --- a/langkit/validators/comparison.py +++ b/langkit/validators/comparison.py @@ -5,10 +5,12 @@ import numpy as np import pandas as pd -from langkit.core.validation import ValidationFailure, ValidationResult, Validator +from langkit.core.validation import ValidationFailure, ValidationFailureLevel, ValidationResult, Validator -def _enforce_upper_threshold(target_metric: str, upper_threshold: Union[int, float], value: Any, id: str) -> Sequence[ValidationFailure]: +def _enforce_upper_threshold( + target_metric: str, upper_threshold: Union[int, float], level: ValidationFailureLevel, value: Any, id: str +) -> Sequence[ValidationFailure]: if not isinstance(value, (float, int)): return [] @@ -20,13 +22,16 @@ def _enforce_upper_threshold(target_metric: str, upper_threshold: Union[int, flo details=f"Value {value} is above threshold {upper_threshold}", value=value, 
upper_threshold=upper_threshold, + failure_level=level, ) ] return [] -def _enforce_lower_threshold(target_metric: str, lower_threshold: Union[int, float], value: Any, id: str) -> Sequence[ValidationFailure]: +def _enforce_lower_threshold( + target_metric: str, lower_threshold: Union[int, float], level: ValidationFailureLevel, value: Any, id: str +) -> Sequence[ValidationFailure]: if not isinstance(value, (float, int)): return [] @@ -38,6 +43,7 @@ def _enforce_lower_threshold(target_metric: str, lower_threshold: Union[int, flo details=f"Value {value} is below threshold {lower_threshold}", value=value, lower_threshold=lower_threshold, + failure_level=level, ) ] @@ -45,7 +51,7 @@ def _enforce_lower_threshold(target_metric: str, lower_threshold: Union[int, flo def _enforce_upper_threshold_inclusive( - target_metric: str, upper_threshold_inclusive: Union[int, float], value: Any, id: str + target_metric: str, upper_threshold_inclusive: Union[int, float], level: ValidationFailureLevel, value: Any, id: str ) -> Sequence[ValidationFailure]: if not isinstance(value, (float, int)): return [] @@ -58,6 +64,7 @@ def _enforce_upper_threshold_inclusive( details=f"Value {value} is above or equal to threshold {upper_threshold_inclusive}", value=value, upper_threshold=upper_threshold_inclusive, + failure_level=level, ) ] @@ -65,7 +72,7 @@ def _enforce_upper_threshold_inclusive( def _enforce_lower_threshold_inclusive( - target_metric: str, lower_threshold_inclusive: Union[int, float], value: Any, id: str + target_metric: str, lower_threshold_inclusive: Union[int, float], level: ValidationFailureLevel, value: Any, id: str ) -> Sequence[ValidationFailure]: if not isinstance(value, (float, int)): return [] @@ -78,13 +85,16 @@ def _enforce_lower_threshold_inclusive( details=f"Value {value} is below or equal to threshold {lower_threshold_inclusive}", value=value, lower_threshold=lower_threshold_inclusive, + failure_level=level, ) ] return [] -def _enforce_one_of(target_metric: str, 
one_of: Set[Union[str, float, int]], value: Any, id: str) -> Sequence[ValidationFailure]: +def _enforce_one_of( + target_metric: str, one_of: Set[Union[str, float, int]], level: ValidationFailureLevel, value: Any, id: str +) -> Sequence[ValidationFailure]: if value not in one_of: return [ ValidationFailure( @@ -93,12 +103,15 @@ def _enforce_one_of(target_metric: str, one_of: Set[Union[str, float, int]], val details=f"Value {value} is not in allowed values {one_of}", value=value, allowed_values=list(one_of), + failure_level=level, ) ] return [] -def _enforce_none_of(target_metric: str, none_of: Set[Union[str, float, int]], value: Any, id: str) -> Sequence[ValidationFailure]: +def _enforce_none_of( + target_metric: str, none_of: Set[Union[str, float, int]], level: ValidationFailureLevel, value: Any, id: str +) -> Sequence[ValidationFailure]: if value in none_of: return [ ValidationFailure( @@ -107,12 +120,13 @@ def _enforce_none_of(target_metric: str, none_of: Set[Union[str, float, int]], v details=f"Value {value} is in disallowed values {none_of}", value=value, disallowed_values=list(none_of), + failure_level=level, ) ] return [] -def _enforce_must_be_none(target_metric: str, value: Any, id: str) -> Sequence[ValidationFailure]: +def _enforce_must_be_none(target_metric: str, level: ValidationFailureLevel, value: Any, id: str) -> Sequence[ValidationFailure]: if value is not None: return [ ValidationFailure( @@ -121,12 +135,13 @@ def _enforce_must_be_none(target_metric: str, value: Any, id: str) -> Sequence[V details=f"Value {value} is not None", value=value, must_be_none=True, + failure_level=level, ) ] return [] -def _enforce_must_be_non_none(target_metric: str, value: Any, id: str) -> Sequence[ValidationFailure]: +def _enforce_must_be_non_none(target_metric: str, level: ValidationFailureLevel, value: Any, id: str) -> Sequence[ValidationFailure]: if value is None: return [ ValidationFailure( @@ -135,6 +150,7 @@ def _enforce_must_be_non_none(target_metric: str, value: 
Any, id: str) -> Sequen details="Value is None", value=value, must_be_non_none=True, + failure_level=level, ) ] return [] @@ -151,32 +167,35 @@ class ConstraintValidatorOptions: none_of: Optional[Tuple[Union[str, float, int], ...]] = None must_be_non_none: Optional[bool] = None must_be_none: Optional[bool] = None + failure_level: Optional[ValidationFailureLevel] = None class ConstraintValidator(Validator): def __init__(self, options: ConstraintValidatorOptions): validation_functions: List[Callable[[Any, str], Sequence[ValidationFailure]]] = [] + level = options.failure_level or "block" + if options.upper_threshold is not None: - validation_functions.append(partial(_enforce_upper_threshold, options.target_metric, options.upper_threshold)) + validation_functions.append(partial(_enforce_upper_threshold, options.target_metric, options.upper_threshold, level)) if options.lower_threshold is not None: - validation_functions.append(partial(_enforce_lower_threshold, options.target_metric, options.lower_threshold)) + validation_functions.append(partial(_enforce_lower_threshold, options.target_metric, options.lower_threshold, level)) if options.upper_threshold_inclusive is not None: validation_functions.append( - partial(_enforce_upper_threshold_inclusive, options.target_metric, options.upper_threshold_inclusive) + partial(_enforce_upper_threshold_inclusive, options.target_metric, options.upper_threshold_inclusive, level) ) if options.lower_threshold_inclusive is not None: validation_functions.append( - partial(_enforce_lower_threshold_inclusive, options.target_metric, options.lower_threshold_inclusive) + partial(_enforce_lower_threshold_inclusive, options.target_metric, options.lower_threshold_inclusive, level) ) if options.one_of is not None: - validation_functions.append(partial(_enforce_one_of, options.target_metric, set(options.one_of))) + validation_functions.append(partial(_enforce_one_of, options.target_metric, set(options.one_of), level)) if options.none_of is not 
None: - validation_functions.append(partial(_enforce_none_of, options.target_metric, set(options.none_of))) + validation_functions.append(partial(_enforce_none_of, options.target_metric, set(options.none_of), level)) if options.must_be_non_none is not None: - validation_functions.append(partial(_enforce_must_be_non_none, options.target_metric)) + validation_functions.append(partial(_enforce_must_be_non_none, options.target_metric, level)) if options.must_be_none is not None: - validation_functions.append(partial(_enforce_must_be_none, options.target_metric)) + validation_functions.append(partial(_enforce_must_be_none, options.target_metric, level)) self._target_metric = options.target_metric self._validation_functions = validation_functions diff --git a/langkit/validators/library.py b/langkit/validators/library.py index fe08001..31f9802 100644 --- a/langkit/validators/library.py +++ b/langkit/validators/library.py @@ -1,6 +1,6 @@ from typing import List, Literal, Optional, Sequence, Union -from langkit.core.validation import Validator +from langkit.core.validation import ValidationFailureLevel, Validator from langkit.validators.comparison import ( ConstraintValidator, ConstraintValidatorOptions, @@ -72,6 +72,7 @@ def constraint( none_of: Optional[Sequence[Union[str, float, int]]] = None, must_be_non_none: Optional[bool] = None, must_be_none: Optional[bool] = None, + failure_level: Optional[ValidationFailureLevel] = None, ) -> Validator: return ConstraintValidator( ConstraintValidatorOptions( @@ -84,6 +85,7 @@ def constraint( none_of=tuple(none_of) if none_of else None, must_be_non_none=must_be_non_none, must_be_none=must_be_none, + failure_level=failure_level, ) ) diff --git a/pyproject.toml b/pyproject.toml index d6d8093..aecc781 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "langkit" -version = "0.0.28.dev13" +version = "0.0.28.dev14" description = "A language toolkit for monitoring LLM interactions" authors = ["WhyLabs.ai <support@whylabs.ai>
"] homepage = "https://docs.whylabs.ai/docs/large-language-model-monitoring" diff --git a/tests/langkit/metrics/test_workflow.py b/tests/langkit/metrics/test_workflow.py index d9aa111..96c99bb 100644 --- a/tests/langkit/metrics/test_workflow.py +++ b/tests/langkit/metrics/test_workflow.py @@ -2,6 +2,7 @@ import pytest +from langkit.core.validation import ValidationFailure from langkit.core.workflow import MetricFilterOptions, RunOptions, Workflow from langkit.metrics.library import lib from langkit.validators.library import lib as validator_lib @@ -208,6 +209,58 @@ def test_just_prompt_validation(): ] +def test_just_prompt_validation_flag(): + rule = validator_lib.constraint(target_metric="response.stats.token_count", upper_threshold=1, failure_level="flag") + wf = Workflow(metrics=[lib.presets.recommended()], validators=[rule]) + + result = wf.run({"prompt": "hi", "response": "hello there"}) + metrics = result.metrics + + metric_names: List[str] = metrics.columns.tolist() # pyright: ignore[reportUnknownMemberType] + + assert metric_names == [ + "prompt.pii.phone_number", + "prompt.pii.email_address", + "prompt.pii.credit_card", + "prompt.pii.us_ssn", + "prompt.pii.us_bank_number", + "prompt.pii.redacted", + "prompt.stats.token_count", + "prompt.stats.char_count", + "prompt.similarity.injection", + "prompt.similarity.jailbreak", + "response.pii.phone_number", + "response.pii.email_address", + "response.pii.credit_card", + "response.pii.us_ssn", + "response.pii.us_bank_number", + "response.pii.redacted", + "response.stats.token_count", + "response.stats.char_count", + "response.stats.flesch_reading_ease", + "response.sentiment.sentiment_score", + "response.toxicity.toxicity_score", + "response.similarity.refusal", + "id", + ] + + assert result.validation_results.report == [ + ValidationFailure( + id="0", + metric="response.stats.token_count", + details="Value 2 is above threshold 1", + value=2, + upper_threshold=1, + lower_threshold=None, + allowed_values=None, + 
disallowed_values=None, + must_be_none=None, + must_be_non_none=None, + failure_level="flag", + ), + ] + + def test_just_response(): wf = Workflow(metrics=[lib.presets.recommended()]) result = wf.run({"response": "I'm doing great!"})