Merge pull request #268 from whylabs/workflow-name-refactor
Workflow name refactor
naddeoa authored Mar 18, 2024
2 parents aee2aca + ef4f8ac commit ed31282
Showing 22 changed files with 150 additions and 147 deletions.
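For orientation, here is a minimal before/after sketch of the rename, using only names and calls that appear in the diffs and tests below; treat it as an illustration of the new public names rather than an exhaustive migration guide.

```python
# Renames in this PR (old -> new), as reflected in the diff below:
#   EvaluationWorkflow      -> Workflow
#   EvaluationResult        -> WorkflowResult
#   EvaluationConfig        -> WorkflowMetricConfig
#   EvaluationConfigBuilder -> WorkflowMetricConfigBuilder
from langkit.core.workflow import Workflow
from langkit.metrics.library import lib

wf = Workflow(metrics=[lib.presets.recommended()])
result = wf.run({"prompt": "How are you today?", "response": "I'm doing great!"})
print(result.metrics)  # pandas DataFrame with one column per metric
```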
2 changes: 1 addition & 1 deletion .bumpversion.cfg
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 0.0.95
current_version = 0.0.96
tag = False
parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?
serialize =
12 changes: 6 additions & 6 deletions langkit/core/metric.py
@@ -116,7 +116,7 @@ class MultiMetric:


@dataclass(frozen=True)
class EvaluationConfig:
class WorkflowMetricConfig:
metrics: List[Metric]


@@ -128,7 +128,7 @@ class MetricNameCapture:

def __init__(self, creator: MetricCreator) -> None:
self._creator = creator
self._metrics = LazyInit(lambda: EvaluationConfigBuilder().add(self._creator).build().metrics)
self._metrics = LazyInit(lambda: WorkflowMetricConfigBuilder().add(self._creator).build().metrics)
self._metric_names = LazyInit(lambda: MetricNameCapture.__get_metric_names(self._metrics.value))

@staticmethod
@@ -149,12 +149,12 @@ def metric_names(self) -> List[str]:
return self._metric_names.value


class EvaluationConfigBuilder:
class WorkflowMetricConfigBuilder:
def __init__(self, metric_creators: Optional[List[MetricCreator]] = None) -> None:
super().__init__()
self._modules: List[MetricCreator] = metric_creators or []

def add(self, module: MetricCreator) -> "EvaluationConfigBuilder":
def add(self, module: MetricCreator) -> "WorkflowMetricConfigBuilder":
if isinstance(module, list):
self._modules.extend(module)
elif callable(module):
@@ -186,7 +186,7 @@ def _build_metrics(self, modules: List[MetricCreator]) -> List[Metric]:

return schemas

def build(self) -> EvaluationConfig:
def build(self) -> WorkflowMetricConfig:
schemas: List[Metric] = self._build_metrics(self._modules)

return EvaluationConfig(metrics=schemas)
return WorkflowMetricConfig(metrics=schemas)
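A short usage sketch of the renamed builder, assuming only what the diff above shows: add() takes a MetricCreator (or a list of them) and build() returns a WorkflowMetricConfig whose metrics field holds the resolved Metric objects.

```python
from langkit.core.metric import WorkflowMetricConfig, WorkflowMetricConfigBuilder
from langkit.metrics.injections import prompt_injections_metric

# The builder flattens metric creators into concrete Metric instances.
config: WorkflowMetricConfig = WorkflowMetricConfigBuilder().add(prompt_injections_metric).build()
print(f"{len(config.metrics)} metric(s) resolved")
```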
4 changes: 2 additions & 2 deletions langkit/core/validation.py
@@ -35,13 +35,13 @@ def get_target_metric_names(self) -> List[str]:
# MetricResults of varying lengths, which you really couldn't combine into a single one anymore
# WELL, it would be ok if we just made the failures None I guess, that would preserve cardinality/shape
# How do you say "remove these things because they failed?"
# - Short circuiting the evaluation because of validation might be important, which implies validation has to occur earlier
# - Short circuiting the workflow because of validation might be important, which implies validation has to occur earlier
def validate_result(self, df: pd.DataFrame) -> Optional[ValidationResult]:
"""
Validate the final result after all of the metrics have been evaluated.
Args:
df: A data frame that contains a series for every metric, as well as the original input data.
by default, that will include a prompt and a response column if both were supplied to the evaluation.
by default, that will include a prompt and a response column if both were supplied to the workflow.
"""
return None
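To illustrate the contract documented above, here is a minimal sketch of a custom Validator; the class name and method signatures come from this file, while the metric name and threshold are illustrative. It only logs and returns None (meaning no failures), because the ValidationResult constructor is not shown in this diff.

```python
from typing import List, Optional

import pandas as pd

from langkit.core.validation import ValidationResult, Validator


class PromptCharCountLogger(Validator):
    """Logs rows whose prompt.stats.char_count metric exceeds a threshold."""

    def __init__(self, upper_threshold: int = 280) -> None:
        self._upper_threshold = upper_threshold

    def get_target_metric_names(self) -> List[str]:
        # Only this metric column is needed to evaluate the rule.
        return ["prompt.stats.char_count"]

    def validate_result(self, df: pd.DataFrame) -> Optional[ValidationResult]:
        over = df[df["prompt.stats.char_count"] > self._upper_threshold]
        if not over.empty:
            print(f"{len(over)} row(s) exceed {self._upper_threshold} characters")
        return None  # returning None means nothing is flagged as a failure
```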
28 changes: 14 additions & 14 deletions langkit/core/workflow.py
@@ -7,13 +7,13 @@
import pandas as pd

from langkit.core.metric import (
EvaluationConfigBuilder,
Metric,
MetricCreator,
MetricResult,
MultiMetricResult,
SingleMetric,
SingleMetricResult,
WorkflowMetricConfigBuilder,
)
from langkit.core.validation import ValidationResult, Validator
from langkit.metrics.util import is_dict_with_strings
@@ -35,7 +35,7 @@ class RunPerf:


@dataclass(frozen=True)
class EvaluationResult:
class WorkflowResult:
metrics: pd.DataFrame
validation_results: ValidationResult
perf_info: RunPerf
@@ -75,7 +75,7 @@ def post_validation(
# - DONE replace PII with <redacted>


class EvaluationWorkflow:
class Workflow:
def __init__(
self,
metrics: List[MetricCreator],
@@ -87,13 +87,13 @@ def __init__(
"""
Args:
metrics: A list of metrics to evaluate.
validators: A list of validators to run after the evaluation is complete.
callbacks: A list of callbacks to run after the evaluation is complete.
validators: A list of validators to run after the workflow is complete.
callbacks: A list of callbacks to run after the workflow is complete.
lazy_init: If True, the metrics will not be initialized until the first call to run.
cache_assets: If True, the assets required for the metrics will be cached during initialization.
"""
self.callbacks = callbacks or []
self.metrics = EvaluationConfigBuilder().add(metrics).build()
self.metrics = WorkflowMetricConfigBuilder().add(metrics).build()
self.validators = validators or []
self._initialized = False
self._cache_assets = cache_assets
@@ -152,31 +152,31 @@ def get_metric_names(self) -> List[str]:
return names

@overload
def run(self, data: pd.DataFrame) -> EvaluationResult:
def run(self, data: pd.DataFrame) -> WorkflowResult:
"""
This form is intended for batch evaluation,
This form is intended for batch inputs,
where the input is a pandas DataFrame.
"""
...

@overload
def run(self, data: Row) -> EvaluationResult:
def run(self, data: Row) -> WorkflowResult:
"""
This form is intended for single row evaluation,
This form is intended for single row inputs,
where the input is a dictionary with the keys "prompt" and "response".
"""
...

@overload
def run(self, data: Dict[str, str]) -> EvaluationResult:
def run(self, data: Dict[str, str]) -> WorkflowResult:
"""
This form doesn't assume the "prompt" and "response" key names.
This would be required in cases where the user wants to use different
column names, for example "question" and "answer", or "input" and "output".
"""
...

def run(self, data: Union[pd.DataFrame, Row, Dict[str, str]]) -> EvaluationResult:
def run(self, data: Union[pd.DataFrame, Row, Dict[str, str]]) -> WorkflowResult:
start = time.perf_counter()

self.init()
@@ -188,7 +188,7 @@ def run(self, data: Union[pd.DataFrame, Row, Dict[str, str]]) -> EvaluationResul
else:
df = data

# Evaluation
# Metrics
metric_results: Dict[str, SingleMetricResult] = {}
all_metrics_start = time.perf_counter()
metric_times: List[Tuple[str, float]] = []
@@ -264,7 +264,7 @@ def run(self, data: Union[pd.DataFrame, Row, Dict[str, str]]) -> EvaluationResul
metrics_total_sec=round(all_metrics_end, 3),
)

return EvaluationResult(full_df, self._condense_validation_results(validation_results), perf_info=run_perf)
return WorkflowResult(full_df, self._condense_validation_results(validation_results), perf_info=run_perf)

def _validate_evaluate(self, input_df: pd.DataFrame, metric: Metric, metric_result: MetricResult) -> None:
"""
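A usage sketch of the three run() input forms described by the overloads above; the preset choice is arbitrary, and the custom-keyed form is shown commented out because it requires metrics that target those column names.

```python
import pandas as pd

from langkit.core.workflow import Workflow
from langkit.metrics.library import lib

wf = Workflow(metrics=[lib.presets.recommended()])

# 1. Batch input: a pandas DataFrame with prompt/response columns.
batch = pd.DataFrame({"prompt": ["hi", "What is 2+2?"], "response": ["hello", "4"]})
batch_result = wf.run(batch)

# 2. Single row input: a dict with the default "prompt" and "response" keys.
row_result = wf.run({"prompt": "hi", "response": "hello"})

# 3. Dict with custom column names, e.g. "question"/"answer"; the chosen metrics
#    would have to target those columns, so this line is illustrative only.
# custom_result = wf.run({"question": "hi", "answer": "hello"})

print(row_result.metrics)             # metrics DataFrame
print(row_result.validation_results)  # condensed validation results
print(row_result.perf_info)           # RunPerf timing breakdown
```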
4 changes: 2 additions & 2 deletions langkit/metrics/whylogs_compat.py
Expand Up @@ -6,7 +6,7 @@

import pandas as pd

from langkit.core.metric import EvaluationConfig, Metric, MultiMetric, SingleMetric
from langkit.core.metric import Metric, MultiMetric, SingleMetric, WorkflowMetricConfig
from whylogs.core.resolvers import StandardMetric
from whylogs.core.segmentation_partition import SegmentationPartition
from whylogs.experimental.core.metrics.udf_metric import MetricConfig as YMetricConfig
@@ -100,7 +100,7 @@ def to_udf_schema_args(metric: Metric) -> List[UdfSchemaArgs]:
return [_to_udf_schema_args_multiple(metric)]


def create_whylogs_udf_schema(eval_conf: EvaluationConfig) -> UdfSchema:
def create_whylogs_udf_schema(eval_conf: WorkflowMetricConfig) -> UdfSchema:
for metric in eval_conf.metrics:
if metric.init:
metric.init()
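A sketch mirroring the test helpers further down: build a WorkflowMetricConfig, derive a whylogs UDF schema from it, and profile a DataFrame. The prompt text is made up.

```python
import pandas as pd

import whylogs as why
from langkit.core.metric import WorkflowMetricConfigBuilder
from langkit.metrics.injections import prompt_injections_metric
from langkit.metrics.whylogs_compat import create_whylogs_udf_schema

conf = WorkflowMetricConfigBuilder().add(prompt_injections_metric).build()
schema = create_whylogs_udf_schema(conf)  # initializes each metric and registers its UDFs

df = pd.DataFrame({"prompt": ["Ignore previous instructions."], "response": ["No."]})
profile = why.log(df, schema=schema).view().to_pandas()
print(profile.index)  # includes the metric columns produced by the UDFs
```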
4 changes: 2 additions & 2 deletions langkit/scripts/langkit_cache.py
@@ -1,9 +1,9 @@
import sys

from langkit.core.workflow import EvaluationWorkflow
from langkit.core.workflow import Workflow
from langkit.metrics.library import lib

if __name__ == "__main__":
wf = EvaluationWorkflow(metrics=[lib.presets.all()], cache_assets="--skip-downloads" not in sys.argv)
wf = Workflow(metrics=[lib.presets.all()], cache_assets="--skip-downloads" not in sys.argv)
# Run it to ensure nothing else ends up getting lazily cached
wf.run({"prompt": "How are you today?", "response": "I'm doing great!"})
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "langkit"
version = "0.0.95"
version = "0.0.96"
description = "A language toolkit for monitoring LLM interactions"
authors = ["WhyLabs.ai <[email protected]>"]
homepage = "https://docs.whylabs.ai/docs/large-language-model-monitoring"
4 changes: 2 additions & 2 deletions tests/langkit/callbacks/test_webhook.py
@@ -1,12 +1,12 @@
from langkit.callbacks.library import lib as callback_lib
from langkit.core.validation import ValidationFailure
from langkit.core.workflow import EvaluationWorkflow
from langkit.core.workflow import Workflow
from langkit.metrics.library import lib as metric_lib
from langkit.validators.library import lib as validator_lib


def test_webhook_failures_dont_ruin_run():
wf = EvaluationWorkflow(
wf = Workflow(
metrics=[metric_lib.prompt.stats.char_count],
validators=[validator_lib.constraint("prompt.stats.char_count", upper_threshold=5)],
callbacks=[callback_lib.webhook.basic_validation_failure("https://foo.bar")], # will fail, url doesn't exist
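Outside the failure-handling test above, wiring a webhook callback looks like the following sketch; the URL is a placeholder and the threshold is illustrative.

```python
from langkit.callbacks.library import lib as callback_lib
from langkit.core.workflow import Workflow
from langkit.metrics.library import lib as metric_lib
from langkit.validators.library import lib as validator_lib

wf = Workflow(
    metrics=[metric_lib.prompt.stats.char_count],
    validators=[validator_lib.constraint("prompt.stats.char_count", upper_threshold=5)],
    # Posts to the webhook whenever a validator reports a failure.
    callbacks=[callback_lib.webhook.basic_validation_failure("https://example.com/hook")],
)
result = wf.run({"prompt": "definitely longer than five characters", "response": "ok"})
print(result.validation_results)
```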
4 changes: 2 additions & 2 deletions tests/langkit/metrics/conftest.py
@@ -1,4 +1,4 @@
from langkit.core.workflow import EvaluationWorkflow
from langkit.core.workflow import Workflow
from langkit.metrics.library import lib

wf = EvaluationWorkflow([lib.presets.all()])
wf = Workflow([lib.presets.all()])
10 changes: 5 additions & 5 deletions tests/langkit/metrics/test_injections.py
@@ -4,7 +4,7 @@
import pandas as pd
import pytest

from langkit.core.workflow import EvaluationWorkflow
from langkit.core.workflow import Workflow
from langkit.metrics.injections import prompt_injections_metric


@@ -31,7 +31,7 @@ def df_2() -> pd.DataFrame:


def test_injections_pd(df: pd.DataFrame):
wf = EvaluationWorkflow([prompt_injections_metric])
wf = Workflow([prompt_injections_metric])
wf.init()
res = wf.run(df)
print(res.metrics)
@@ -41,7 +41,7 @@ def test_injections_pd(df: pd.DataFrame):


def test_injections_pd_not_close(df_2: pd.DataFrame):
wf = EvaluationWorkflow([prompt_injections_metric])
wf = Workflow([prompt_injections_metric])
wf.init()
res = wf.run(df_2)
metric_values: List[float] = res.metrics["prompt.similarity.injection"].to_list()
@@ -51,7 +51,7 @@ def test_injections_pd_not_close(df_2: pd.DataFrame):

def test_injections_dict(df: pd.DataFrame):
data: Dict[str, str] = {"prompt": df["prompt"].to_list()[0]}
wf = EvaluationWorkflow([prompt_injections_metric])
wf = Workflow([prompt_injections_metric])
wf.init()
res = wf.run(data)
metric_values: List[float] = res.metrics["prompt.similarity.injection"].to_list()
@@ -61,7 +61,7 @@ def test_injections_dict(df: pd.DataFrame):

def test_injections_dict_not_close(df_2: pd.DataFrame):
data: Dict[str, str] = {"prompt": df_2["prompt"].to_list()[0]}
wf = EvaluationWorkflow([prompt_injections_metric])
wf = Workflow([prompt_injections_metric])
wf.init()
res = wf.run(data)
metric_values: List[float] = res.metrics["prompt.similarity.injection"].to_list()
10 changes: 5 additions & 5 deletions tests/langkit/metrics/test_input_output_similarity.py
@@ -4,7 +4,7 @@
import pandas as pd

import whylogs as why
from langkit.core.metric import EvaluationConfig, EvaluationConfigBuilder
from langkit.core.metric import WorkflowMetricConfig, WorkflowMetricConfigBuilder
from langkit.metrics.input_output_similarity import input_output_similarity_metric, prompt_response_input_output_similarity_metric
from langkit.metrics.whylogs_compat import create_whylogs_udf_schema

@@ -40,7 +40,7 @@
]


def _log(item: Any, conf: EvaluationConfig) -> pd.DataFrame:
def _log(item: Any, conf: WorkflowMetricConfig) -> pd.DataFrame:
schema = create_whylogs_udf_schema(conf)
return why.log(item, schema=schema).view().to_pandas() # type: ignore

@@ -57,7 +57,7 @@ def test_input_output():
}
)

schema = EvaluationConfigBuilder().add(input_output_similarity_metric).build()
schema = WorkflowMetricConfigBuilder().add(input_output_similarity_metric).build()

actual = _log(df, schema)
assert list(actual.columns) == expected_metrics
@@ -83,7 +83,7 @@ def test_input_output_row():
"response": "I'm going to answer that question!",
}

schema = EvaluationConfigBuilder().add(prompt_response_input_output_similarity_metric).build()
schema = WorkflowMetricConfigBuilder().add(prompt_response_input_output_similarity_metric).build()

actual = _log(row, schema)
assert list(actual.columns) == expected_metrics
@@ -115,7 +115,7 @@ def test_input_output_multiple():
}
)

schema = EvaluationConfigBuilder().add(prompt_response_input_output_similarity_metric).build()
schema = WorkflowMetricConfigBuilder().add(prompt_response_input_output_similarity_metric).build()

actual = _log(df, schema)
assert list(actual.columns) == expected_metrics
4 changes: 2 additions & 2 deletions tests/langkit/metrics/test_library.py
@@ -1,11 +1,11 @@
from typing import List

from langkit.core.workflow import EvaluationWorkflow
from langkit.core.workflow import Workflow
from langkit.metrics.library import lib


def test_recommended():
wf = EvaluationWorkflow(metrics=[lib.presets.recommended()])
wf = Workflow(metrics=[lib.presets.recommended()])
result = wf.run({"prompt": "hi", "response": "I'm doing great!"})
metrics = result.metrics

10 changes: 5 additions & 5 deletions tests/langkit/metrics/test_pii.py
@@ -2,8 +2,8 @@
import pandas as pd

import whylogs as why
from langkit.core.metric import EvaluationConfig, EvaluationConfigBuilder
from langkit.core.workflow import EvaluationWorkflow
from langkit.core.metric import WorkflowMetricConfig, WorkflowMetricConfigBuilder
from langkit.core.workflow import Workflow
from langkit.metrics.pii import prompt_response_presidio_pii_metric
from langkit.metrics.whylogs_compat import create_whylogs_udf_schema

@@ -41,13 +41,13 @@
]


def _log(item: pd.DataFrame, conf: EvaluationConfig) -> pd.DataFrame:
def _log(item: pd.DataFrame, conf: WorkflowMetricConfig) -> pd.DataFrame:
schema = create_whylogs_udf_schema(conf)
return why.log(item, schema=schema).view().to_pandas() # type: ignore


def test_prompt_response_pii_metric_whylogs():
all_config = EvaluationConfigBuilder().add(prompt_response_presidio_pii_metric).build()
all_config = WorkflowMetricConfigBuilder().add(prompt_response_presidio_pii_metric).build()

df = pd.DataFrame(
{
@@ -122,7 +122,7 @@ def test_prompt_response_pii_metric():
"id",
]

wf = EvaluationWorkflow(metrics=[prompt_response_presidio_pii_metric])
wf = Workflow(metrics=[prompt_response_presidio_pii_metric])
logged = wf.run(df).metrics

pd.set_option("display.max_columns", None)