Refactor the metric library to conform to new naming structure

whylabs · Mar 5, 2024 · 570abc2
1 parent 5f9fd4c
commit 570abc2
Show file tree

Hide file tree

Showing 23 changed files with 769 additions and 786 deletions.
diff --git a/TODO.md b/TODO.md
@@ -35,3 +35,4 @@
 - Implement real multi metric conversion in the whylogs_compat file. Jamie said that whylogs actually can return multiple metrics at once.
 - Add validation options for string based metrics like topic. Right now there are only validation creators for numeric things.
 - Add builds for multiple Python versions to the CI matrix. Constructs like `isinstance(foo, Union[..])` are still in the code.
+- Create a version of `@cache` for Python 3.8. Can apparently use `@lru_cache(maxsize=None)` instead.
diff --git a/langkit/core/workflow.py b/langkit/core/workflow.py
@@ -221,6 +221,7 @@ def run(self, data: Union[pd.DataFrame, Row, Dict[str, str]]) -> EvaluationResul
         else:
             condensed["id"] = df["id"]
 
+        # TODO set column names `metric` and `value`
         full_df = condensed.copy()  # guard against mutations
         validation_results: List[ValidationResult] = []
         all_validators_start = time.perf_counter()

diff --git a/langkit/metrics/injections.py b/langkit/metrics/injections.py
@@ -87,7 +87,9 @@ def udf(text: pd.DataFrame) -> SingleMetricResult:
         metrics = [float(score) for _, score in zip(max_indices, max_similarities)]
         return SingleMetricResult(metrics=metrics)
 
-    return SingleMetric(name=f"{column_name}.injections", input_name=column_name, evaluate=udf, cache_assets=cache_assets, init=init)
+    return SingleMetric(
+        name=f"{column_name}.similarity.injection", input_name=column_name, evaluate=udf, cache_assets=cache_assets, init=init
+    )
 
 
-prompt_injections_module = partial(injections_metric, "prompt")
+prompt_injections_metric = partial(injections_metric, "prompt")
diff --git a/langkit/metrics/input_output_similarity.py b/langkit/metrics/input_output_similarity.py
@@ -23,11 +23,11 @@ def udf(text: pd.DataFrame) -> SingleMetricResult:
             return SingleMetricResult(similarity.squeeze(dim=0).tolist())  # type: ignore[reportUnknownVariableType]
 
     return SingleMetric(
-        name=f"{output_column_name}.relevance_to_{input_column_name}",
+        name=f"{output_column_name}.similarity.{input_column_name}",
         input_name=input_column_name,
         evaluate=udf,
         init=init,
     )
 
 
-prompt_response_input_output_similarity_module = partial(input_output_similarity_metric, "prompt", "response")
+prompt_response_input_output_similarity_metric = partial(input_output_similarity_metric, "prompt", "response")
diff --git a/langkit/metrics/library.py b/langkit/metrics/library.py
diff --git a/langkit/metrics/regexes/regexes.py b/langkit/metrics/regexes/regexes.py
@@ -47,7 +47,7 @@ def udf(text: pd.DataFrame) -> SingleMetricResult:
         return SingleMetricResult(metric)
 
     return SingleMetric(
-        name=f"{column_name}.has_patterns",
+        name=f"{column_name}.regex.has_patterns",
         input_name=column_name,
         evaluate=udf,
     )
@@ -112,29 +112,45 @@ def udf(text: Union[pd.DataFrame, Dict[str, List[Any]]]) -> SingleMetricResult:
         return SingleMetricResult(metrics)
 
     return SingleMetric(
-        name=f"{column_name}.{__sanitize_name_for_metric(pattern_name)}",
+        name=f"{column_name}.regex.{__sanitize_name_for_metric(pattern_name)}",
         input_name=column_name,
         evaluate=udf,
     )
 
 
-prompt_ssn_regex_module = partial(__single_regex_module, "prompt", __default_patterns, "SSN")
-prompt_credit_card_number_regex_module = partial(__single_regex_module, "prompt", __default_patterns, "credit card number")
-prompt_phone_number_regex_module = partial(__single_regex_module, "prompt", __default_patterns, "phone number")
-prompt_mailing_address_regex_module = partial(__single_regex_module, "prompt", __default_patterns, "mailing address")
-prompt_email_address_regex_module = partial(__single_regex_module, "prompt", __default_patterns, "email address")
-
-response_ssn_regex_module = partial(__single_regex_module, "response", __default_patterns, "SSN")
-response_credit_card_number_regex_module = partial(__single_regex_module, "response", __default_patterns, "credit card number")
-response_phone_number_regex_module = partial(__single_regex_module, "response", __default_patterns, "phone number")
-response_mailing_address_regex_module = partial(__single_regex_module, "response", __default_patterns, "mailing address")
-response_email_address_regex_module = partial(__single_regex_module, "response", __default_patterns, "email address")
-
-prompt_response_ssn_regex_module = [prompt_ssn_regex_module, response_ssn_regex_module]
-prompt_response_credit_card_number_regex_module = [prompt_credit_card_number_regex_module, response_credit_card_number_regex_module]
-prompt_response_phone_number_regex_module = [prompt_phone_number_regex_module, response_phone_number_regex_module]
-prompt_response_mailing_address_regex_module = [prompt_mailing_address_regex_module, response_mailing_address_regex_module]
-prompt_response_email_address_regex_module = [prompt_email_address_regex_module, response_email_address_regex_module]
+prompt_ssn_regex_metric = partial(__single_regex_module, "prompt", __default_patterns, "SSN")
+prompt_credit_card_number_regex_metric = partial(__single_regex_module, "prompt", __default_patterns, "credit card number")
+prompt_phone_number_regex_metric = partial(__single_regex_module, "prompt", __default_patterns, "phone number")
+prompt_mailing_address_regex_metric = partial(__single_regex_module, "prompt", __default_patterns, "mailing address")
+prompt_email_address_regex_metric = partial(__single_regex_module, "prompt", __default_patterns, "email address")
+
+prompt_regex_metric = [
+    prompt_ssn_regex_metric,
+    prompt_credit_card_number_regex_metric,
+    prompt_phone_number_regex_metric,
+    prompt_mailing_address_regex_metric,
+    prompt_email_address_regex_metric,
+]
+
+response_ssn_regex_metric = partial(__single_regex_module, "response", __default_patterns, "SSN")
+response_credit_card_number_regex_metric = partial(__single_regex_module, "response", __default_patterns, "credit card number")
+response_phone_number_regex_metric = partial(__single_regex_module, "response", __default_patterns, "phone number")
+response_mailing_address_regex_metric = partial(__single_regex_module, "response", __default_patterns, "mailing address")
+response_email_address_regex_metric = partial(__single_regex_module, "response", __default_patterns, "email address")
+
+response_regex_metric = [
+    response_ssn_regex_metric,
+    response_credit_card_number_regex_metric,
+    response_phone_number_regex_metric,
+    response_mailing_address_regex_metric,
+    response_email_address_regex_metric,
+]
+
+prompt_response_ssn_regex_module = [prompt_ssn_regex_metric, response_ssn_regex_metric]
+prompt_response_credit_card_number_regex_module = [prompt_credit_card_number_regex_metric, response_credit_card_number_regex_metric]
+prompt_response_phone_number_regex_module = [prompt_phone_number_regex_metric, response_phone_number_regex_metric]
+prompt_response_mailing_address_regex_module = [prompt_mailing_address_regex_metric, response_mailing_address_regex_metric]
+prompt_response_email_address_regex_module = [prompt_email_address_regex_metric, response_email_address_regex_metric]
 
 
 def custom_regex_metric(column_name: str, file_or_patterns: Optional[Union[str, CompiledPatternGroups]] = None) -> MetricCreator:

diff --git a/langkit/metrics/sentiment_polarity.py b/langkit/metrics/sentiment_polarity.py
@@ -29,7 +29,7 @@ def udf(text: pd.DataFrame) -> SingleMetricResult:
         return SingleMetricResult(metrics)
 
     return SingleMetric(
-        name=f"{column_name}.sentiment_polarity",
+        name=f"{column_name}.sentiment.sentiment_score",
         input_name=column_name,
         evaluate=udf,
         init=init,

diff --git a/langkit/metrics/text_statistics.py b/langkit/metrics/text_statistics.py
@@ -14,110 +14,92 @@ def udf(text: pd.DataFrame) -> SingleMetricResult:
         return SingleMetricResult(metrics)
 
     return SingleMetric(
-        name=f"{column_name}.{stat}",  # TODO make this ...text_stat...
+        name=f"{column_name}.text_stat.{stat}",
         input_name=column_name,
         evaluate=udf,
     )
 
 
 __reading_ease_module = partial(textstat_module, "flesch_reading_ease")
-prompt_reading_ease_module = partial(__reading_ease_module, column_name="prompt")
-response_reading_ease_module = partial(__reading_ease_module, column_name="response")
-prompt_response_reading_ease_module = [prompt_reading_ease_module, response_reading_ease_module]
+prompt_reading_ease_metric = partial(__reading_ease_module, column_name="prompt")
+response_reading_ease_metric = partial(__reading_ease_module, column_name="response")
+prompt_response_reading_ease_module = [prompt_reading_ease_metric, response_reading_ease_metric]
 
 
-__flesch_kincaid_grade_level_module = partial(textstat_module, "flesch_kincaid_grade")
-prompt_flesch_kincaid_grade_level_module = partial(__flesch_kincaid_grade_level_module, column_name="prompt")
-response_flesch_kincaid_grade_level_module = partial(__flesch_kincaid_grade_level_module, column_name="response")
-prompt_response_flesch_kincaid_grade_level_module = [
-    prompt_flesch_kincaid_grade_level_module,
-    response_flesch_kincaid_grade_level_module,
+__flesch_kincaid_grade_metric = partial(textstat_module, "flesch_kincaid_grade")
+prompt_grade_metric = partial(__flesch_kincaid_grade_metric, column_name="prompt")
+response_grade_metric = partial(__flesch_kincaid_grade_metric, column_name="response")
+prompt_response_grade_metric = [
+    prompt_grade_metric,
+    response_grade_metric,
 ]
 
 
 __char_count_module = partial(textstat_module, "char_count")
-prompt_char_count_module = partial(__char_count_module, column_name="prompt")
-response_char_count_module = partial(__char_count_module, column_name="response")
-prompt_response_char_count_module = [prompt_char_count_module, response_char_count_module]
+prompt_char_count_metric = partial(__char_count_module, column_name="prompt")
+response_char_count_metric = partial(__char_count_module, column_name="response")
+prompt_response_char_count_module = [prompt_char_count_metric, response_char_count_metric]
 
 
 __syllable_count_module = partial(textstat_module, "syllable_count")
-prompt_syllable_count_module = partial(__syllable_count_module, column_name="prompt")
-response_syllable_count_module = partial(__syllable_count_module, column_name="response")
-prompt_response_syllable_count_module = [prompt_syllable_count_module, response_syllable_count_module]
+prompt_syllable_count_metric = partial(__syllable_count_module, column_name="prompt")
+response_syllable_count_metric = partial(__syllable_count_module, column_name="response")
+prompt_response_syllable_count_module = [prompt_syllable_count_metric, response_syllable_count_metric]
 
 
 __lexicon_count_module = partial(textstat_module, "lexicon_count")
-prompt_lexicon_count_module = partial(__lexicon_count_module, column_name="prompt")
-response_lexicon_count_module = partial(__lexicon_count_module, column_name="response")
-prompt_response_lexicon_count_module = [prompt_lexicon_count_module, response_lexicon_count_module]
+prompt_lexicon_count_metric = partial(__lexicon_count_module, column_name="prompt")
+response_lexicon_count_metric = partial(__lexicon_count_module, column_name="response")
+prompt_response_lexicon_count_module = [prompt_lexicon_count_metric, response_lexicon_count_metric]
 
 
 __sentence_count_module = partial(textstat_module, "sentence_count")
-prompt_sentence_count_module = partial(__sentence_count_module, column_name="prompt")
-response_sentence_count_module = partial(__sentence_count_module, column_name="response")
-prompt_response_sentence_count_module = [prompt_sentence_count_module, response_sentence_count_module]
+prompt_sentence_count_metric = partial(__sentence_count_module, column_name="prompt")
+response_sentence_count_metric = partial(__sentence_count_module, column_name="response")
+prompt_response_sentence_count_module = [prompt_sentence_count_metric, response_sentence_count_metric]
 
 
 __letter_count_module = partial(textstat_module, "letter_count")
-prompt_letter_count_module = partial(__letter_count_module, column_name="prompt")
-response_letter_count_module = partial(__letter_count_module, column_name="response")
-prompt_response_letter_count_module = [prompt_letter_count_module, response_letter_count_module]
-
-
-__polysyllabcount_module = partial(textstat_module, "polysyllabcount")
-prompt_polysyllabcount_module = partial(__polysyllabcount_module, column_name="prompt")
-response_polysyllabcount_module = partial(__polysyllabcount_module, column_name="response")
-prompt_response_polysyllabcount_module = [prompt_polysyllabcount_module, response_polysyllabcount_module]
-
-
-__monosyllabcount_module = partial(textstat_module, "monosyllabcount")
-prompt_monosyllabcount_module = partial(__monosyllabcount_module, column_name="prompt")
-response_monosyllabcount_module = partial(__monosyllabcount_module, column_name="response")
-prompt_response_monosyllabcount_module = [prompt_monosyllabcount_module, response_monosyllabcount_module]
+prompt_letter_count_metric = partial(__letter_count_module, column_name="prompt")
+response_letter_count_metric = partial(__letter_count_module, column_name="response")
+prompt_response_letter_count_module = [prompt_letter_count_metric, response_letter_count_metric]
 
 
 __difficult_words_module = partial(textstat_module, "difficult_words")
-prompt_difficult_words_module = partial(__difficult_words_module, column_name="prompt")
-response_difficult_words_module = partial(__difficult_words_module, column_name="response")
-prompt_response_difficult_words_module = [prompt_difficult_words_module, response_difficult_words_module]
+prompt_difficult_words_metric = partial(__difficult_words_module, column_name="prompt")
+response_difficult_words_metric = partial(__difficult_words_module, column_name="response")
+prompt_response_difficult_words_module = [prompt_difficult_words_metric, response_difficult_words_metric]
 
 
 prompt_response_textstat_module: MetricCreator = [
     *prompt_response_reading_ease_module,
-    *prompt_response_flesch_kincaid_grade_level_module,
+    *prompt_response_grade_metric,
     *prompt_response_char_count_module,
     *prompt_response_syllable_count_module,
     *prompt_response_lexicon_count_module,
     *prompt_response_sentence_count_module,
     *prompt_response_letter_count_module,
-    *prompt_response_polysyllabcount_module,
-    *prompt_response_monosyllabcount_module,
     *prompt_response_difficult_words_module,
 ]
 
-prompt_textstat_module: MetricCreator = [
-    prompt_reading_ease_module,
-    prompt_flesch_kincaid_grade_level_module,
-    prompt_char_count_module,
-    prompt_syllable_count_module,
-    prompt_lexicon_count_module,
-    prompt_sentence_count_module,
-    prompt_letter_count_module,
-    prompt_polysyllabcount_module,
-    prompt_monosyllabcount_module,
-    prompt_difficult_words_module,
+prompt_textstat_metric: MetricCreator = [
+    prompt_reading_ease_metric,
+    prompt_grade_metric,
+    prompt_char_count_metric,
+    prompt_syllable_count_metric,
+    prompt_lexicon_count_metric,
+    prompt_sentence_count_metric,
+    prompt_letter_count_metric,
+    prompt_difficult_words_metric,
 ]
 
-response_textstat_module: MetricCreator = [
-    response_reading_ease_module,
-    response_flesch_kincaid_grade_level_module,
-    response_char_count_module,
-    response_syllable_count_module,
-    response_lexicon_count_module,
-    response_sentence_count_module,
-    response_letter_count_module,
-    response_polysyllabcount_module,
-    response_monosyllabcount_module,
-    response_difficult_words_module,
+response_textstat_metric: MetricCreator = [
+    response_reading_ease_metric,
+    response_grade_metric,
+    response_char_count_metric,
+    response_syllable_count_metric,
+    response_lexicon_count_metric,
+    response_sentence_count_metric,
+    response_letter_count_metric,
+    response_difficult_words_metric,
 ]