From 6d114e5ab200aa6f43ee3cb6fa17bf3db1dbc19e Mon Sep 17 00:00:00 2001
From: Yuri Albuquerque
Date: Fri, 25 Oct 2024 15:18:55 -0300
Subject: [PATCH] Implement the F-beta score metric (#1543) for the AnswerCorrectness class

The beta parameter is introduced to control the relative importance of recall
and precision when calculating the score. Specifically:

- beta > 1 places more emphasis on recall.
- beta < 1 favors precision.
- beta == 1 yields the regular F1 score, the harmonic mean of precision and
  recall.

Key Changes:

The method _compute_statement_presence is updated to calculate the F-beta
score from true positives (TP), false positives (FP), and false negatives
(FN). This lets us balance recall and precision, depending on the task's
requirements, by tuning the beta value.

source: https://scikit-learn.org/1.5/modules/generated/sklearn.metrics.fbeta_score.html

---------

Co-authored-by: Shahules786
---
 src/ragas/metrics/__init__.py             | 10 ------
 src/ragas/metrics/_answer_correctness.py  |  9 ++++-
 src/ragas/metrics/_factual_correctness.py | 44 ++++++++++++++++++-----
 src/ragas/metrics/utils.py                | 37 +++++++++----------
 tests/unit/test_metric.py                 | 16 +--------
 5 files changed, 63 insertions(+), 53 deletions(-)

diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py
index 12862288e..01cab7d46 100644
--- a/src/ragas/metrics/__init__.py
+++ b/src/ragas/metrics/__init__.py
@@ -1,6 +1,3 @@
-import inspect
-import sys
-
 from ragas.metrics._answer_correctness import AnswerCorrectness, answer_correctness
 from ragas.metrics._answer_relevance import (
     AnswerRelevancy,
@@ -120,10 +117,3 @@
     "MultiModalRelevance",
     "multimodal_relevance",
 ]
-
-current_module = sys.modules[__name__]
-ALL_METRICS = [
-    obj
-    for name, obj in inspect.getmembers(current_module)
-    if name in __all__ and not inspect.isclass(obj) and not inspect.isbuiltin(obj)
-]
diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py
index edbd138ec..aa61dd3dc 100644
--- a/src/ragas/metrics/_answer_correctness.py
+++ b/src/ragas/metrics/_answer_correctness.py
@@ -21,6 +21,7 @@
     SingleTurnMetric,
     get_segmenter,
 )
+from ragas.metrics.utils import fbeta_score
 from ragas.prompt import PydanticPrompt
 from ragas.run_config import RunConfig
 
@@ -167,6 +168,7 @@ class AnswerCorrectness(MetricWithLLM, MetricWithEmbeddings, SingleTurnMetric):
         default_factory=LongFormAnswerPrompt
     )
     weights: list[float] = field(default_factory=lambda: [0.75, 0.25])
+    beta: float = 1.0
     answer_similarity: t.Optional[AnswerSimilarity] = None
     sentence_segmenter: t.Optional[HasSegmentMethod] = None
     max_retries: int = 1
@@ -185,6 +187,11 @@ def __post_init__(self: t.Self):
             language = self.long_form_answer_prompt.language
             self.sentence_segmenter = get_segmenter(language=language, clean=False)
 
+        if type(self.beta) is not float:
+            raise ValueError(
+                "Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision."
+            )
+
     def init(self, run_config: RunConfig):
         super().init(run_config)
         if self.answer_similarity is None and self.weights[1] != 0:
@@ -198,7 +205,7 @@ def _compute_statement_presence(
         tp = len(prediction.TP)
         fp = len(prediction.FP)
         fn = len(prediction.FN)
-        score = tp / (tp + 0.5 * (fp + fn)) if tp > 0 else 0
+        score = fbeta_score(tp, fp, fn, self.beta)
         return score
 
     async def _create_simplified_statements(
diff --git a/src/ragas/metrics/_factual_correctness.py b/src/ragas/metrics/_factual_correctness.py
index 7db09cfda..1d887cfab 100644
--- a/src/ragas/metrics/_factual_correctness.py
+++ b/src/ragas/metrics/_factual_correctness.py
@@ -16,6 +16,7 @@
     SingleTurnMetric,
     get_segmenter,
 )
+from ragas.metrics.utils import fbeta_score
 from ragas.prompt import PydanticPrompt
 
 if t.TYPE_CHECKING:
@@ -181,11 +182,32 @@ class ClaimDecompositionPrompt(
 
 @dataclass
 class FactualCorrectness(MetricWithLLM, SingleTurnMetric):
+    """
+    FactualCorrectness is a metric class that evaluates the factual correctness of responses
+    generated by a language model. It uses claim decomposition and natural language inference (NLI)
+    to verify the claims made in the responses against reference texts.
+
+    Attributes:
+        name (str): The name of the metric, default is "factual_correctness".
+        _required_columns (Dict[MetricType, Set[str]]): A dictionary specifying the required columns
+            for each metric type. Default is {"SINGLE_TURN": {"response", "reference"}}.
+        mode (Literal["precision", "recall", "f1"]): The mode of evaluation, can be "precision",
+            "recall", or "f1". Default is "f1".
+        beta (float): The beta value used for the F1 score calculation. A beta > 1 gives more weight
+            to recall, while beta < 1 favors precision. Default is 1.0.
+        atomicity (Literal["low", "high"]): The level of atomicity for claim decomposition. Default is "low".
+        coverage (Literal["low", "high"]): The level of coverage for claim decomposition. Default is "low".
+        claim_decomposition_prompt (PydanticPrompt): The prompt used for claim decomposition.
+        nli_prompt (PydanticPrompt): The prompt used for natural language inference (NLI).
+
+    """
+
     name: str = "factual_correctness"  # type: ignore
     _required_columns: t.Dict[MetricType, t.Set[str]] = field(
         default_factory=lambda: {MetricType.SINGLE_TURN: {"response", "reference"}}
     )
     mode: t.Literal["precision", "recall", "f1"] = "f1"
+    beta: float = 1.0
     atomicity: t.Literal["low", "high"] = "low"
     coverage: t.Literal["low", "high"] = "low"
     claim_decomposition_prompt: PydanticPrompt = ClaimDecompositionPrompt()
@@ -204,6 +226,11 @@ def __post_init__(self):
             )
         self.segmenter = get_segmenter(language="english")
 
+        if type(self.beta) is not float:
+            raise ValueError(
+                "Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision."
+            )
+
     async def decompose_claims(
         self, response: str, callbacks: Callbacks
     ) -> t.List[str]:
@@ -253,21 +280,20 @@ async def _single_turn_ascore(
         else:
             response_reference = np.array([])
 
-        true_positives = sum(reference_response)
-        false_positives = sum(~reference_response)
+        tp = sum(reference_response)
+        fp = sum(~reference_response)
         if self.mode != "precision":
-            false_negatives = sum(~response_reference)
+            fn = sum(~response_reference)
         else:
-            false_negatives = 0
+            fn = 0
+
         if self.mode == "precision":
-            score = true_positives / (true_positives + false_positives + 1e-8)
+            score = tp / (tp + fp + 1e-8)
         elif self.mode == "recall":
-            score = true_positives / (true_positives + false_negatives + 1e-8)
+            score = tp / (tp + fn + 1e-8)
         else:
-            precision = true_positives / (true_positives + false_positives + 1e-8)
-            recall = true_positives / (true_positives + false_negatives + 1e-8)
-            score = 2 * (precision * recall) / (precision + recall + 1e-8)
+            score = fbeta_score(tp, fp, fn, self.beta)
 
         return np.round(score, 2)
 
diff --git a/src/ragas/metrics/utils.py b/src/ragas/metrics/utils.py
index 1b0b6ef2f..0f612c3fd 100644
--- a/src/ragas/metrics/utils.py
+++ b/src/ragas/metrics/utils.py
@@ -1,21 +1,22 @@
-from ragas.dataset_schema import EvaluationDataset
-from ragas.metrics import ALL_METRICS
-from ragas.metrics.base import Metric
-from ragas.validation import validate_required_columns
+def fbeta_score(tp, fp, fn, beta=1.0):
+    if tp + fp == 0:
+        precision = 0
+    else:
+        precision = tp / (tp + fp)
 
+    if tp + fn == 0:
+        recall = 0
+    else:
+        recall = tp / (tp + fn)
 
-def get_available_metrics(ds: EvaluationDataset) -> list[Metric]:
-    """
-    Get the available metrics for the given dataset.
-    E.g. if the dataset contains ("question", "answer", "contexts") columns,
-    the available metrics are those that can be evaluated in [qa, qac, qc] mode.
-    """
-    available_metrics = []
-    for metric in ALL_METRICS:
-        try:
-            validate_required_columns(ds, [metric])
-            available_metrics.append(metric)
-        except ValueError:
-            pass
+    if precision == 0 and recall == 0:
+        return 0.0
 
-    return available_metrics
+    beta_squared = beta**2
+    fbeta = (
+        (1 + beta_squared)
+        * (precision * recall)
+        / ((beta_squared * precision) + recall)
+    )
+
+    return fbeta
diff --git a/tests/unit/test_metric.py b/tests/unit/test_metric.py
index 4f589318d..7c8026cba 100644
--- a/tests/unit/test_metric.py
+++ b/tests/unit/test_metric.py
@@ -1,22 +1,8 @@
 import typing as t
 from dataclasses import dataclass, field
 
-from ragas.dataset_schema import EvaluationDataset, SingleTurnSample
+from ragas.dataset_schema import SingleTurnSample
 from ragas.metrics.base import MetricType
-from ragas.metrics.utils import get_available_metrics
-
-
-def test_get_available_metrics():
-    sample1 = SingleTurnSample(user_input="What is X", response="Y")
-    sample2 = SingleTurnSample(user_input="What is Z", response="W")
-    ds = EvaluationDataset(samples=[sample1, sample2])
-
-    assert all(
-        [
-            m.required_columns["SINGLE_TURN"] == {"response", "user_input"}
-            for m in get_available_metrics(ds)
-        ]
-    ), "All metrics should have required columns ('user_input', 'response')"
 
 
 def test_single_turn_metric():
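
Usage sketch (illustrative, not part of the patch): the snippet below shows how the new
beta parameter and the fbeta_score helper introduced above are intended to be used. The
TP/FP/FN counts are made-up numbers, and the AnswerCorrectness instance still needs an LLM
and embeddings wired in through the usual ragas evaluation flow before it can score samples.

    from ragas.metrics import AnswerCorrectness
    from ragas.metrics.utils import fbeta_score

    # Direct use of the helper: beta > 1 leans toward recall, beta < 1 toward precision.
    tp, fp, fn = 8, 2, 4                      # made-up claim counts
    f1 = fbeta_score(tp, fp, fn, beta=1.0)    # harmonic mean of precision and recall
    f2 = fbeta_score(tp, fp, fn, beta=2.0)    # recall-weighted
    f05 = fbeta_score(tp, fp, fn, beta=0.5)   # precision-weighted

    # AnswerCorrectness forwards beta to fbeta_score in _compute_statement_presence.
    # beta must be a float, otherwise __post_init__ raises ValueError.
    recall_leaning_correctness = AnswerCorrectness(beta=2.0)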