diff --git a/src/ragas/metrics/__init__.py b/src/ragas/metrics/__init__.py
index 12862288e..01cab7d46 100644
--- a/src/ragas/metrics/__init__.py
+++ b/src/ragas/metrics/__init__.py
@@ -1,6 +1,3 @@
-import inspect
-import sys
-
 from ragas.metrics._answer_correctness import AnswerCorrectness, answer_correctness
 from ragas.metrics._answer_relevance import (
     AnswerRelevancy,
@@ -120,10 +117,3 @@
     "MultiModalRelevance",
     "multimodal_relevance",
 ]
-
-current_module = sys.modules[__name__]
-ALL_METRICS = [
-    obj
-    for name, obj in inspect.getmembers(current_module)
-    if name in __all__ and not inspect.isclass(obj) and not inspect.isbuiltin(obj)
-]
diff --git a/src/ragas/metrics/_answer_correctness.py b/src/ragas/metrics/_answer_correctness.py
index edbd138ec..aa61dd3dc 100644
--- a/src/ragas/metrics/_answer_correctness.py
+++ b/src/ragas/metrics/_answer_correctness.py
@@ -21,6 +21,7 @@
     SingleTurnMetric,
     get_segmenter,
 )
+from ragas.metrics.utils import fbeta_score
 from ragas.prompt import PydanticPrompt
 from ragas.run_config import RunConfig
 
@@ -167,6 +168,7 @@ class AnswerCorrectness(MetricWithLLM, MetricWithEmbeddings, SingleTurnMetric):
         default_factory=LongFormAnswerPrompt
     )
     weights: list[float] = field(default_factory=lambda: [0.75, 0.25])
+    beta: float = 1.0
     answer_similarity: t.Optional[AnswerSimilarity] = None
     sentence_segmenter: t.Optional[HasSegmentMethod] = None
     max_retries: int = 1
@@ -185,6 +187,11 @@ def __post_init__(self: t.Self):
             language = self.long_form_answer_prompt.language
             self.sentence_segmenter = get_segmenter(language=language, clean=False)
 
+        if type(self.beta) is not float:
+            raise ValueError(
+                "Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision."
+            )
+
     def init(self, run_config: RunConfig):
         super().init(run_config)
         if self.answer_similarity is None and self.weights[1] != 0:
@@ -198,7 +205,7 @@ def _compute_statement_presence(
         tp = len(prediction.TP)
         fp = len(prediction.FP)
         fn = len(prediction.FN)
-        score = tp / (tp + 0.5 * (fp + fn)) if tp > 0 else 0
+        score = fbeta_score(tp, fp, fn, self.beta)
         return score
 
     async def _create_simplified_statements(
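For context: at the default beta the `_compute_statement_presence` change above is behaviour-preserving, since the old expression `tp / (tp + 0.5 * (fp + fn))` is algebraically the same as F1. A quick sanity check against the `fbeta_score` helper added in `src/ragas/metrics/utils.py` below (the counts here are illustrative, not from the patch):

    from ragas.metrics.utils import fbeta_score

    tp, fp, fn = 4, 2, 1
    # Old formula: F1 written as tp / (tp + (fp + fn) / 2).
    old_score = tp / (tp + 0.5 * (fp + fn)) if tp > 0 else 0
    # New call with the default beta reduces to the same F1 value (~0.7273).
    new_score = fbeta_score(tp, fp, fn, beta=1.0)
    assert abs(old_score - new_score) < 1e-9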
Default is "low". + coverage (Literal["low", "high"]): The level of coverage for claim decomposition. Default is "low". + claim_decomposition_prompt (PydanticPrompt): The prompt used for claim decomposition. + nli_prompt (PydanticPrompt): The prompt used for natural language inference (NLI). + + """ + name: str = "factual_correctness" # type: ignore _required_columns: t.Dict[MetricType, t.Set[str]] = field( default_factory=lambda: {MetricType.SINGLE_TURN: {"response", "reference"}} ) mode: t.Literal["precision", "recall", "f1"] = "f1" + beta: float = 1.0 atomicity: t.Literal["low", "high"] = "low" coverage: t.Literal["low", "high"] = "low" claim_decomposition_prompt: PydanticPrompt = ClaimDecompositionPrompt() @@ -204,6 +226,11 @@ def __post_init__(self): ) self.segmenter = get_segmenter(language="english") + if type(self.beta) is not float: + raise ValueError( + "Beta must be a float. A beta > 1 gives more weight to recall, while beta < 1 favors precision." + ) + async def decompose_claims( self, response: str, callbacks: Callbacks ) -> t.List[str]: @@ -253,21 +280,20 @@ async def _single_turn_ascore( else: response_reference = np.array([]) - true_positives = sum(reference_response) - false_positives = sum(~reference_response) + tp = sum(reference_response) + fp = sum(~reference_response) if self.mode != "precision": - false_negatives = sum(~response_reference) + fn = sum(~response_reference) else: - false_negatives = 0 + fn = 0 + if self.mode == "precision": - score = true_positives / (true_positives + false_positives + 1e-8) + score = tp / (tp + fp + 1e-8) elif self.mode == "recall": - score = true_positives / (true_positives + false_negatives + 1e-8) + score = tp / (tp + fp + 1e-8) else: - precision = true_positives / (true_positives + false_positives + 1e-8) - recall = true_positives / (true_positives + false_negatives + 1e-8) - score = 2 * (precision * recall) / (precision + recall + 1e-8) + score = fbeta_score(tp, fp, fn, self.beta) return np.round(score, 2) diff --git a/src/ragas/metrics/utils.py b/src/ragas/metrics/utils.py index 1b0b6ef2f..0f612c3fd 100644 --- a/src/ragas/metrics/utils.py +++ b/src/ragas/metrics/utils.py @@ -1,21 +1,22 @@ -from ragas.dataset_schema import EvaluationDataset -from ragas.metrics import ALL_METRICS -from ragas.metrics.base import Metric -from ragas.validation import validate_required_columns +def fbeta_score(tp, fp, fn, beta=1.0): + if tp + fp == 0: + precision = 0 + else: + precision = tp / (tp + fp) + if tp + fn == 0: + recall = 0 + else: + recall = tp / (tp + fn) -def get_available_metrics(ds: EvaluationDataset) -> list[Metric]: - """ - Get the available metrics for the given dataset. - E.g. if the dataset contains ("question", "answer", "contexts") columns, - the available metrics are those that can be evaluated in [qa, qac, qc] mode. 
- """ - available_metrics = [] - for metric in ALL_METRICS: - try: - validate_required_columns(ds, [metric]) - available_metrics.append(metric) - except ValueError: - pass + if precision == 0 and recall == 0: + return 0.0 - return available_metrics + beta_squared = beta**2 + fbeta = ( + (1 + beta_squared) + * (precision * recall) + / ((beta_squared * precision) + recall) + ) + + return fbeta diff --git a/tests/unit/test_metric.py b/tests/unit/test_metric.py index 4f589318d..7c8026cba 100644 --- a/tests/unit/test_metric.py +++ b/tests/unit/test_metric.py @@ -1,22 +1,8 @@ import typing as t from dataclasses import dataclass, field -from ragas.dataset_schema import EvaluationDataset, SingleTurnSample +from ragas.dataset_schema import SingleTurnSample from ragas.metrics.base import MetricType -from ragas.metrics.utils import get_available_metrics - - -def test_get_available_metrics(): - sample1 = SingleTurnSample(user_input="What is X", response="Y") - sample2 = SingleTurnSample(user_input="What is Z", response="W") - ds = EvaluationDataset(samples=[sample1, sample2]) - - assert all( - [ - m.required_columns["SINGLE_TURN"] == {"response", "user_input"} - for m in get_available_metrics(ds) - ] - ), "All metrics should have required columns ('user_input', 'response')" def test_single_turn_metric():