From f2ec7782a209b58ea0b761eae72f11e53154b783 Mon Sep 17 00:00:00 2001
From: ikka
Date: Tue, 24 Dec 2024 15:12:20 +0530
Subject: [PATCH] fix: remove reproducibility arg (#1790)

Remove the reproducibility arg in light of the alignment feature, which
better solves the same problem.

fixes: https://github.com/explodinggradients/ragas/issues/1711
---
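
Before this patch, metrics that supported it exposed a `reproducibility`
knob (sample the judge LLM n times per verdict) and `evaluate()` accepted
`in_ci` to raise that knob automatically in CI. A minimal sketch of the
removed usage; `dataset` and `evaluator_llm` are illustrative
placeholders, not names from this patch:

    from ragas import evaluate
    from ragas.metrics import Faithfulness

    metric = Faithfulness(llm=evaluator_llm)
    metric.reproducibility = 3  # removed: n samples per verdict, odd only
    result = evaluate(dataset, metrics=[metric], in_ci=True)  # in_ci removed

After this patch the same call is simply:

    result = evaluate(dataset, metrics=[metric])
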
 src/ragas/evaluation.py                 | 16 ----------------
 src/ragas/integrations/llama_index.py   |  1 -
 src/ragas/metrics/_context_precision.py | 18 ------------------
 src/ragas/metrics/_context_recall.py    | 23 -----------------------
 src/ragas/metrics/_faithfulness.py      | 17 -----------------
 src/ragas/metrics/_noise_sensitivity.py | 17 -----------------
 src/ragas/metrics/base.py               |  7 -------
 7 files changed, 99 deletions(-)

diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py
index 7f3bddeb2..6128901de 100644
--- a/src/ragas/evaluation.py
+++ b/src/ragas/evaluation.py
@@ -35,7 +35,6 @@
     MetricWithLLM,
     MultiTurnMetric,
     SingleTurnMetric,
-    is_reproducable,
 )
 from ragas.run_config import RunConfig
 from ragas.utils import convert_v1_to_v2_dataset
@@ -60,7 +59,6 @@ def evaluate(
     llm: t.Optional[BaseRagasLLM | LangchainLLM] = None,
     embeddings: t.Optional[BaseRagasEmbeddings | LangchainEmbeddings] = None,
     callbacks: Callbacks = None,
-    in_ci: bool = False,
     run_config: t.Optional[RunConfig] = None,
     token_usage_parser: t.Optional[TokenUsageParser] = None,
     raise_exceptions: bool = False,
@@ -93,10 +91,6 @@ def evaluate(
         Lifecycle Langchain Callbacks to run during evaluation. Check the
         [langchain documentation](https://python.langchain.com/docs/modules/callbacks/)
         for more information.
-    in_ci: bool
-        Whether the evaluation is running in CI or not. If set to True then some
-        metrics will be run to increase the reproducability of the evaluations. This
-        will increase the runtime and cost of evaluations. Default is False.
     run_config: RunConfig, optional
         Configuration for runtime settings like timeout and retries. If not
         provided, default values are used.
@@ -193,7 +187,6 @@ def evaluate(
     binary_metrics = []
     llm_changed: t.List[int] = []
     embeddings_changed: t.List[int] = []
-    reproducable_metrics: t.List[int] = []
     answer_correctness_is_set = -1

     # loop through the metrics and perform initializations
@@ -214,12 +207,6 @@ def evaluate(
         if isinstance(metric, AnswerCorrectness):
             if metric.answer_similarity is None:
                 answer_correctness_is_set = i
-        # set reproducibility for metrics if in CI
-        if in_ci and is_reproducable(metric):
-            if metric.reproducibility == 1:  # type: ignore
-                # only set a value if not already set
-                metric.reproducibility = 3  # type: ignore
-            reproducable_metrics.append(i)

         # init all the models
         metric.init(run_config)
@@ -354,9 +341,6 @@ def evaluate(
             AnswerCorrectness, metrics[answer_correctness_is_set]
         ).answer_similarity = None

-    for i in reproducable_metrics:
-        metrics[i].reproducibility = 1  # type: ignore
-
     # flush the analytics batcher
     from ragas._analytics import _analytics_batcher

diff --git a/src/ragas/integrations/llama_index.py b/src/ragas/integrations/llama_index.py
index b93961800..0eb4f5aef 100644
--- a/src/ragas/integrations/llama_index.py
+++ b/src/ragas/integrations/llama_index.py
@@ -97,7 +97,6 @@ def evaluate(
         callbacks=callbacks,
         show_progress=show_progress,
         run_config=run_config or RunConfig(),
-        in_ci=in_ci,
         token_usage_parser=token_usage_parser,
     )

diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py
index b10e61cfc..f32b3c4b2 100644
--- a/src/ragas/metrics/_context_precision.py
+++ b/src/ragas/metrics/_context_precision.py
@@ -109,27 +109,10 @@ class LLMContextPrecisionWithReference(MetricWithLLM, SingleTurnMetric):
         default_factory=ContextPrecisionPrompt
     )
     max_retries: int = 1
-    _reproducibility: int = 1

     def _get_row_attributes(self, row: t.Dict) -> t.Tuple[str, t.List[str], t.Any]:
         return row["user_input"], row["retrieved_contexts"], row["reference"]

-    @property
-    def reproducibility(self):
-        return self._reproducibility
-
-    @reproducibility.setter
-    def reproducibility(self, value):
-        if value < 1:
-            logger.warning("reproducibility cannot be less than 1, setting to 1")
-            value = 1
-        elif value % 2 == 0:
-            logger.warning(
-                "reproducibility level cannot be set to even number, setting to odd"
-            )
-            value += 1
-        self._reproducibility = value
-
     def _calculate_average_precision(
         self, verifications: t.List[Verification]
     ) -> float:
@@ -173,7 +156,6 @@ async def _ascore(
                 context=context,
                 answer=reference,
             ),
-            n=self.reproducibility,
             llm=self.llm,
             callbacks=callbacks,
         )

diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py
index 0f7122cff..7af9915f9 100644
--- a/src/ragas/metrics/_context_recall.py
+++ b/src/ragas/metrics/_context_recall.py
@@ -113,28 +113,6 @@ class LLMContextRecall(MetricWithLLM, SingleTurnMetric):
         default_factory=ContextRecallClassificationPrompt
     )
     max_retries: int = 1
-    _reproducibility: int = 1
-
-    @property
-    def reproducibility(self):
-        return self._reproducibility
-
-    @reproducibility.setter
-    def reproducibility(self, value):
-        if value < 1:
-            logger.warning("reproducibility cannot be less than 1, setting to 1")
-            value = 1
-        elif value % 2 == 0:
-            logger.warning(
-                "reproducibility level cannot be set to even number, setting to odd"
-            )
-            value += 1
-        self._reproducibility = value
-
-    def __post_init__(self) -> None:
-        if self.reproducibility < 1:
-            logger.warning("reproducibility cannot be less than 1, setting to 1")
-            self.reproducibility = 1

     def _compute_score(self, responses: t.List[ContextRecallClassification]) -> float:
         response = [1 if item.attributed else 0 for item in responses]
@@ -166,7 +144,6 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
                 ),
                 llm=self.llm,
                 callbacks=callbacks,
-                n=self.reproducibility,
             )
         )
         classification_dicts = []

diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py
index 5cf1229a7..f689bc1a8 100644
--- a/src/ragas/metrics/_faithfulness.py
+++ b/src/ragas/metrics/_faithfulness.py
@@ -178,23 +178,6 @@ class Faithfulness(MetricWithLLM, SingleTurnMetric):
     statement_prompt: PydanticPrompt = field(default_factory=LongFormAnswerPrompt)
     sentence_segmenter: t.Optional[HasSegmentMethod] = None
     max_retries: int = 1
-    _reproducibility: int = 1
-
-    @property
-    def reproducibility(self):
-        return self._reproducibility
-
-    @reproducibility.setter
-    def reproducibility(self, value):
-        if value < 1:
-            logger.warning("reproducibility cannot be less than 1, setting to 1")
-            value = 1
-        elif value % 2 == 0:
-            logger.warning(
-                "reproducibility level cannot be set to even number, setting to odd"
-            )
-            value += 1
-        self._reproducibility = value

     def __post_init__(self):
         if self.sentence_segmenter is None:

diff --git a/src/ragas/metrics/_noise_sensitivity.py b/src/ragas/metrics/_noise_sensitivity.py
index c7026de16..7856f0426 100644
--- a/src/ragas/metrics/_noise_sensitivity.py
+++ b/src/ragas/metrics/_noise_sensitivity.py
@@ -49,23 +49,6 @@ class NoiseSensitivity(MetricWithLLM, SingleTurnMetric):
     statement_prompt: PydanticPrompt = field(default_factory=LongFormAnswerPrompt)
     sentence_segmenter: t.Optional[HasSegmentMethod] = None
     max_retries: int = 1
-    _reproducibility: int = 1
-
-    @property
-    def reproducibility(self):
-        return self._reproducibility
-
-    @reproducibility.setter
-    def reproducibility(self, value):
-        if value < 1:
-            logger.warning("reproducibility cannot be less than 1, setting to 1")
-            value = 1
-        elif value % 2 == 0:
-            logger.warning(
-                "reproducibility level cannot be set to even number, setting to odd"
-            )
-            value += 1
-        self._reproducibility = value

     def __post_init__(self):
         if self.sentence_segmenter is None:

diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py
index ffd6d038d..daf7b8d03 100644
--- a/src/ragas/metrics/base.py
+++ b/src/ragas/metrics/base.py
@@ -712,11 +712,4 @@ def get_segmenter(
     )


-def is_reproducable(metric: Metric) -> bool:
-    """
-    Check if a metric is reproducible by checking if it has a `_reproducibility` attribute.
-    """
-    return hasattr(metric, "_reproducibility")
-
-
 ensembler = Ensember()
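
Note on the removed setters above: `reproducibility` was coerced to an
odd value because the sampled verdicts appear to have been combined by
majority vote (via the `ensembler` kept in base.py), and an even sample
count can tie. A rough illustrative sketch of that aggregation, not the
ragas implementation; `majority` and `verdicts` are made-up names:

    from collections import Counter

    def majority(verdicts: list[int]) -> int:
        # With an odd number of verdicts there is always a strict
        # winner, which is why the setters bumped even values to odd.
        return Counter(verdicts).most_common(1)[0][0]

    assert majority([1, 0, 1]) == 1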