From f2ec7782a209b58ea0b761eae72f11e53154b783 Mon Sep 17 00:00:00 2001
From: ikka
Date: Tue, 24 Dec 2024 15:12:20 +0530
Subject: [PATCH] fix: remove reproducibility arg (#1790)

Remove the reproducibility arg in light of the alignment feature, which
better solves the same problem.

fixes: https://github.com/explodinggradients/ragas/issues/1711
---
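
Before this patch, metrics that supported it exposed a `reproducibility`
knob (sample the judge LLM n times per verdict) and `evaluate()` accepted
`in_ci` to raise that knob automatically in CI. A minimal sketch of the
removed usage; `dataset` and `evaluator_llm` are illustrative
placeholders, not names from this patch:

    from ragas import evaluate
    from ragas.metrics import Faithfulness

    metric = Faithfulness(llm=evaluator_llm)
    metric.reproducibility = 3  # removed: n samples per verdict, odd only
    result = evaluate(dataset, metrics=[metric], in_ci=True)  # in_ci removed

After this patch the same call is simply:

    result = evaluate(dataset, metrics=[metric])
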
 src/ragas/evaluation.py                 | 16 ----------------
 src/ragas/integrations/llama_index.py   |  1 -
 src/ragas/metrics/_context_precision.py | 18 ------------------
 src/ragas/metrics/_context_recall.py    | 23 -----------------------
 src/ragas/metrics/_faithfulness.py      | 17 -----------------
 src/ragas/metrics/_noise_sensitivity.py | 17 -----------------
 src/ragas/metrics/base.py               |  7 -------
 7 files changed, 99 deletions(-)

diff --git a/src/ragas/evaluation.py b/src/ragas/evaluation.py
index 7f3bddeb2..6128901de 100644
--- a/src/ragas/evaluation.py
+++ b/src/ragas/evaluation.py
@@ -35,7 +35,6 @@
     MetricWithLLM,
     MultiTurnMetric,
     SingleTurnMetric,
-    is_reproducable,
 )
 from ragas.run_config import RunConfig
 from ragas.utils import convert_v1_to_v2_dataset
@@ -60,7 +59,6 @@ def evaluate(
     llm: t.Optional[BaseRagasLLM | LangchainLLM] = None,
     embeddings: t.Optional[BaseRagasEmbeddings | LangchainEmbeddings] = None,
     callbacks: Callbacks = None,
-    in_ci: bool = False,
     run_config: t.Optional[RunConfig] = None,
     token_usage_parser: t.Optional[TokenUsageParser] = None,
     raise_exceptions: bool = False,
@@ -93,10 +91,6 @@ def evaluate(
         Lifecycle Langchain Callbacks to run during evaluation. Check the
         [langchain documentation](https://python.langchain.com/docs/modules/callbacks/)
         for more information.
-    in_ci: bool
-        Whether the evaluation is running in CI or not. If set to True then some
-        metrics will be run to increase the reproducability of the evaluations. This
-        will increase the runtime and cost of evaluations. Default is False.
     run_config: RunConfig, optional
         Configuration for runtime settings like timeout and retries. If not
         provided, default values are used.
@@ -193,7 +187,6 @@ def evaluate(
     binary_metrics = []
     llm_changed: t.List[int] = []
     embeddings_changed: t.List[int] = []
-    reproducable_metrics: t.List[int] = []
     answer_correctness_is_set = -1

     # loop through the metrics and perform initializations
@@ -214,12 +207,6 @@ def evaluate(
         if isinstance(metric, AnswerCorrectness):
             if metric.answer_similarity is None:
                 answer_correctness_is_set = i
-        # set reproducibility for metrics if in CI
-        if in_ci and is_reproducable(metric):
-            if metric.reproducibility == 1:  # type: ignore
-                # only set a value if not already set
-                metric.reproducibility = 3  # type: ignore
-            reproducable_metrics.append(i)

         # init all the models
         metric.init(run_config)
@@ -354,9 +341,6 @@ def evaluate(
             AnswerCorrectness, metrics[answer_correctness_is_set]
         ).answer_similarity = None

-    for i in reproducable_metrics:
-        metrics[i].reproducibility = 1  # type: ignore
-
     # flush the analytics batcher
     from ragas._analytics import _analytics_batcher

diff --git a/src/ragas/integrations/llama_index.py b/src/ragas/integrations/llama_index.py
index b93961800..0eb4f5aef 100644
--- a/src/ragas/integrations/llama_index.py
+++ b/src/ragas/integrations/llama_index.py
@@ -97,7 +97,6 @@ def evaluate(
         callbacks=callbacks,
         show_progress=show_progress,
         run_config=run_config or RunConfig(),
-        in_ci=in_ci,
         token_usage_parser=token_usage_parser,
     )

diff --git a/src/ragas/metrics/_context_precision.py b/src/ragas/metrics/_context_precision.py
index b10e61cfc..f32b3c4b2 100644
--- a/src/ragas/metrics/_context_precision.py
+++ b/src/ragas/metrics/_context_precision.py
@@ -109,27 +109,10 @@ class LLMContextPrecisionWithReference(MetricWithLLM, SingleTurnMetric):
         default_factory=ContextPrecisionPrompt
     )
     max_retries: int = 1
-    _reproducibility: int = 1

     def _get_row_attributes(self, row: t.Dict) -> t.Tuple[str, t.List[str], t.Any]:
         return row["user_input"], row["retrieved_contexts"], row["reference"]

-    @property
-    def reproducibility(self):
-        return self._reproducibility
-
-    @reproducibility.setter
-    def reproducibility(self, value):
-        if value < 1:
-            logger.warning("reproducibility cannot be less than 1, setting to 1")
-            value = 1
-        elif value % 2 == 0:
-            logger.warning(
-                "reproducibility level cannot be set to even number, setting to odd"
-            )
-            value += 1
-        self._reproducibility = value
-
     def _calculate_average_precision(
         self, verifications: t.List[Verification]
     ) -> float:
@@ -173,7 +156,6 @@ async def _ascore(
                 context=context,
                 answer=reference,
             ),
-            n=self.reproducibility,
             llm=self.llm,
             callbacks=callbacks,
         )

diff --git a/src/ragas/metrics/_context_recall.py b/src/ragas/metrics/_context_recall.py
index 0f7122cff..7af9915f9 100644
--- a/src/ragas/metrics/_context_recall.py
+++ b/src/ragas/metrics/_context_recall.py
@@ -113,28 +113,6 @@ class LLMContextRecall(MetricWithLLM, SingleTurnMetric):
         default_factory=ContextRecallClassificationPrompt
     )
     max_retries: int = 1
-    _reproducibility: int = 1
-
-    @property
-    def reproducibility(self):
-        return self._reproducibility
-
-    @reproducibility.setter
-    def reproducibility(self, value):
-        if value < 1:
-            logger.warning("reproducibility cannot be less than 1, setting to 1")
-            value = 1
-        elif value % 2 == 0:
-            logger.warning(
-                "reproducibility level cannot be set to even number, setting to odd"
-            )
-            value += 1
-        self._reproducibility = value
-
-    def __post_init__(self) -> None:
-        if self.reproducibility < 1:
-            logger.warning("reproducibility cannot be less than 1, setting to 1")
-            self.reproducibility = 1

     def _compute_score(self, responses: t.List[ContextRecallClassification]) -> float:
         response = [1 if item.attributed else 0 for item in responses]
@@ -166,7 +144,6 @@ async def _ascore(self, row: t.Dict, callbacks: Callbacks) -> float:
                 ),
                 llm=self.llm,
                 callbacks=callbacks,
-                n=self.reproducibility,
             )
         )
         classification_dicts = []

diff --git a/src/ragas/metrics/_faithfulness.py b/src/ragas/metrics/_faithfulness.py
index 5cf1229a7..f689bc1a8 100644
--- a/src/ragas/metrics/_faithfulness.py
+++ b/src/ragas/metrics/_faithfulness.py
@@ -178,23 +178,6 @@ class Faithfulness(MetricWithLLM, SingleTurnMetric):
     statement_prompt: PydanticPrompt = field(default_factory=LongFormAnswerPrompt)
     sentence_segmenter: t.Optional[HasSegmentMethod] = None
     max_retries: int = 1
-    _reproducibility: int = 1
-
-    @property
-    def reproducibility(self):
-        return self._reproducibility
-
-    @reproducibility.setter
-    def reproducibility(self, value):
-        if value < 1:
-            logger.warning("reproducibility cannot be less than 1, setting to 1")
-            value = 1
-        elif value % 2 == 0:
-            logger.warning(
-                "reproducibility level cannot be set to even number, setting to odd"
-            )
-            value += 1
-        self._reproducibility = value

     def __post_init__(self):
         if self.sentence_segmenter is None:

diff --git a/src/ragas/metrics/_noise_sensitivity.py b/src/ragas/metrics/_noise_sensitivity.py
index c7026de16..7856f0426 100644
--- a/src/ragas/metrics/_noise_sensitivity.py
+++ b/src/ragas/metrics/_noise_sensitivity.py
@@ -49,23 +49,6 @@ class NoiseSensitivity(MetricWithLLM, SingleTurnMetric):
     statement_prompt: PydanticPrompt = field(default_factory=LongFormAnswerPrompt)
     sentence_segmenter: t.Optional[HasSegmentMethod] = None
     max_retries: int = 1
-    _reproducibility: int = 1
-
-    @property
-    def reproducibility(self):
-        return self._reproducibility
-
-    @reproducibility.setter
-    def reproducibility(self, value):
-        if value < 1:
-            logger.warning("reproducibility cannot be less than 1, setting to 1")
-            value = 1
-        elif value % 2 == 0:
-            logger.warning(
-                "reproducibility level cannot be set to even number, setting to odd"
-            )
-            value += 1
-        self._reproducibility = value

     def __post_init__(self):
         if self.sentence_segmenter is None:

diff --git a/src/ragas/metrics/base.py b/src/ragas/metrics/base.py
index ffd6d038d..daf7b8d03 100644
--- a/src/ragas/metrics/base.py
+++ b/src/ragas/metrics/base.py
@@ -712,11 +712,4 @@ def get_segmenter(
     )


-def is_reproducable(metric: Metric) -> bool:
-    """
-    Check if a metric is reproducible by checking if it has a `_reproducibility` attribute.
-    """
-    return hasattr(metric, "_reproducibility")
-
-
 ensembler = Ensember()
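
Note on the removed setters above: `reproducibility` was coerced to an
odd value because the sampled verdicts appear to have been combined by
majority vote (via the `ensembler` kept in base.py), and an even sample
count can tie. A rough illustrative sketch of that aggregation, not the
ragas implementation; `majority` and `verdicts` are made-up names:

    from collections import Counter

    def majority(verdicts: list[int]) -> int:
        # With an odd number of verdicts there is always a strict
        # winner, which is why the setters bumped even values to odd.
        return Counter(verdicts).most_common(1)[0][0]

    assert majority([1, 0, 1]) == 1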