From 61e11ef4af9759d3a4af3c7a52f18294c495926a Mon Sep 17 00:00:00 2001
From: Sadra Barikbin <sadraqazvin1@yahoo.com>
Date: Tue, 23 Jul 2024 16:20:42 +0330
Subject: [PATCH 1/4] Working on fewshot

---
 src/lighteval/tasks/default_prompts.py |  8 ++------
 src/lighteval/tasks/prompt_manager.py  |  8 +++-----
 src/lighteval/tasks/requests.py        | 15 ++++++---------
 3 files changed, 11 insertions(+), 20 deletions(-)

diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
index 3a9b97f0..136acb77 100644
--- a/src/lighteval/tasks/default_prompts.py
+++ b/src/lighteval/tasks/default_prompts.py
@@ -176,7 +176,6 @@ def bbh_harness(line, task_name: str = None):
         query=query,
         choices=choices,
         gold_index=correct_index,
-        target_for_fewshot_sorting=choices,
         instruction=line.get("task_prefix", None),
     )
 
@@ -196,7 +195,6 @@ def bbh_lighteval(line, task_name: str = None):
         query=query,
         choices=LETTER_INDICES[: len(line["choices"])],
         gold_index=line["target_idx"],
-        target_for_fewshot_sorting=LETTER_INDICES[: len(line["choices"])],
         instruction=line.get("task_prefix", None),
     )
 
@@ -205,9 +203,8 @@ def bbh(line, instruction, choices, task_name: str = None):
     return Doc(
         task_name=task_name,
         query=f"{instruction}Q: {line['input']}\nA:",
-        choices=choices,
+        choices=[(' ' if line["__few_shots"] else '') + c for c in choices],
         gold_index=choices.index(line["target"]),
-        target_for_fewshot_sorting=[f" {c}" for c in choices],
         instruction=instruction,
     )
 
@@ -793,10 +790,9 @@ def hellaswag_helm(line, task_name: str = None):
     return Doc(
         task_name=task_name,
         query=query,
-        choices=[" " + i for i in LETTER_INDICES[: len(line["endings"])]],
+        choices=[" " + i for i in LETTER_INDICES[: len(line["endings"])]] + ([""] if line["__fewshot"] else []),
         gold_index=gold_ix,  # -1 for test,
         instruction="The following are multiple choice questions (with answers) about common sense.\n\n",
-        target_for_fewshot_sorting=line["endings"][gold_ix] if gold_ix > -1 else "",
         specific={
             "label_to_choices": {f" {key}": choice for key, choice in zip(LETTER_INDICES, line["endings"])},
         },
diff --git a/src/lighteval/tasks/prompt_manager.py b/src/lighteval/tasks/prompt_manager.py
index ad4c8fc8..ff38f922 100644
--- a/src/lighteval/tasks/prompt_manager.py
+++ b/src/lighteval/tasks/prompt_manager.py
@@ -65,20 +65,18 @@ def doc_to_text(doc: Doc, return_instructions: bool = False) -> Union[str, Tuple
         )
 
     @staticmethod
-    def doc_to_target(formatted_doc: Doc, few_shot: bool = False) -> str:
+    def doc_to_target(formatted_doc: Doc) -> str:
         """
         Returns the target of the given document.
 
         Args:
             formatted_doc (Doc): Formatted document.
-            few_shot (bool, optional): Whether the document is used for few
-                shot examples. Defaults to False.
 
         Returns:
             str: Target of the document, which is the correct answer for a document.
         """
         # likely we mostly need one example not all
-        return as_list(formatted_doc.get_golds(few_shot=few_shot))[0]
+        return as_list(formatted_doc.get_golds())[0]
 
     def add_context_to_doc(
         self,
@@ -363,7 +361,7 @@ def _init_fewshot_sampling_balanced(
         # Sort by counts of labels
         label_to_instances = defaultdict(list)
         for instance in fewshotpool:
-            target = PromptManager.doc_to_target(instance, few_shot=True)
+            target = instance.get_target_for_fewshot_sorting()
             label_to_instances[target].append(instance)
 
         counts_to_labels = defaultdict(list)
diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py
index dd8b0d6d..041b5506 100644
--- a/src/lighteval/tasks/requests.py
+++ b/src/lighteval/tasks/requests.py
@@ -177,7 +177,7 @@ class Doc:
 
     # For few-shot
     instruction: Optional[str] = ""
-    target_for_fewshot_sorting: Optional[str] = None  # will probably have to be removed in the future
+    target_for_fewshot_sorting: Optional[str] = None
 
     # Filled when parsing and adding the few-shot context
     ctx: Optional[str] = ""
@@ -193,19 +193,16 @@ def __post_init__(self):
         if self.instruction is None:
             self.instruction = ""
 
-    def get_golds(self, few_shot: bool = False):
+    def get_golds(self):
         """Return gold targets extracted from the target dict"""
         gold_indices = as_list(self.gold_index)
-        if few_shot and self.target_for_fewshot_sorting is not None:
-            choices = self.target_for_fewshot_sorting
-            if isinstance(choices, str):  # correct choice is already selected
-                return choices
-        else:
-            choices = self.choices
         golds = []
         for gold_ix in gold_indices:
-            golds.extend(as_list(choices[gold_ix]))
+            golds.extend(as_list(self.choices[gold_ix]))
         return golds
+    
+    def get_target_for_fewshot_sorting(self) -> str:
+        return self.target_for_fewshot_sorting or as_list(self.get_golds())[0]
 
     def __repr__(self):
         doc_dict = asdict(self)

From f7ed5ffc9adeb461d7f6738f7aef2f9f0e395a37 Mon Sep 17 00:00:00 2001
From: Sadra Barikbin <sadraqazvin1@yahoo.com>
Date: Sat, 27 Jul 2024 08:54:08 +0330
Subject: [PATCH 2/4] Adapt prompts to removing target_for_fewshot_sorting

---
 src/lighteval/tasks/default_prompts.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
index 136acb77..6db85f9e 100644
--- a/src/lighteval/tasks/default_prompts.py
+++ b/src/lighteval/tasks/default_prompts.py
@@ -1348,7 +1348,6 @@ def mmlu(line, topic, task_name: str = None):
         choices=[" A", " B", " C", " D"] if is_few_shots else ["A", "B", "C", "D"],
         gold_index=gold_ix,
         instruction=f"The following are multiple choice questions (with answers) about  {topic.replace('_', ' ')}.\n\n",
-        target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix],
     )
 
 
@@ -1369,7 +1368,6 @@ def custom_mmlu_thom(line, task_name: str = None):
         choices=[" A", " B", " C", " D"] if is_few_shots else ["A", "B", "C", "D"],
         gold_index=gold_ix,
         instruction=f"The following are multiple choice questions (with answers) about  {topic.replace('_', ' ')}.\n\n",
-        target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix],
     )
 
 
@@ -1609,7 +1607,6 @@ def mmlu_harness(line, task_name: str = None):
     query += "Answer:"
 
     gold_ix = LETTER_INDICES.index(line["answer"]) if isinstance(line["answer"], str) else line["answer"]
-    "__few_shots" in line and line["__few_shots"] is True  # We are adding few shots
 
     return Doc(
         task_name=task_name,
@@ -1617,7 +1614,6 @@ def mmlu_harness(line, task_name: str = None):
         choices=[" A", " B", " C", " D"],
         gold_index=gold_ix,
         instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n",
-        target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix],
     )
 
 
@@ -1628,14 +1624,14 @@ def mmlu_helm(line, task_name: str = None):
     query += "\nAnswer:"
 
     gold_ix = LETTER_INDICES.index(line["answer"]) if isinstance(line["answer"], str) else line["answer"]
+    is_few_shots = line.get("__few_shots", False)  # We are adding few shots
 
     return Doc(
         task_name=task_name,
         query=query,
-        choices=[" A", " B", " C", " D"],
+        choices=[" A", " B", " C", " D"] if not is_few_shots else ["A", "B", "C", "D"], # specific to HELM evals
         gold_index=gold_ix,
         instruction=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}.\n\n",
-        target_for_fewshot_sorting=line["choices"][gold_ix],  # specific to HELM evals
     )
 
 
@@ -1794,6 +1790,7 @@ def openbookqa_helm(line, task_name: str = None):
     query += "Answer: "
 
     gold_ix = ["A", "B", "C", "D", "E"].index(line["answerKey"].strip())
+    # I don't get this.
     return Doc(
         task_name=task_name,
         query=query,
@@ -1821,7 +1818,7 @@ def piqa_helm(line, task_name: str = None):
     query += "Answer: "
 
     gold_ix = int(line["label"])
-
+    # Also this.
     return Doc(
         task_name=task_name,
         query=query,
@@ -1861,7 +1858,7 @@ def pubmed_qa_helm(line, task_name: str = None):
     )
     query += f"\n\nQuestion: {line['question']}\nAnswer: "
     gold_ix = ["yes", "no", "maybe"].index(line["final_decision"])
-
+    # And this
     return Doc(
         task_name=task_name,
         query=query,
@@ -2247,7 +2244,7 @@ def truthful_qa_helm(line, task_name: str = None):
     query = f"Question: {line['question']}\n"
     query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, line["choices"])])
     query += "Answer:"
-
+    # And this.
     return Doc(
         task_name=task_name,
         query=query,

From 6146844b7137e6d4c126acf739c75e0034a57d3f Mon Sep 17 00:00:00 2001
From: Sadra Barikbin <sadraqazvin1@yahoo.com>
Date: Sat, 27 Jul 2024 09:30:46 +0330
Subject: [PATCH 3/4] Fix a bug related to target_for_fewshot_sorting

---
 src/lighteval/tasks/default_prompts.py | 8 --------
 src/lighteval/tasks/lighteval_task.py  | 6 ++----
 2 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
index 6db85f9e..902de3d1 100644
--- a/src/lighteval/tasks/default_prompts.py
+++ b/src/lighteval/tasks/default_prompts.py
@@ -1790,14 +1790,12 @@ def openbookqa_helm(line, task_name: str = None):
     query += "Answer: "
 
     gold_ix = ["A", "B", "C", "D", "E"].index(line["answerKey"].strip())
-    # I don't get this.
     return Doc(
         task_name=task_name,
         query=query,
         choices=["A", "B", "C", "D", "E"],
         gold_index=gold_ix,
         instruction="The following are multiple choice questions (with answers) about common sense.\n",
-        target_for_fewshot_sorting=line["choices"]["text"][gold_ix],  # specific to HELM evals
     )
 
 
@@ -1818,14 +1816,12 @@ def piqa_helm(line, task_name: str = None):
     query += "Answer: "
 
     gold_ix = int(line["label"])
-    # Also this.
     return Doc(
         task_name=task_name,
         query=query,
         choices=["A", "B"],
         gold_index=gold_ix,
         instruction="The following are multiple choice questions (with answers) about common sense.\n",
-        target_for_fewshot_sorting=[line["sol1"], line["sol2"]][gold_ix],
     )
 
 
@@ -1858,13 +1854,11 @@ def pubmed_qa_helm(line, task_name: str = None):
     )
     query += f"\n\nQuestion: {line['question']}\nAnswer: "
     gold_ix = ["yes", "no", "maybe"].index(line["final_decision"])
-    # And this
     return Doc(
         task_name=task_name,
         query=query,
         choices=["A", "B", "C"],
         gold_index=gold_ix,
-        target_for_fewshot_sorting=["yes", "no", "maybe"][gold_ix],
     )
 
 
@@ -2244,13 +2238,11 @@ def truthful_qa_helm(line, task_name: str = None):
     query = f"Question: {line['question']}\n"
     query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, line["choices"])])
     query += "Answer:"
-    # And this.
     return Doc(
         task_name=task_name,
         query=query,
         choices=LETTER_INDICES[: len(line["choices"])],
         gold_index=line["gold_index"],
-        target_for_fewshot_sorting=line["choices"][line["gold_index"]],
     )
 
 
diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py
index 00b4763b..e6411607 100644
--- a/src/lighteval/tasks/lighteval_task.py
+++ b/src/lighteval/tasks/lighteval_task.py
@@ -340,20 +340,18 @@ def eval_docs(self) -> list[Doc]:
                 self._docs = self.remove_duplicate_docs(self._docs)
         return self._docs
 
-    def doc_to_target(self, formatted_doc: Doc, few_shot: bool = False) -> str:
+    def doc_to_target(self, formatted_doc: Doc) -> str:
         """
         Returns the target of the given document.
 
         Args:
             formatted_doc (Doc): Formatted document.
-            few_shot (bool, optional): Whether the document is used for few
-                shot examples. Defaults to False.
 
         Returns:
             str: Target of the document, which is the correct answer for a document.
         """
         # likely we mostly need one example not all
-        return as_list(formatted_doc.get_golds(few_shot=few_shot))[0]
+        return as_list(formatted_doc.get_golds())[0]
 
     def construct_requests(
         self, formatted_doc: Doc, context: str, document_id_seed: str, current_task_name: str

From dc8134369ef45c0946e17da5d9b208ff1788c82f Mon Sep 17 00:00:00 2001
From: Sadra Barikbin <sadraqazvin1@yahoo.com>
Date: Sat, 17 Aug 2024 20:15:46 +0330
Subject: [PATCH 4/4] Fix a tiny bug and apply ruff

---
 src/lighteval/tasks/default_prompts.py | 6 +++---
 src/lighteval/tasks/prompt_manager.py  | 4 +---
 src/lighteval/tasks/requests.py        | 2 +-
 3 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py
index 902de3d1..37ed922e 100644
--- a/src/lighteval/tasks/default_prompts.py
+++ b/src/lighteval/tasks/default_prompts.py
@@ -203,7 +203,7 @@ def bbh(line, instruction, choices, task_name: str = None):
     return Doc(
         task_name=task_name,
         query=f"{instruction}Q: {line['input']}\nA:",
-        choices=[(' ' if line["__few_shots"] else '') + c for c in choices],
+        choices=[(" " if line.get("__few_shots", False) else "") + c for c in choices],
         gold_index=choices.index(line["target"]),
         instruction=instruction,
     )
@@ -790,7 +790,7 @@ def hellaswag_helm(line, task_name: str = None):
     return Doc(
         task_name=task_name,
         query=query,
-        choices=[" " + i for i in LETTER_INDICES[: len(line["endings"])]] + ([""] if line["__fewshot"] else []),
+        choices=[" " + i for i in LETTER_INDICES[: len(line["endings"])]] + ([""] if line.get("__few_shots", False) else []),
         gold_index=gold_ix,  # -1 for test,
         instruction="The following are multiple choice questions (with answers) about common sense.\n\n",
         specific={
@@ -1629,7 +1629,7 @@ def mmlu_helm(line, task_name: str = None):
     return Doc(
         task_name=task_name,
         query=query,
-        choices=[" A", " B", " C", " D"] if not is_few_shots else ["A", "B", "C", "D"], # specific to HELM evals
+        choices=[" A", " B", " C", " D"] if not is_few_shots else ["A", "B", "C", "D"],  # specific to HELM evals
         gold_index=gold_ix,
         instruction=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}.\n\n",
     )
diff --git a/src/lighteval/tasks/prompt_manager.py b/src/lighteval/tasks/prompt_manager.py
index ff38f922..d29f8360 100644
--- a/src/lighteval/tasks/prompt_manager.py
+++ b/src/lighteval/tasks/prompt_manager.py
@@ -253,9 +253,7 @@ def get_examples(
 class FewShotSelectionMethod:
     sorting: str  # sorting method for the overall few shot pool (balanced, random, sequential)
     with_sampling: bool  # samples item randomly from the few shot pool
-    fewshotpool_unique: (
-        bool
-    )  # set to true if you are CERTAIN there is no intersection between the few shot pool and your evaluation set
+    fewshotpool_unique: bool  # set to true if you are CERTAIN there is no intersection between the few shot pool and your evaluation set
 
 
 class FewShotSelection(Enum):
diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py
index 041b5506..a82521ed 100644
--- a/src/lighteval/tasks/requests.py
+++ b/src/lighteval/tasks/requests.py
@@ -200,7 +200,7 @@ def get_golds(self):
         for gold_ix in gold_indices:
             golds.extend(as_list(self.choices[gold_ix]))
         return golds
-    
+
     def get_target_for_fewshot_sorting(self) -> str:
         return self.target_for_fewshot_sorting or as_list(self.get_golds())[0]