From 61e11ef4af9759d3a4af3c7a52f18294c495926a Mon Sep 17 00:00:00 2001 From: Sadra Barikbin Date: Tue, 23 Jul 2024 16:20:42 +0330 Subject: [PATCH 1/4] Working on fewshot --- src/lighteval/tasks/default_prompts.py | 8 ++------ src/lighteval/tasks/prompt_manager.py | 8 +++----- src/lighteval/tasks/requests.py | 15 ++++++--------- 3 files changed, 11 insertions(+), 20 deletions(-) diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py index 3a9b97f0..136acb77 100644 --- a/src/lighteval/tasks/default_prompts.py +++ b/src/lighteval/tasks/default_prompts.py @@ -176,7 +176,6 @@ def bbh_harness(line, task_name: str = None): query=query, choices=choices, gold_index=correct_index, - target_for_fewshot_sorting=choices, instruction=line.get("task_prefix", None), ) @@ -196,7 +195,6 @@ def bbh_lighteval(line, task_name: str = None): query=query, choices=LETTER_INDICES[: len(line["choices"])], gold_index=line["target_idx"], - target_for_fewshot_sorting=LETTER_INDICES[: len(line["choices"])], instruction=line.get("task_prefix", None), ) @@ -205,9 +203,8 @@ def bbh(line, instruction, choices, task_name: str = None): return Doc( task_name=task_name, query=f"{instruction}Q: {line['input']}\nA:", - choices=choices, + choices=[(' ' if line["__few_shots"] else '') + c for c in choices], gold_index=choices.index(line["target"]), - target_for_fewshot_sorting=[f" {c}" for c in choices], instruction=instruction, ) @@ -793,10 +790,9 @@ def hellaswag_helm(line, task_name: str = None): return Doc( task_name=task_name, query=query, - choices=[" " + i for i in LETTER_INDICES[: len(line["endings"])]], + choices=[" " + i for i in LETTER_INDICES[: len(line["endings"])]] + ([""] if line["__fewshot"] else []), gold_index=gold_ix, # -1 for test, instruction="The following are multiple choice questions (with answers) about common sense.\n\n", - target_for_fewshot_sorting=line["endings"][gold_ix] if gold_ix > -1 else "", specific={ "label_to_choices": {f" {key}": choice for key, choice in zip(LETTER_INDICES, line["endings"])}, }, diff --git a/src/lighteval/tasks/prompt_manager.py b/src/lighteval/tasks/prompt_manager.py index ad4c8fc8..ff38f922 100644 --- a/src/lighteval/tasks/prompt_manager.py +++ b/src/lighteval/tasks/prompt_manager.py @@ -65,20 +65,18 @@ def doc_to_text(doc: Doc, return_instructions: bool = False) -> Union[str, Tuple ) @staticmethod - def doc_to_target(formatted_doc: Doc, few_shot: bool = False) -> str: + def doc_to_target(formatted_doc: Doc) -> str: """ Returns the target of the given document. Args: formatted_doc (Doc): Formatted document. - few_shot (bool, optional): Whether the document is used for few - shot examples. Defaults to False. Returns: str: Target of the document, which is the correct answer for a document. """ # likely we mostly need one example not all - return as_list(formatted_doc.get_golds(few_shot=few_shot))[0] + return as_list(formatted_doc.get_golds())[0] def add_context_to_doc( self, @@ -363,7 +361,7 @@ def _init_fewshot_sampling_balanced( # Sort by counts of labels label_to_instances = defaultdict(list) for instance in fewshotpool: - target = PromptManager.doc_to_target(instance, few_shot=True) + target = instance.get_target_for_fewshot_sorting() label_to_instances[target].append(instance) counts_to_labels = defaultdict(list) diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py index dd8b0d6d..041b5506 100644 --- a/src/lighteval/tasks/requests.py +++ b/src/lighteval/tasks/requests.py @@ -177,7 +177,7 @@ class Doc: # For few-shot instruction: Optional[str] = "" - target_for_fewshot_sorting: Optional[str] = None # will probably have to be removed in the future + target_for_fewshot_sorting: Optional[str] = None # Filled when parsing and adding the few-shot context ctx: Optional[str] = "" @@ -193,19 +193,16 @@ def __post_init__(self): if self.instruction is None: self.instruction = "" - def get_golds(self, few_shot: bool = False): + def get_golds(self): """Return gold targets extracted from the target dict""" gold_indices = as_list(self.gold_index) - if few_shot and self.target_for_fewshot_sorting is not None: - choices = self.target_for_fewshot_sorting - if isinstance(choices, str): # correct choice is already selected - return choices - else: - choices = self.choices golds = [] for gold_ix in gold_indices: - golds.extend(as_list(choices[gold_ix])) + golds.extend(as_list(self.choices[gold_ix])) return golds + + def get_target_for_fewshot_sorting(self) -> str: + return self.target_for_fewshot_sorting or as_list(self.get_golds())[0] def __repr__(self): doc_dict = asdict(self) From f7ed5ffc9adeb461d7f6738f7aef2f9f0e395a37 Mon Sep 17 00:00:00 2001 From: Sadra Barikbin Date: Sat, 27 Jul 2024 08:54:08 +0330 Subject: [PATCH 2/4] Adapt prompts to removing target_for_fewshot_sorting --- src/lighteval/tasks/default_prompts.py | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py index 136acb77..6db85f9e 100644 --- a/src/lighteval/tasks/default_prompts.py +++ b/src/lighteval/tasks/default_prompts.py @@ -1348,7 +1348,6 @@ def mmlu(line, topic, task_name: str = None): choices=[" A", " B", " C", " D"] if is_few_shots else ["A", "B", "C", "D"], gold_index=gold_ix, instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n", - target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix], ) @@ -1369,7 +1368,6 @@ def custom_mmlu_thom(line, task_name: str = None): choices=[" A", " B", " C", " D"] if is_few_shots else ["A", "B", "C", "D"], gold_index=gold_ix, instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n", - target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix], ) @@ -1609,7 +1607,6 @@ def mmlu_harness(line, task_name: str = None): query += "Answer:" gold_ix = LETTER_INDICES.index(line["answer"]) if isinstance(line["answer"], str) else line["answer"] - "__few_shots" in line and line["__few_shots"] is True # We are adding few shots return Doc( task_name=task_name, @@ -1617,7 +1614,6 @@ def mmlu_harness(line, task_name: str = None): choices=[" A", " B", " C", " D"], gold_index=gold_ix, instruction=f"The following are multiple choice questions (with answers) about {topic.replace('_', ' ')}.\n\n", - target_for_fewshot_sorting=[" A", " B", " C", " D"][gold_ix], ) @@ -1628,14 +1624,14 @@ def mmlu_helm(line, task_name: str = None): query += "\nAnswer:" gold_ix = LETTER_INDICES.index(line["answer"]) if isinstance(line["answer"], str) else line["answer"] + is_few_shots = line.get("__few_shots", False) # We are adding few shots return Doc( task_name=task_name, query=query, - choices=[" A", " B", " C", " D"], + choices=[" A", " B", " C", " D"] if not is_few_shots else ["A", "B", "C", "D"], # specific to HELM evals gold_index=gold_ix, instruction=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}.\n\n", - target_for_fewshot_sorting=line["choices"][gold_ix], # specific to HELM evals ) @@ -1794,6 +1790,7 @@ def openbookqa_helm(line, task_name: str = None): query += "Answer: " gold_ix = ["A", "B", "C", "D", "E"].index(line["answerKey"].strip()) + # I don't get this. return Doc( task_name=task_name, query=query, @@ -1821,7 +1818,7 @@ def piqa_helm(line, task_name: str = None): query += "Answer: " gold_ix = int(line["label"]) - + # Also this. return Doc( task_name=task_name, query=query, @@ -1861,7 +1858,7 @@ def pubmed_qa_helm(line, task_name: str = None): ) query += f"\n\nQuestion: {line['question']}\nAnswer: " gold_ix = ["yes", "no", "maybe"].index(line["final_decision"]) - + # And this return Doc( task_name=task_name, query=query, @@ -2247,7 +2244,7 @@ def truthful_qa_helm(line, task_name: str = None): query = f"Question: {line['question']}\n" query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, line["choices"])]) query += "Answer:" - + # And this. return Doc( task_name=task_name, query=query, From 6146844b7137e6d4c126acf739c75e0034a57d3f Mon Sep 17 00:00:00 2001 From: Sadra Barikbin Date: Sat, 27 Jul 2024 09:30:46 +0330 Subject: [PATCH 3/4] Fix a bug related to target_for_fewshot_sorting --- src/lighteval/tasks/default_prompts.py | 8 -------- src/lighteval/tasks/lighteval_task.py | 6 ++---- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py index 6db85f9e..902de3d1 100644 --- a/src/lighteval/tasks/default_prompts.py +++ b/src/lighteval/tasks/default_prompts.py @@ -1790,14 +1790,12 @@ def openbookqa_helm(line, task_name: str = None): query += "Answer: " gold_ix = ["A", "B", "C", "D", "E"].index(line["answerKey"].strip()) - # I don't get this. return Doc( task_name=task_name, query=query, choices=["A", "B", "C", "D", "E"], gold_index=gold_ix, instruction="The following are multiple choice questions (with answers) about common sense.\n", - target_for_fewshot_sorting=line["choices"]["text"][gold_ix], # specific to HELM evals ) @@ -1818,14 +1816,12 @@ def piqa_helm(line, task_name: str = None): query += "Answer: " gold_ix = int(line["label"]) - # Also this. return Doc( task_name=task_name, query=query, choices=["A", "B"], gold_index=gold_ix, instruction="The following are multiple choice questions (with answers) about common sense.\n", - target_for_fewshot_sorting=[line["sol1"], line["sol2"]][gold_ix], ) @@ -1858,13 +1854,11 @@ def pubmed_qa_helm(line, task_name: str = None): ) query += f"\n\nQuestion: {line['question']}\nAnswer: " gold_ix = ["yes", "no", "maybe"].index(line["final_decision"]) - # And this return Doc( task_name=task_name, query=query, choices=["A", "B", "C"], gold_index=gold_ix, - target_for_fewshot_sorting=["yes", "no", "maybe"][gold_ix], ) @@ -2244,13 +2238,11 @@ def truthful_qa_helm(line, task_name: str = None): query = f"Question: {line['question']}\n" query += "".join([f"{key}. {choice}\n" for key, choice in zip(LETTER_INDICES, line["choices"])]) query += "Answer:" - # And this. return Doc( task_name=task_name, query=query, choices=LETTER_INDICES[: len(line["choices"])], gold_index=line["gold_index"], - target_for_fewshot_sorting=line["choices"][line["gold_index"]], ) diff --git a/src/lighteval/tasks/lighteval_task.py b/src/lighteval/tasks/lighteval_task.py index 00b4763b..e6411607 100644 --- a/src/lighteval/tasks/lighteval_task.py +++ b/src/lighteval/tasks/lighteval_task.py @@ -340,20 +340,18 @@ def eval_docs(self) -> list[Doc]: self._docs = self.remove_duplicate_docs(self._docs) return self._docs - def doc_to_target(self, formatted_doc: Doc, few_shot: bool = False) -> str: + def doc_to_target(self, formatted_doc: Doc) -> str: """ Returns the target of the given document. Args: formatted_doc (Doc): Formatted document. - few_shot (bool, optional): Whether the document is used for few - shot examples. Defaults to False. Returns: str: Target of the document, which is the correct answer for a document. """ # likely we mostly need one example not all - return as_list(formatted_doc.get_golds(few_shot=few_shot))[0] + return as_list(formatted_doc.get_golds())[0] def construct_requests( self, formatted_doc: Doc, context: str, document_id_seed: str, current_task_name: str From dc8134369ef45c0946e17da5d9b208ff1788c82f Mon Sep 17 00:00:00 2001 From: Sadra Barikbin Date: Sat, 17 Aug 2024 20:15:46 +0330 Subject: [PATCH 4/4] Fix a tiny bug and apply ruff --- src/lighteval/tasks/default_prompts.py | 6 +++--- src/lighteval/tasks/prompt_manager.py | 4 +--- src/lighteval/tasks/requests.py | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/lighteval/tasks/default_prompts.py b/src/lighteval/tasks/default_prompts.py index 902de3d1..37ed922e 100644 --- a/src/lighteval/tasks/default_prompts.py +++ b/src/lighteval/tasks/default_prompts.py @@ -203,7 +203,7 @@ def bbh(line, instruction, choices, task_name: str = None): return Doc( task_name=task_name, query=f"{instruction}Q: {line['input']}\nA:", - choices=[(' ' if line["__few_shots"] else '') + c for c in choices], + choices=[(" " if line["__few_shots"] else "") + c for c in choices], gold_index=choices.index(line["target"]), instruction=instruction, ) @@ -790,7 +790,7 @@ def hellaswag_helm(line, task_name: str = None): return Doc( task_name=task_name, query=query, - choices=[" " + i for i in LETTER_INDICES[: len(line["endings"])]] + ([""] if line["__fewshot"] else []), + choices=[" " + i for i in LETTER_INDICES[: len(line["endings"])]] + ([""] if line["__few_shot"] else []), gold_index=gold_ix, # -1 for test, instruction="The following are multiple choice questions (with answers) about common sense.\n\n", specific={ @@ -1629,7 +1629,7 @@ def mmlu_helm(line, task_name: str = None): return Doc( task_name=task_name, query=query, - choices=[" A", " B", " C", " D"] if not is_few_shots else ["A", "B", "C", "D"], # specific to HELM evals + choices=[" A", " B", " C", " D"] if not is_few_shots else ["A", "B", "C", "D"], # specific to HELM evals gold_index=gold_ix, instruction=f"The following are multiple choice questions (with answers) about {subject.replace('_', ' ')}.\n\n", ) diff --git a/src/lighteval/tasks/prompt_manager.py b/src/lighteval/tasks/prompt_manager.py index ff38f922..d29f8360 100644 --- a/src/lighteval/tasks/prompt_manager.py +++ b/src/lighteval/tasks/prompt_manager.py @@ -253,9 +253,7 @@ def get_examples( class FewShotSelectionMethod: sorting: str # sorting method for the overall few shot pool (balanced, random, sequential) with_sampling: bool # samples item randomly from the few shot pool - fewshotpool_unique: ( - bool - ) # set to true if you are CERTAIN there is no intersection between the few shot pool and your evaluation set + fewshotpool_unique: bool # set to true if you are CERTAIN there is no intersection between the few shot pool and your evaluation set class FewShotSelection(Enum): diff --git a/src/lighteval/tasks/requests.py b/src/lighteval/tasks/requests.py index 041b5506..a82521ed 100644 --- a/src/lighteval/tasks/requests.py +++ b/src/lighteval/tasks/requests.py @@ -200,7 +200,7 @@ def get_golds(self): for gold_ix in gold_indices: golds.extend(as_list(self.choices[gold_ix])) return golds - + def get_target_for_fewshot_sorting(self) -> str: return self.target_for_fewshot_sorting or as_list(self.get_golds())[0]