From 2df94ff32721c68f8ea0f454b45fb9cdb184de6d Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Tue, 22 Oct 2024 15:24:59 +0200 Subject: [PATCH 1/2] fix math parser --- lm_eval/tasks/leaderboard/math/utils.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/lm_eval/tasks/leaderboard/math/utils.py b/lm_eval/tasks/leaderboard/math/utils.py index e3ebcf991b..8c6e1878df 100644 --- a/lm_eval/tasks/leaderboard/math/utils.py +++ b/lm_eval/tasks/leaderboard/math/utils.py @@ -70,7 +70,10 @@ def process_results(doc: dict, results: List[str]) -> Dict[str, int]: unnormalized_answer = get_unnormalized_answer(candidates) answer = normalize_final_answer(unnormalized_answer) - if is_equiv(answer, doc["answer"]): + if answer == "[invalidanswer]": + return {"exact_match": 0} + + if answer.strip() == doc["answer"].strip() or is_equiv(answer, doc["answer"]): retval = 1 else: retval = 0 @@ -112,15 +115,19 @@ def last_boxed_only_string(string: str) -> Optional[str]: def remove_boxed(s: str) -> str: - if "\\boxed " in s: - left = "\\boxed " - assert s[: len(left)] == left - return s[len(left) :] + try: + if "\\boxed " in s: + left = "\\boxed " + assert s[: len(left)] == left + return s[len(left) :] - left = "\\boxed{" + left = "\\boxed{" - assert s[: len(left)] == left - assert s[-1] == "}" + assert s[: len(left)] == left + assert s[-1] == "}" + except AssertionError: + print(s) + return s return s[len(left) : -1] @@ -146,7 +153,7 @@ def is_equiv(x1: str, x2: str) -> bool: x1 and x2 are normalized latex string """ try: - with timeout(seconds=5): + with timeout(seconds=1): try: parsed_x1 = parse_latex(x1) parsed_x2 = parse_latex(x2) From ce116aed154c0f7367589a3bc39895dd60053ab9 Mon Sep 17 00:00:00 2001 From: Nathan Habib Date: Tue, 22 Oct 2024 15:47:53 +0200 Subject: [PATCH 2/2] fix math parser --- lm_eval/tasks/leaderboard/math/utils.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/lm_eval/tasks/leaderboard/math/utils.py b/lm_eval/tasks/leaderboard/math/utils.py index 8c6e1878df..607be3016c 100644 --- a/lm_eval/tasks/leaderboard/math/utils.py +++ b/lm_eval/tasks/leaderboard/math/utils.py @@ -17,6 +17,9 @@ ) +INVALID_ANSWER = "[invalidanswer]" + + # taken from # https://github.com/wellecks/lm-evaluation-harness/blob/master/lm_eval/tasks/minerva_math.py def doc_to_text(doc: dict) -> str: @@ -70,7 +73,7 @@ def process_results(doc: dict, results: List[str]) -> Dict[str, int]: unnormalized_answer = get_unnormalized_answer(candidates) answer = normalize_final_answer(unnormalized_answer) - if answer == "[invalidanswer]": + if answer == INVALID_ANSWER: return {"exact_match": 0} if answer.strip() == doc["answer"].strip() or is_equiv(answer, doc["answer"]): @@ -125,11 +128,9 @@ def remove_boxed(s: str) -> str: assert s[: len(left)] == left assert s[-1] == "}" + return s[len(left) : -1] except AssertionError: - print(s) - return s - - return s[len(left) : -1] + return INVALID_ANSWER class timeout: @@ -192,7 +193,6 @@ def is_equiv(x1: str, x2: str) -> bool: def get_unnormalized_answer(text: str) -> str: - INVALID_ANSWER = "[invalidanswer]" end_seq = "I hope it is correct." text += end_seq match = re.search(