Merge remote-tracking branch 'origin/dev' into fix/useless_file

ku-nlp · Nov 21, 2023 · 8b4c2a9 · 8b4c2a9
2 parents 1ea2bf3 + de33589
commit 8b4c2a9
Show file tree

Hide file tree

Showing 106 changed files with 576 additions and 14,644 deletions.
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
@@ -0,0 +1,18 @@
+name: Lint
+
+on: [push, pull_request]
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+      - name: Set up Python 3.8
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+      - name: Run linters
+        run: |
+          pipx install pre-commit
+          pre-commit run --files llm_judge/*.py
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
diff --git a/.gitignore b/.gitignore
@@ -1,10 +1,180 @@
-# Python
-__pycache__
-*.pyc
-*.egg-info
-dist
+# Created by https://www.toptal.com/developers/gitignore/api/python
+# Edit at https://www.toptal.com/developers/gitignore?templates=python
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
 .venv
 .env
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+### Python Patch ###
+# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+poetry.toml
+
+# ruff
+.ruff_cache/
+
+# LSP config files
+pyrightconfig.json
+
+# End of https://www.toptal.com/developers/gitignore/api/python
 
 # folder
 data

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,14 @@
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.5.0
+    hooks:
+      - id: end-of-file-fixer
+      - id: trailing-whitespace
+      - id: check-yaml
+      - id: check-toml
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.1.4
+    hooks:
+      - id: ruff
+        args: [--fix, --exit-non-zero-on-fix]
+      - id: ruff-format
diff --git a/..._bench/model_answer/gpt-3.5-davinci.jsonl → ..._bench/model_answer/gpt-3.5-davinci.jsonl b/..._bench/model_answer/gpt-3.5-davinci.jsonl → ..._bench/model_answer/gpt-3.5-davinci.jsonl
diff --git a/...odel_answer/japanese-alpaca-lora-7b.jsonl → ...odel_answer/japanese-alpaca-lora-7b.jsonl b/...odel_answer/japanese-alpaca-lora-7b.jsonl → ...odel_answer/japanese-alpaca-lora-7b.jsonl
diff --git a/...ta/jp_bench/model_answer/llama2_13b.jsonl → data/jp_bench/model_answer/llama2_13b.jsonl b/...ta/jp_bench/model_answer/llama2_13b.jsonl → data/jp_bench/model_answer/llama2_13b.jsonl
diff --git a/...del_answer/llama2_13b_mt_data_52000.jsonl → ...del_answer/llama2_13b_mt_data_52000.jsonl b/...del_answer/llama2_13b_mt_data_52000.jsonl → ...del_answer/llama2_13b_mt_data_52000.jsonl
diff --git a/...ma2_13b_self-instruction_data_52000.jsonl → ...ma2_13b_self-instruction_data_52000.jsonl b/...ma2_13b_self-instruction_data_52000.jsonl → ...ma2_13b_self-instruction_data_52000.jsonl
diff --git a/...ch/model_answer/llama2_mt_data_1000.jsonl → ...ch/model_answer/llama2_mt_data_1000.jsonl b/...ch/model_answer/llama2_mt_data_1000.jsonl → ...ch/model_answer/llama2_mt_data_1000.jsonl
diff --git a/...h/model_answer/llama2_mt_data_52000.jsonl → ...h/model_answer/llama2_mt_data_52000.jsonl b/...h/model_answer/llama2_mt_data_52000.jsonl → ...h/model_answer/llama2_mt_data_52000.jsonl
diff --git a/...r/llama2_self-instruction_data_1000.jsonl → ...r/llama2_self-instruction_data_1000.jsonl b/...r/llama2_self-instruction_data_1000.jsonl → ...r/llama2_self-instruction_data_1000.jsonl
diff --git a/.../llama2_self-instruction_data_52000.jsonl → .../llama2_self-instruction_data_52000.jsonl b/.../llama2_self-instruction_data_52000.jsonl → .../llama2_self-instruction_data_52000.jsonl
diff --git a/...data/jp_bench/model_answer/llama_7b.jsonl → data/jp_bench/model_answer/llama_7b.jsonl b/...data/jp_bench/model_answer/llama_7b.jsonl → data/jp_bench/model_answer/llama_7b.jsonl
diff --git a/...nch/model_answer/llama_mt_data_5000.jsonl → ...nch/model_answer/llama_mt_data_5000.jsonl b/...nch/model_answer/llama_mt_data_5000.jsonl → ...nch/model_answer/llama_mt_data_5000.jsonl
diff --git a/...ch/model_answer/llama_mt_data_52000.jsonl → ...ch/model_answer/llama_mt_data_52000.jsonl b/...ch/model_answer/llama_mt_data_52000.jsonl → ...ch/model_answer/llama_mt_data_52000.jsonl
diff --git a/...er/llama_self-instruction_data_5000.jsonl → ...er/llama_self-instruction_data_5000.jsonl b/...er/llama_self-instruction_data_5000.jsonl → ...er/llama_self-instruction_data_5000.jsonl
diff --git a/...r/llama_self-instruction_data_52000.jsonl → ...r/llama_self-instruction_data_52000.jsonl b/...r/llama_self-instruction_data_52000.jsonl → ...r/llama_self-instruction_data_52000.jsonl
diff --git a/...ch/model_answer/llm-jp-1.3b-refined.jsonl → ...ch/model_answer/llm-jp-1.3b-refined.jsonl b/...ch/model_answer/llm-jp-1.3b-refined.jsonl → ...ch/model_answer/llm-jp-1.3b-refined.jsonl
diff --git a/...a/jp_bench/model_answer/llm-jp-1.3b.jsonl → data/jp_bench/model_answer/llm-jp-1.3b.jsonl b/...a/jp_bench/model_answer/llm-jp-1.3b.jsonl → data/jp_bench/model_answer/llm-jp-1.3b.jsonl
diff --git a/...lm-jp-13b-lora-sft-dolly-oasst-run1.jsonl → ...lm-jp-13b-lora-sft-dolly-oasst-run1.jsonl b/...lm-jp-13b-lora-sft-dolly-oasst-run1.jsonl → ...lm-jp-13b-lora-sft-dolly-oasst-run1.jsonl
diff --git a/...lm-jp-13b-lora-sft-dolly-oasst-run2.jsonl → ...lm-jp-13b-lora-sft-dolly-oasst-run2.jsonl b/...lm-jp-13b-lora-sft-dolly-oasst-run2.jsonl → ...lm-jp-13b-lora-sft-dolly-oasst-run2.jsonl
diff --git a/...lm-jp-13b-lora-sft-dolly-oasst-run3.jsonl → ...lm-jp-13b-lora-sft-dolly-oasst-run3.jsonl b/...lm-jp-13b-lora-sft-dolly-oasst-run3.jsonl → ...lm-jp-13b-lora-sft-dolly-oasst-run3.jsonl
diff --git a/...-jp-13b-lora-sft-gpt4-self-instruct.jsonl → ...-jp-13b-lora-sft-gpt4-self-instruct.jsonl b/...-jp-13b-lora-sft-gpt4-self-instruct.jsonl → ...-jp-13b-lora-sft-gpt4-self-instruct.jsonl
diff --git a/...jp-13b-lora-sft-js-dolly-oasst-run1.jsonl → ...jp-13b-lora-sft-js-dolly-oasst-run1.jsonl b/...jp-13b-lora-sft-js-dolly-oasst-run1.jsonl → ...jp-13b-lora-sft-js-dolly-oasst-run1.jsonl
diff --git a/...jp-13b-lora-sft-js-dolly-oasst-run2.jsonl → ...jp-13b-lora-sft-js-dolly-oasst-run2.jsonl b/...jp-13b-lora-sft-js-dolly-oasst-run2.jsonl → ...jp-13b-lora-sft-js-dolly-oasst-run2.jsonl
diff --git a/...jp-13b-lora-sft-js-dolly-oasst-run3.jsonl → ...jp-13b-lora-sft-js-dolly-oasst-run3.jsonl b/...jp-13b-lora-sft-js-dolly-oasst-run3.jsonl → ...jp-13b-lora-sft-js-dolly-oasst-run3.jsonl
diff --git a/..._answer/llm-jp-13b-lora-sft-js-run1.jsonl → ..._answer/llm-jp-13b-lora-sft-js-run1.jsonl b/..._answer/llm-jp-13b-lora-sft-js-run1.jsonl → ..._answer/llm-jp-13b-lora-sft-js-run1.jsonl
diff --git a/..._answer/llm-jp-13b-lora-sft-js-run2.jsonl → ..._answer/llm-jp-13b-lora-sft-js-run2.jsonl b/..._answer/llm-jp-13b-lora-sft-js-run2.jsonl → ..._answer/llm-jp-13b-lora-sft-js-run2.jsonl
diff --git a/..._answer/llm-jp-13b-lora-sft-js-run3.jsonl → ..._answer/llm-jp-13b-lora-sft-js-run3.jsonl b/..._answer/llm-jp-13b-lora-sft-js-run3.jsonl → ..._answer/llm-jp-13b-lora-sft-js-run3.jsonl
diff --git a/...nch/model_answer/llm-jp-13b-refined.jsonl → ...nch/model_answer/llm-jp-13b-refined.jsonl b/...nch/model_answer/llm-jp-13b-refined.jsonl → ...nch/model_answer/llm-jp-13b-refined.jsonl
diff --git a/...wer/llm-jp-13b-sft-dolly-oasst-run2.jsonl → ...wer/llm-jp-13b-sft-dolly-oasst-run2.jsonl b/...wer/llm-jp-13b-sft-dolly-oasst-run2.jsonl → ...wer/llm-jp-13b-sft-dolly-oasst-run2.jsonl
diff --git a/...wer/llm-jp-13b-sft-dolly-oasst-run3.jsonl → ...wer/llm-jp-13b-sft-dolly-oasst-run3.jsonl b/...wer/llm-jp-13b-sft-dolly-oasst-run3.jsonl → ...wer/llm-jp-13b-sft-dolly-oasst-run3.jsonl
diff --git a/...l_answer/llm-jp-13b-sft-dolly-oasst.jsonl → ...l_answer/llm-jp-13b-sft-dolly-oasst.jsonl b/...l_answer/llm-jp-13b-sft-dolly-oasst.jsonl → ...l_answer/llm-jp-13b-sft-dolly-oasst.jsonl
diff --git a/.../llm-jp-13b-sft-js-dolly-oasst-run2.jsonl → .../llm-jp-13b-sft-js-dolly-oasst-run2.jsonl b/.../llm-jp-13b-sft-js-dolly-oasst-run2.jsonl → .../llm-jp-13b-sft-js-dolly-oasst-run2.jsonl
diff --git a/.../llm-jp-13b-sft-js-dolly-oasst-run3.jsonl → .../llm-jp-13b-sft-js-dolly-oasst-run3.jsonl b/.../llm-jp-13b-sft-js-dolly-oasst-run3.jsonl → .../llm-jp-13b-sft-js-dolly-oasst-run3.jsonl
diff --git a/...nswer/llm-jp-13b-sft-js-dolly-oasst.jsonl → ...nswer/llm-jp-13b-sft-js-dolly-oasst.jsonl b/...nswer/llm-jp-13b-sft-js-dolly-oasst.jsonl → ...nswer/llm-jp-13b-sft-js-dolly-oasst.jsonl
diff --git a/...model_answer/llm-jp-13b-sft-js-run2.jsonl → ...model_answer/llm-jp-13b-sft-js-run2.jsonl b/...model_answer/llm-jp-13b-sft-js-run2.jsonl → ...model_answer/llm-jp-13b-sft-js-run2.jsonl
diff --git a/...model_answer/llm-jp-13b-sft-js-run3.jsonl → ...model_answer/llm-jp-13b-sft-js-run3.jsonl b/...model_answer/llm-jp-13b-sft-js-run3.jsonl → ...model_answer/llm-jp-13b-sft-js-run3.jsonl
diff --git a/...ench/model_answer/llm-jp-13b-sft-js.jsonl → ...ench/model_answer/llm-jp-13b-sft-js.jsonl b/...ench/model_answer/llm-jp-13b-sft-js.jsonl → ...ench/model_answer/llm-jp-13b-sft-js.jsonl
diff --git a/...ta/jp_bench/model_answer/llm-jp-13b.jsonl → data/jp_bench/model_answer/llm-jp-13b.jsonl b/...ta/jp_bench/model_answer/llm-jp-13b.jsonl → data/jp_bench/model_answer/llm-jp-13b.jsonl
diff --git a/...r/llm-jp_13b_self-instruction_52000.jsonl → ...r/llm-jp_13b_self-instruction_52000.jsonl b/...r/llm-jp_13b_self-instruction_52000.jsonl → ...r/llm-jp_13b_self-instruction_52000.jsonl
diff --git a/.../jp_bench/model_answer/open-calm_7b.jsonl → .../jp_bench/model_answer/open-calm_7b.jsonl b/.../jp_bench/model_answer/open-calm_7b.jsonl → .../jp_bench/model_answer/open-calm_7b.jsonl
diff --git a/.../open-calm_mt_data_52000_jptemplate.jsonl → .../open-calm_mt_data_52000_jptemplate.jsonl b/.../open-calm_mt_data_52000_jptemplate.jsonl → .../open-calm_mt_data_52000_jptemplate.jsonl
diff --git a/...f-instruction_data_52000_jptemplate.jsonl → ...f-instruction_data_52000_jptemplate.jsonl b/...f-instruction_data_52000_jptemplate.jsonl → ...f-instruction_data_52000_jptemplate.jsonl
diff --git a/...p_bench/model_answer/rinna-3.6b-ppo.jsonl → ...p_bench/model_answer/rinna-3.6b-ppo.jsonl b/...p_bench/model_answer/rinna-3.6b-ppo.jsonl → ...p_bench/model_answer/rinna-3.6b-ppo.jsonl
diff --git a/...ench/model_answer/rinna-3.6b-sft-v2.jsonl → ...ench/model_answer/rinna-3.6b-sft-v2.jsonl b/...ench/model_answer/rinna-3.6b-sft-v2.jsonl → ...ench/model_answer/rinna-3.6b-sft-v2.jsonl
diff --git a/...ta/jp_bench/model_answer/rinna-3.6b.jsonl → data/jp_bench/model_answer/rinna-3.6b.jsonl b/...ta/jp_bench/model_answer/rinna-3.6b.jsonl → data/jp_bench/model_answer/rinna-3.6b.jsonl
diff --git a/.../jp_bench/model_judgment/gpt-4_pair.jsonl → .../jp_bench/model_judgment/gpt-4_pair.jsonl b/.../jp_bench/model_judgment/gpt-4_pair.jsonl → .../jp_bench/model_judgment/gpt-4_pair.jsonl
diff --git a/...bench/model_judgment/ori-gpt-4_pair.jsonl → ...bench/model_judgment/ori-gpt-4_pair.jsonl b/...bench/model_judgment/ori-gpt-4_pair.jsonl → ...bench/model_judgment/ori-gpt-4_pair.jsonl
diff --git a/llm_judge/data/jp_bench/question.jsonl → data/jp_bench/question.jsonl b/llm_judge/data/jp_bench/question.jsonl → data/jp_bench/question.jsonl
diff --git a/...ata/jp_bench/reference_answer/gpt-4.jsonl → data/jp_bench/reference_answer/gpt-4.jsonl b/...ata/jp_bench/reference_answer/gpt-4.jsonl → data/jp_bench/reference_answer/gpt-4.jsonl
diff --git a/llm_judge/data/judge_prompts_jp2.jsonl → data/judge_prompts_jp.jsonl b/llm_judge/data/judge_prompts_jp2.jsonl → data/judge_prompts_jp.jsonl
diff --git a/llm_judge/common.py b/llm_judge/common.py
@@ -10,7 +10,6 @@
 import re
 import time
 from typing import Optional
-import sys
 import openai
 from dotenv import load_dotenv
 
@@ -29,15 +28,15 @@
 
 # Categories that need reference answers
 NEED_REF_CATS = ["math", "reasoning", "coding"]
-#NEED_REF_CATS = []
+# NEED_REF_CATS = []
 
 # Extract scores from judgments
 two_score_pattern = re.compile("\[\[(\d+\.?\d*),\s?(\d+\.?\d*)\]\]")
 two_score_pattern_backup = re.compile("\[(\d+\.?\d*),\s?(\d+\.?\d*)\]")
 one_score_pattern = re.compile("\[\[(\d+\.?\d*)\]\]")
-#one_score_pattern_backup = re.compile("\[(\d+\.?\d*)\]")
+# one_score_pattern_backup = re.compile("\[(\d+\.?\d*)\]")
 one_score_pattern_another_format = re.compile("\[\[rating:(\d+)\]\]")
-one_score_pattern_another_format2 =re.compile("\[\[rating: (\d+)\]\]")
+one_score_pattern_another_format2 = re.compile("\[\[rating: (\d+)\]\]")
 
 # Sampling temperature configs for
 temperature_config = {
@@ -112,8 +111,8 @@ def load_model_answers(answer_dir: str):
     for filename in filenames:
         model_name = os.path.basename(filename)[:-6]
         answer = {}
-        
-        #print(filename)
+
+        # print(filename)
         with open(filename, "r") as fin:
             # distinguished process for lora predictions
             # if "lora" not in filename:
@@ -153,7 +152,7 @@ def run_judge_single(question, answer, judge, ref_answer, multi_turn=False):
     model = judge.model_name
     if ref_answer is not None:
         kwargs["ref_answer_1"] = ref_answer["choices"][0]["turns"][0]
-        #kwargs["ref_answer_2"] = ref_answer["choices"][0]["turns"][1]
+        # kwargs["ref_answer_2"] = ref_answer["choices"][0]["turns"][1]
 
     if multi_turn:
         user_prompt = judge.prompt_template["prompt_template"].format(
@@ -188,9 +187,9 @@ def run_judge_single(question, answer, judge, ref_answer, multi_turn=False):
         # if not match:
         #     match = re.search(one_score_pattern_backup, judgment)
         if not match:
-            match = re.search(one_score_pattern_another_format,judgment)
+            match = re.search(one_score_pattern_another_format, judgment)
         if not match:
-            match = re.search(one_score_pattern_another_format2,judgment)
+            match = re.search(one_score_pattern_another_format2, judgment)
         if match:
             rating = ast.literal_eval(match.groups()[0])
         else:
@@ -241,7 +240,7 @@ def play_a_match_single(match: MatchPair, output_file: str):
     if output_file:
         os.makedirs(os.path.dirname(output_file), exist_ok=True)
         with open(output_file, "a") as fout:
-            fout.write(json.dumps(result,ensure_ascii=False) + "\n")
+            fout.write(json.dumps(result, ensure_ascii=False) + "\n")
 
     return result
 
@@ -251,9 +250,9 @@ def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=F
     model = judge.model_name
     if ref_answer is not None:
         kwargs["ref_answer_1"] = ref_answer["choices"][0]["turns"][0]
-        print("参考回答1:",ref_answer["choices"][0]["turns"][0])
+        print("参考回答1:", ref_answer["choices"][0]["turns"][0])
         kwargs["ref_answer_2"] = ref_answer["choices"][0]["turns"][1]
-        print("参考回答2:",ref_answer["choices"][0]["turns"][1])
+        print("参考回答2:", ref_answer["choices"][0]["turns"][1])
 
     if multi_turn:
         system_prompt = judge.prompt_template["system_prompt"]
@@ -362,7 +361,7 @@ def play_a_match_pair(match: MatchPair, output_file: str):
             "turn": turn,
             "tstamp": time.time(),
         }
-        
+
         print(
             f"question: {question_id}, turn: {turn}, model_1: {model_1}, model_2: {model_2}, "
             f"g1_winner: {g1_winner}, g2_winner: {g2_winner}, "
@@ -410,7 +409,7 @@ def play_a_match_pair(match: MatchPair, output_file: str):
     if output_file:
         os.makedirs(os.path.dirname(output_file), exist_ok=True)
         with open(output_file, "a") as fout:
-            fout.write(json.dumps(result,ensure_ascii=False) + "\n")
+            fout.write(json.dumps(result, ensure_ascii=False) + "\n")
 
     return result
 
@@ -577,7 +576,7 @@ def get_pairwise_judge_explanation(gamekey, judgment_dict):
         return (
             f"**Game 1**. **A**: {model_1}, **B**: {model_2}\n\n"
             f"**Judgment**: {g1_judgment}"
-            + f"\n\n`--------------------------`\n\n"
+            + "\n\n`--------------------------`\n\n"
             + f"**Game 2**. **A**: {model_2}, **B**: {model_1}\n\n"
             f"**Judgment**: {g2_judgment}"
         )
@@ -619,8 +618,8 @@ def check_data(questions, model_answers, ref_answers, models, judges):
         for q in questions:
             if q["category"] not in NEED_REF_CATS:
                 continue
-            #print(q["question_id"])
-            #print(ref_answers[jg.model_name])
+            # print(q["question_id"])
+            # print(ref_answers[jg.model_name])
             assert (
                 int(q["question_id"]) in ref_answers[jg.model_name]
             ), f"Missing reference answer to Question {q['question_id']} for judge {jg.model_name}"