From a6a3d26ed406d23ce6542891750c34628fb415ce Mon Sep 17 00:00:00 2001
From: Sh1gechan
Date: Tue, 25 Jun 2024 02:14:02 +0900
Subject: [PATCH 1/6] OpenAI version upgrade (latest version)

---
 configs/cyberagent--calm2-7b-chat.json                      | 3 +--
 ...p--llm-jp-13b-instruct-full-jaster-dolly-oasst-v1.0.json | 1 -
 ...p--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0.json | 1 -
 configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json  | 1 -
 .../rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json   | 1 -
 configs/tokyotech-llm--Swallow-70b-instruct-hf.json         | 1 -
 pyproject.toml                                              | 6 +++---
 7 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/configs/cyberagent--calm2-7b-chat.json b/configs/cyberagent--calm2-7b-chat.json
index fe5a486..356f184 100644
--- a/configs/cyberagent--calm2-7b-chat.json
+++ b/configs/cyberagent--calm2-7b-chat.json
@@ -6,8 +6,7 @@
   "prompt_template": "USER: {instruction}\nASSISTANT: ",
   "generation_config": {
     "do_sample": true,
-    "max_length": 2048,
-    "temperature": 0.8
+    "max_length": 2048
   },
   "special_token_map": {}
 }
diff --git a/configs/llm-jp--llm-jp-13b-instruct-full-jaster-dolly-oasst-v1.0.json b/configs/llm-jp--llm-jp-13b-instruct-full-jaster-dolly-oasst-v1.0.json
index 7db6c30..86d0642 100644
--- a/configs/llm-jp--llm-jp-13b-instruct-full-jaster-dolly-oasst-v1.0.json
+++ b/configs/llm-jp--llm-jp-13b-instruct-full-jaster-dolly-oasst-v1.0.json
@@ -7,7 +7,6 @@
   "generation_config": {
     "do_sample": true,
     "max_length": 2048,
-    "temperature": 0.7,
     "top_p": 0.95
   },
   "special_token_map": {}
diff --git a/configs/llm-jp--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0.json b/configs/llm-jp--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0.json
index 73fac46..c11c1e6 100644
--- a/configs/llm-jp--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0.json
+++ b/configs/llm-jp--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0.json
@@ -7,7 +7,6 @@
   "generation_config": {
     "do_sample": true,
     "max_length": 2048,
-    "temperature": 0.7,
     "top_p": 0.95
   },
   "special_token_map": {}
diff --git a/configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json b/configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json
index 147e6a7..c83e10c 100644
--- a/configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json
+++ b/configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json
@@ -7,7 +7,6 @@
   "generation_config": {
     "do_sample": true,
     "max_length": 2048,
-    "temperature": 0.7,
     "repetition_penalty": 1.1
   },
   "special_token_map": {
diff --git a/configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json b/configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json
index d69e5ab..f5343a2 100644
--- a/configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json
+++ b/configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json
@@ -7,7 +7,6 @@
   "generation_config": {
     "do_sample": true,
     "max_length": 2048,
-    "temperature": 0.7,
     "repetition_penalty": 1.1
   },
   "special_token_map": {
diff --git a/configs/tokyotech-llm--Swallow-70b-instruct-hf.json b/configs/tokyotech-llm--Swallow-70b-instruct-hf.json
index f6b5efa..2fbe0e3 100644
--- a/configs/tokyotech-llm--Swallow-70b-instruct-hf.json
+++ b/configs/tokyotech-llm--Swallow-70b-instruct-hf.json
@@ -7,7 +7,6 @@
   "generation_config": {
     "do_sample": true,
     "max_length": 2048,
-    "temperature": 0.99,
     "top_p": 0.95
   }
  }
diff --git a/pyproject.toml b/pyproject.toml
index 12f7ebc..2a793de 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,10 +15,10 @@ classifiers = [
 dependencies = [
     "accelerate", "fastapi", "gradio==3.35.2", "httpx", "markdown2[all]", "nh3", "numpy",
     "peft==0.5", "prompt_toolkit>=3.0.0", "pydantic<=2.0", "requests", "rich>=10.0.0", "sentencepiece",
-    "shortuuid", "shortuuid", "tiktoken", "tokenizers>=0.12.1", "torch",
-    "transformers", "uvicorn", "wandb", "openai==0.28.1", "ray", "python-dotenv", "protobuf==3.19",
-    "wandb", "tiktoken"
+    "shortuuid", "tiktoken", "tokenizers>=0.12.1", "torch",
+    "transformers", "uvicorn", "wandb", "openai==1.35.3", "ray", "python-dotenv", "protobuf==3.19"
 ]
+
 [tool.setuptools.packages.find]
 exclude = ["*"]
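[Note] The jump from `openai==0.28.1` to `openai==1.35.3` crosses the v1 rewrite of the openai-python SDK, which is why the later patches in this series have to touch every call site. A minimal sketch of the before/after calling convention (the model name and message below are placeholders, not part of this series):

```python
import os

from openai import OpenAI

# openai<1.0 configured the SDK through module-level globals and returned dicts:
#   openai.api_key = os.getenv("OPENAI_API_KEY")
#   response = openai.ChatCompletion.create(model="gpt-4", messages=messages)
#   content = response["choices"][0]["message"]["content"]
# openai>=1.0 uses an explicit client and typed response objects instead.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
messages = [{"role": "user", "content": "Hello"}]
response = client.chat.completions.create(model="gpt-4", messages=messages)
print(response.choices[0].message.content)  # attribute access, not dict keys
```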
"peft==0.5", "prompt_toolkit>=3.0.0", "pydantic<=2.0", "requests", "rich>=10.0.0", "sentencepiece", - "shortuuid", "shortuuid", "tiktoken", "tokenizers>=0.12.1", "torch", - "transformers", "uvicorn", "wandb", "openai==0.28.1", "ray", "python-dotenv", "protobuf==3.19", - "wandb", "tiktoken" + "shortuuid", "tiktoken", "tokenizers>=0.12.1", "torch", + "transformers", "uvicorn", "wandb", "openai==1.35.3", "ray", "python-dotenv", "protobuf==3.19" ] + [tool.setuptools.packages.find] exclude = ["*"] From b2ab6c3d9dbcd175e8cd543e7ebd87bca7c5cecb Mon Sep 17 00:00:00 2001 From: Sh1gechan Date: Tue, 25 Jun 2024 02:31:01 +0900 Subject: [PATCH 2/6] Add --num_answers_per_question option to gen_model_answer.py and gen_judgment.py --- README.md | 9 +++++-- llm_judge/common.py | 24 ++++++++++++------- llm_judge/gen_judgment.py | 16 ++++++++++++- llm_judge/gen_model_answer.py | 44 ++++++++++++++++++----------------- 4 files changed, 61 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index 854eb92..5d05154 100644 --- a/README.md +++ b/README.md @@ -26,13 +26,16 @@ python llm_judge/gen_model_answer.py --config Arguments & Options: - `` is the path to a configuration file. Examples are in `configs/`. + - `num_answers_per_question` specifies how many to generate (default: all) For example: ```bash -python llm_judge/gen_model_answer.py --config configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json +python llm_judge/gen_model_answer.py --config configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json --num_answers_per_question ``` + + #### Step 2. Generate GPT-4 judgments There are several options to use GPT-4 as a judge, such as pairwise win-rate and single-answer grading. @@ -43,7 +46,8 @@ OPENAI_API_KEY= python llm_judge/gen_judgment.py \ [--baseline-model ] \ [--model-list ] \ [--yes] \ - [--wandb] + [--wandb] \ + [--num_answers_per_question] ``` Arguments & Options: @@ -55,6 +59,7 @@ Arguments & Options: - `--model-list ` is a list of model IDs to be evaluated. If not specified, all models in `data/jp_bench/model_answer` will be evaluated. - `--yes` is a flag to skip the confirmation prompt. - `--wandb` is a flag to enable logging to W&B. You can upload the results later to W&B by running `upload_result.py`, as described in the next section. +- `num_answers_per_question` : Number of answers to evaluate per question **Mode: `pairwise-baseline` (Default)** diff --git a/llm_judge/common.py b/llm_judge/common.py index 789bf12..d783c2d 100644 --- a/llm_judge/common.py +++ b/llm_judge/common.py @@ -9,17 +9,20 @@ from typing import Optional, Union import openai +from openai import AzureOpenAI + +client = AzureOpenAI(api_key=os.getenv("OPENAI_API_KEY"), +api_version=os.getenv("OPENAI_API_VERSION")) import tiktoken from dotenv import load_dotenv logger = logging.getLogger(__name__) load_dotenv() -openai.api_key = os.getenv("OPENAI_API_KEY") -openai.organization = os.getenv("OPENAI_ORGANIZATION") -openai.api_type = os.getenv("OPENAI_API_TYPE") -openai.api_base = os.getenv("OPENAI_API_BASE") -openai.api_version = os.getenv("OPENAI_API_VERSION") +# TODO: The 'openai.organization' option isn't read in the client API. You will need to pass it when you instantiate the client, e.g. 'OpenAI(organization=os.getenv("OPENAI_ORGANIZATION"))' +# openai.organization = os.getenv("OPENAI_ORGANIZATION") +# TODO: The 'openai.api_base' option isn't read in the client API. You will need to pass it when you instantiate the client, e.g. 
From 21de29b6850bbcd9137935b22826244eaf85c9fa Mon Sep 17 00:00:00 2001
From: Sh1gechan
Date: Tue, 25 Jun 2024 02:51:04 +0900
Subject: [PATCH 3/6] Remove configs folder from the repository

---
 configs/README.md                                 | 30 -------------------
 configs/cyberagent--calm2-7b-chat.json            | 12 --------
 ...instruct-full-jaster-dolly-oasst-v1.0.json     | 13 --------
 ...instruct-lora-jaster-dolly-oasst-v1.0.json     | 13 --------
 configs/openai--text-davinci-003.json             | 16 ----------
 ...apanese-gpt-neox-3.6b-instruction-ppo.json     | 15 ---------
 ...nese-gpt-neox-3.6b-instruction-sft-v2.json     | 15 ---------
 ...okyotech-llm--Swallow-70b-instruct-hf.json     | 12 --------
 8 files changed, 126 deletions(-)
 delete mode 100644 configs/README.md
 delete mode 100644 configs/cyberagent--calm2-7b-chat.json
 delete mode 100644 configs/llm-jp--llm-jp-13b-instruct-full-jaster-dolly-oasst-v1.0.json
 delete mode 100644 configs/llm-jp--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0.json
 delete mode 100644 configs/openai--text-davinci-003.json
 delete mode 100644 configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json
 delete mode 100644 configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json
 delete mode 100644 configs/tokyotech-llm--Swallow-70b-instruct-hf.json

diff --git a/configs/README.md b/configs/README.md
deleted file mode 100644
index 889f306..0000000
--- a/configs/README.md
+++ /dev/null
@@ -1,30 +0,0 @@
-# Configuration files
-
-Each configuration file is a JSON file with the following structure:
-
-```json5
-// rinna--japanese-gpt-neox-3.6b-instruction-ppo.json
-{
-  // The ID of the model
-  "model_id": "rinna--japanese-gpt-neox-3.6b-instruction-ppo",
-  // The name of the model
-  "model_name_or_path": "rinna/japanese-gpt-neox-3.6b-instruction-ppo",
-  // The name of the lora model (optional)
-  "lora_model_name_or_path": null,
-  // The name of the tokenizer (optional)
-  "tokenizer_name_or_path": null,
-  // The prompt template
-  "prompt_template": "ユーザー: {instruction}システム: ",
-  // The generation configuration (optional)
-  // NOTE: `temperature` will be set to a default value for each task category if left empty
-  "generation_config": {
-    "do_sample": true,
-    "max_new_tokens": 512,
-    "repetition_penalty": 1.1
-  },
-  // The special token map (optional); this is used to replace special tokens in the output
-  "special_token_map": {
-    "": "\n"
-  }
-}
-```
diff --git a/configs/cyberagent--calm2-7b-chat.json b/configs/cyberagent--calm2-7b-chat.json
deleted file mode 100644
index 356f184..0000000
--- a/configs/cyberagent--calm2-7b-chat.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
-  "model_id": "cyberagent--calm2-7b-chat",
-  "model_name_or_path": "cyberagent/calm2-7b-chat",
-  "lora_model_name_or_path": null,
-  "tokenizer_name_or_path": null,
-  "prompt_template": "USER: {instruction}\nASSISTANT: ",
-  "generation_config": {
-    "do_sample": true,
-    "max_length": 2048
-  },
-  "special_token_map": {}
-}
diff --git a/configs/llm-jp--llm-jp-13b-instruct-full-jaster-dolly-oasst-v1.0.json b/configs/llm-jp--llm-jp-13b-instruct-full-jaster-dolly-oasst-v1.0.json
deleted file mode 100644
index 86d0642..0000000
--- a/configs/llm-jp--llm-jp-13b-instruct-full-jaster-dolly-oasst-v1.0.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-  "model_id": "llm-jp--llm-jp-13b-instruct-full-jaster-dolly-oasst-v1.0",
-  "model_name_or_path": "llm-jp/llm-jp-13b-instruct-full-jaster-dolly-oasst-v1.0",
-  "lora_model_name_or_path": null,
-  "tokenizer_name_or_path": null,
-  "prompt_template": "{instruction} ### 回答:",
-  "generation_config": {
-    "do_sample": true,
-    "max_length": 2048,
-    "top_p": 0.95
-  },
-  "special_token_map": {}
-}
diff --git a/configs/llm-jp--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0.json b/configs/llm-jp--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0.json
deleted file mode 100644
index c11c1e6..0000000
--- a/configs/llm-jp--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-  "model_id": "llm-jp--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0",
-  "model_name_or_path": "llm-jp/llm-jp-13b-v1.0",
-  "lora_model_name_or_path": "llm-jp/llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0",
-  "tokenizer_name_or_path": null,
-  "prompt_template": "{instruction} ### 回答:",
-  "generation_config": {
-    "do_sample": true,
-    "max_length": 2048,
-    "top_p": 0.95
-  },
-  "special_token_map": {}
-}
diff --git a/configs/openai--text-davinci-003.json b/configs/openai--text-davinci-003.json
deleted file mode 100644
index a07e3c2..0000000
--- a/configs/openai--text-davinci-003.json
+++ /dev/null
@@ -1,16 +0,0 @@
-{
-  "model_id": "openai--text-davinci-003",
-  "model_name_or_path": null,
-  "lora_model_name_or_path": null,
-  "tokenizer_name_or_path": null,
-  "prompt_template": "{instruction}",
-  "generation_config": {
-    "engine": "text-davinci-003",
-    "temperature": 0.0,
-    "max_tokens": 2048,
-    "top_p": 1.0,
-    "frequency_penalty": 0.0,
-    "presence_penalty": 0.0
-  },
-  "special_token_map": {}
-}
diff --git a/configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json b/configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json
deleted file mode 100644
index c83e10c..0000000
--- a/configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json
+++ /dev/null
@@ -1,15 +0,0 @@
-{
-  "model_id": "rinna--japanese-gpt-neox-3.6b-instruction-ppo",
-  "model_name_or_path": "rinna/japanese-gpt-neox-3.6b-instruction-ppo",
-  "lora_model_name_or_path": null,
-  "tokenizer_name_or_path": null,
-  "prompt_template": "ユーザー: {instruction}システム: ",
-  "generation_config": {
-    "do_sample": true,
-    "max_length": 2048,
-    "repetition_penalty": 1.1
-  },
-  "special_token_map": {
-    "": "\n"
-  }
-}
diff --git a/configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json b/configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json
deleted file mode 100644
index f5343a2..0000000
--- a/configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json
+++ /dev/null
@@ -1,15 +0,0 @@
-{
-  "model_id": "rinna--japanese-gpt-neox-3.6b-instruction-sft-v2",
-  "model_name_or_path": "rinna/japanese-gpt-neox-3.6b-instruction-sft-v2",
-  "lora_model_name_or_path": null,
-  "tokenizer_name_or_path": null,
-  "prompt_template": "ユーザー: {instruction}システム: ",
-  "generation_config": {
-    "do_sample": true,
-    "max_length": 2048,
-    "repetition_penalty": 1.1
-  },
-  "special_token_map": {
-    "": "\n"
-  }
-}
diff --git a/configs/tokyotech-llm--Swallow-70b-instruct-hf.json b/configs/tokyotech-llm--Swallow-70b-instruct-hf.json
deleted file mode 100644
index 2fbe0e3..0000000
--- a/configs/tokyotech-llm--Swallow-70b-instruct-hf.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
-  "model_id": "tokyotech-llm--Swallow-70b-instruct-hf",
-  "model_name_or_path": "tokyotech-llm/Swallow-70b-instruct-hf",
-  "lora_model_name_or_path": null,
-  "tokenizer_name_or_path": null,
-  "prompt_template": "以下に、あるタスクを説明する指示があります。リクエストを適切に完了するための回答を記述してください。\n\n### 指示:\n{instruction}\n\n### 応答:\n",
-  "generation_config": {
-    "do_sample": true,
-    "max_length": 2048,
-    "top_p": 0.95
-  }
- }
From 01e60431c0135c2086a8dfa5591ba197070ba5ae Mon Sep 17 00:00:00 2001
From: Sh1gechan
Date: Tue, 25 Jun 2024 17:11:46 +0900
Subject: [PATCH 4/6] fix README

---
 README.md | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 5d05154..ef336b7 100644
--- a/README.md
+++ b/README.md
@@ -26,12 +26,12 @@ python llm_judge/gen_model_answer.py --config
 
 Arguments & Options:
 - `` is the path to a configuration file. Examples are in `configs/`.
-  - `num_answers_per_question` specifies how many to generate (default: all)
+  - `--num_answers_per_question` specifies the number of answers to generate per question (default: 1)
 
 For example:
 
 ```bash
-python llm_judge/gen_model_answer.py --config configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json --num_answers_per_question
+python llm_judge/gen_model_answer.py --config configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json
 ```
 
 
@@ -59,7 +59,7 @@ Arguments & Options:
 - `--model-list ` is a list of model IDs to be evaluated. If not specified, all models in `data/jp_bench/model_answer` will be evaluated.
 - `--yes` is a flag to skip the confirmation prompt.
 - `--wandb` is a flag to enable logging to W&B. You can upload the results later to W&B by running `upload_result.py`, as described in the next section.
-- `num_answers_per_question` : Number of answers to evaluate per question
+- `--num_answers_per_question` specifies the number of answers to evaluate per question (default: all)
 
 **Mode: `pairwise-baseline` (Default)**
 
@@ -162,4 +162,3 @@ If you use our code in your research, please cite our work:
   year={2024}
 }
 ```
-
From ec5a5e0d9ff5ddd160336bc0ca584c736b96084a Mon Sep 17 00:00:00 2001
From: Sh1gechan
Date: Thu, 1 Aug 2024 04:11:28 +0900
Subject: [PATCH 5/6] fix common.py

---
 llm_judge/common.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/llm_judge/common.py b/llm_judge/common.py
index d783c2d..9e6c971 100644
--- a/llm_judge/common.py
+++ b/llm_judge/common.py
@@ -127,9 +127,11 @@ def estimate_cost(self) -> float:
             + len(enc.encode(self.judge.prompt_template["prompt_template"]))
         )
         if self.ref_answer:
-            num_input_tokens += len(
-                enc.encode(self.ref_answer["choices"][0]["turns"][0])
-            )
+            if isinstance(self.ref_answer, list):
+                ref_answer_text = self.ref_answer[0]["choices"][0]["turns"][0]
+            else:
+                ref_answer_text = self.ref_answer["choices"][0]["turns"][0]
+            num_input_tokens += len(enc.encode(ref_answer_text))
         num_output_tokens = 200  # Estimated from a few samples
         if self.judge.model in {"gpt-4", "gpt-4-0613"}:
             return (0.03 * num_input_tokens + 0.06 * num_output_tokens) / 1_000
@@ -217,9 +219,11 @@ def estimate_cost(self) -> float:
             + len(enc.encode(self.judge.prompt_template["prompt_template"]))
         )
         if self.ref_answer:
-            num_input_tokens += len(
-                enc.encode(self.ref_answer["choices"][0]["turns"][0])
-            )
+            if isinstance(self.ref_answer, list):
+                ref_answer_text = self.ref_answer[0]["choices"][0]["turns"][0]
+            else:
+                ref_answer_text = self.ref_answer["choices"][0]["turns"][0]
+            num_input_tokens += len(enc.encode(ref_answer_text))
         num_output_tokens = 200  # Estimated from a few samples
         if self.judge.model in {"gpt-4", "gpt-4-0613"}:
             return (0.03 * num_input_tokens + 0.06 * num_output_tokens) / 1_000
回答:", - "generation_config": { - "do_sample": true, - "max_length": 2048, - "top_p": 0.95 - }, - "special_token_map": {} -} diff --git a/configs/openai--text-davinci-003.json b/configs/openai--text-davinci-003.json deleted file mode 100644 index a07e3c2..0000000 --- a/configs/openai--text-davinci-003.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "model_id": "openai--text-davinci-003", - "model_name_or_path": null, - "lora_model_name_or_path": null, - "tokenizer_name_or_path": null, - "prompt_template": "{instruction}", - "generation_config": { - "engine": "text-davinci-003", - "temperature": 0.0, - "max_tokens": 2048, - "top_p": 1.0, - "frequency_penalty": 0.0, - "presence_penalty": 0.0 - }, - "special_token_map": {} -} diff --git a/configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json b/configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json deleted file mode 100644 index c83e10c..0000000 --- a/configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "model_id": "rinna--japanese-gpt-neox-3.6b-instruction-ppo", - "model_name_or_path": "rinna/japanese-gpt-neox-3.6b-instruction-ppo", - "lora_model_name_or_path": null, - "tokenizer_name_or_path": null, - "prompt_template": "ユーザー: {instruction}システム: ", - "generation_config": { - "do_sample": true, - "max_length": 2048, - "repetition_penalty": 1.1 - }, - "special_token_map": { - "": "\n" - } -} diff --git a/configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json b/configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json deleted file mode 100644 index f5343a2..0000000 --- a/configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "model_id": "rinna--japanese-gpt-neox-3.6b-instruction-sft-v2", - "model_name_or_path": "rinna/japanese-gpt-neox-3.6b-instruction-sft-v2", - "lora_model_name_or_path": null, - "tokenizer_name_or_path": null, - "prompt_template": "ユーザー: {instruction}システム: ", - "generation_config": { - "do_sample": true, - "max_length": 2048, - "repetition_penalty": 1.1 - }, - "special_token_map": { - "": "\n" - } -} diff --git a/configs/tokyotech-llm--Swallow-70b-instruct-hf.json b/configs/tokyotech-llm--Swallow-70b-instruct-hf.json deleted file mode 100644 index 2fbe0e3..0000000 --- a/configs/tokyotech-llm--Swallow-70b-instruct-hf.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "model_id": "tokyotech-llm--Swallow-70b-instruct-hf", - "model_name_or_path": "tokyotech-llm/Swallow-70b-instruct-hf", - "lora_model_name_or_path": null, - "tokenizer_name_or_path": null, - "prompt_template": "以下に、あるタスクを説明する指示があります。リクエストを適切に完了するための回答を記述してください。\n\n### 指示:\n{instruction}\n\n### 応答:\n", - "generation_config": { - "do_sample": true, - "max_length": 2048, - "top_p": 0.95 - } - } From 01e60431c0135c2086a8dfa5591ba197070ba5ae Mon Sep 17 00:00:00 2001 From: Sh1gechan Date: Tue, 25 Jun 2024 17:11:46 +0900 Subject: [PATCH 4/6] fix READEME --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 5d05154..ef336b7 100644 --- a/README.md +++ b/README.md @@ -26,12 +26,12 @@ python llm_judge/gen_model_answer.py --config Arguments & Options: - `` is the path to a configuration file. Examples are in `configs/`. 
diff --git a/llm_judge/gen_gpt3.5_answer.py b/llm_judge/gen_gpt3.5_answer.py
index a222111..2f8a4fe 100644
--- a/llm_judge/gen_gpt3.5_answer.py
+++ b/llm_judge/gen_gpt3.5_answer.py
@@ -5,6 +5,9 @@
 import time
 
 import openai
+from openai import OpenAI
+
+client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 import shortuuid
 from common import PREDICTION_DIR, QUESTION_FILE, load_questions
 from dotenv import load_dotenv
@@ -13,8 +16,8 @@
 logger = logging.getLogger(__name__)
 
 load_dotenv()
-openai.api_key = os.getenv("OPENAI_API_KEY")
-openai.organization = os.getenv("OPENAI_ORGANIZATION")
+# TODO: The 'openai.organization' option isn't read in the client API. You will need to pass it when you instantiate the client, e.g. 'OpenAI(organization=os.getenv("OPENAI_ORGANIZATION"))'
+# openai.organization = os.getenv("OPENAI_ORGANIZATION")
 
 
 def generate_response(input_text, generation_config) -> str:
@@ -24,7 +27,7 @@ def generate_response(input_text, generation_config) -> str:
     Args:
         input_text: The input text.
         generation_config: The config for the generation.
    """
-    response = openai.Completion.create(prompt=input_text, **generation_config)
+    response = client.completions.create(prompt=input_text, **generation_config)
     return response.choices[0].text
 
self.ref_answer["choices"][0]["turns"][0] - num_input_tokens += len(enc.encode(ref_answer_text)) + num_input_tokens += len( + enc.encode(self.ref_answer["choices"][0]["turns"][0]) + ) num_output_tokens = 200 # Estimated from a few samples if self.judge.model in {"gpt-4", "gpt-4-0613"}: return (0.03 * num_input_tokens + 0.06 * num_output_tokens) / 1_000 @@ -152,7 +150,6 @@ def get_score(judgment: str) -> int: return ast.literal_eval(match.groups()[0]) return -1 - @dataclasses.dataclass class MatchPair: question: dict @@ -219,11 +216,9 @@ def estimate_cost(self) -> float: + len(enc.encode(self.judge.prompt_template["prompt_template"])) ) if self.ref_answer: - if isinstance(self.ref_answer, list): - ref_answer_text = self.ref_answer[0]["choices"][0]["turns"][0] - else: - ref_answer_text = self.ref_answer["choices"][0]["turns"][0] - num_input_tokens += len(enc.encode(ref_answer_text)) + num_input_tokens += len( + enc.encode(self.ref_answer["choices"][0]["turns"][0]) + ) num_output_tokens = 200 # Estimated from a few samples if self.judge.model in {"gpt-4", "gpt-4-0613"}: return (0.03 * num_input_tokens + 0.06 * num_output_tokens) / 1_000 @@ -263,6 +258,19 @@ def get_model_list(answer_dir: Union[str, Path]): return [path.name for path in Path(answer_dir).iterdir()] +# def load_model_answers(answer_dir: Union[str, Path]): +# """Load model answers. + +# Args: +# answer_dir (Union[str, Path]): The answer directory. +# """ +# answers = {} +# with open(Path(answer_dir) / "results.jsonl", "r") as fin: +# for line in fin: +# answer = json.loads(line) +# answers[answer["question_id"]] = answer +# return answers + def load_model_answers(answer_dir: Union[str, Path]): """Load model answers. @@ -273,7 +281,10 @@ def load_model_answers(answer_dir: Union[str, Path]): with open(Path(answer_dir) / "results.jsonl", "r") as fin: for line in fin: answer = json.loads(line) - answers[answer["question_id"]] = answer + qid = answer["question_id"] + if qid not in answers: + answers[qid] = [] + answers[qid].append(answer) return answers @@ -369,9 +380,4 @@ def filter_pairwise_judgements( filtered_result_id_results_map[result_id] = results else: filtered_result_id_results_map[result_id] = results - return filtered_result_id_results_map - - - - - + return filtered_result_id_results_map \ No newline at end of file diff --git a/llm_judge/gen_gpt3.5_answer.py b/llm_judge/gen_gpt3.5_answer.py index a222111..2f8a4fe 100644 --- a/llm_judge/gen_gpt3.5_answer.py +++ b/llm_judge/gen_gpt3.5_answer.py @@ -5,6 +5,9 @@ import time import openai +from openai import OpenAI + +client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) import shortuuid from common import PREDICTION_DIR, QUESTION_FILE, load_questions from dotenv import load_dotenv @@ -13,8 +16,8 @@ logger = logging.getLogger(__name__) load_dotenv() -openai.api_key = os.getenv("OPENAI_API_KEY") -openai.organization = os.getenv("OPENAI_ORGANIZATION") +# TODO: The 'openai.organization' option isn't read in the client API. You will need to pass it when you instantiate the client, e.g. 'OpenAI(organization=os.getenv("OPENAI_ORGANIZATION"))' +# openai.organization = os.getenv("OPENAI_ORGANIZATION") def generate_response(input_text, generation_config) -> str: @@ -24,7 +27,7 @@ def generate_response(input_text, generation_config) -> str: input_text: The input text. generation_config: The config for the generation. 
""" - response = openai.Completion.create(prompt=input_text, **generation_config) + response = client.completions.create(prompt=input_text, **generation_config) return response.choices[0].text diff --git a/llm_judge/gen_judgment.py b/llm_judge/gen_judgment.py index 51688fa..4e44f51 100644 --- a/llm_judge/gen_judgment.py +++ b/llm_judge/gen_judgment.py @@ -3,7 +3,7 @@ import logging from concurrent.futures import ThreadPoolExecutor from itertools import combinations -from typing import Optional +from typing import Optional, Dict, List from common import ( JUDGEMENT_DIR, @@ -27,101 +27,141 @@ def make_match_groups_single( - questions: list[dict], - model_answers: dict[str, dict[int, dict]], - ref_answers: dict[str, dict[int, dict]], + questions: List[dict], + model_answers: Dict[str, Dict[int, List[dict]]], + ref_answers: Dict[str, Dict[int, List[dict]]], judge_default: Judge, judge_math: Judge, num_answers_per_question: Optional[int] = None, ): - """Make match groups for single answer grading. - - Args: - questions (list): A list of questions. - model_answers (dict): A dict of model answers. - ref_answers (dict): A dict of reference answers. - judge_default (Judge): A judge for default questions. - judge_math (Judge): A judge for math questions. - num_answers_per_question (Optional[int]): Number of answers to evaluate per question. - """ - match_groups = {} - for model in model_answers: - matches = [] - for question in questions: - qid = question["question_id"] - answer = model_answers[model][qid] - if question["category"] in NEED_REF_CATS: - judge = judge_math - ref_answer = ref_answers[judge.model][qid] - else: - judge = judge_default + """Make match groups for single answer grading.""" + + match_groups = {model: [] for model in model_answers} + + for question in questions: + qid = question["question_id"] + category = question["category"] + + # Determine if reference answer is needed + if category in NEED_REF_CATS: + judge = judge_math + ref_answer_list = ref_answers[judge.model].get(qid) + if not ref_answer_list: + logger.warning(f"No reference answer for question {qid} in model {judge.model}") ref_answer = None - matches.append( - MatchSingle( + else: + ref_answer = ref_answer_list[0] + else: + judge = judge_default + ref_answer = None + # Get all models that have answers for this question + available_models = [model for model, answers in model_answers.items() if qid in answers] + + for model in available_models: + answers = model_answers[model][qid] + if num_answers_per_question is not None: + selected_answers = answers[:num_answers_per_question] + else: + selected_answers = answers + + for answer in selected_answers: + match = MatchSingle( question=question, model=model, answer=answer, judge=judge, ref_answer=ref_answer, ) - ) - if num_answers_per_question: - matches = matches[:num_answers_per_question] - match_groups[f"single:{model}"] = matches + match_groups[model].append(match) + return match_groups def make_match_groups_pairwise( - questions: list[dict], - model_answers: dict[str, dict[int, dict]], - ref_answers: dict[str, dict[int, dict]], + questions: List[dict], + model_answers: Dict[str, Dict[int, List[dict]]], + ref_answers: Dict[str, Dict[int, List[dict]]], judge_default: Judge, judge_math: Judge, baseline_model: Optional[str] = None, num_answers_per_question: Optional[int] = None, ): - """Make match groups for pairwise comparison. - - Args: - questions (list): A list of questions. - model_answers (dict): A dict of model answers. 
-        ref_answers (dict): A dict of reference answers.
-        judge_default (Judge): A judge for default questions.
-        judge_math (Judge): A judge for math questions.
-        baseline_model (Optional[str]): The baseline model.
-        num_answers_per_question (Optional[int]): Number of answers to evaluate per question.
-    """
+    """Make match groups for pairwise comparison."""
+
     match_groups = {}
-    for model_1, model_2 in combinations(model_answers, 2):
-        if baseline_model and baseline_model not in {model_1, model_2}:
-            continue
-        matches = []
-        for question in questions:
-            qid = question["question_id"]
-            answer_1 = model_answers[model_1][qid]
-            answer_2 = model_answers[model_2][qid]
-            if question["category"] in NEED_REF_CATS:
-                judge = judge_math
-                ref_answer = ref_answers[judge.model][qid]
-            else:
-                judge = judge_default
+
+    for question in questions:
+        qid = question["question_id"]
+        category = question["category"]
+
+        # Determine if reference answer is needed
+        if category in NEED_REF_CATS:
+            judge = judge_math
+            ref_answer_list = ref_answers[judge.model].get(qid)
+            if not ref_answer_list:
+                logger.warning(f"No reference answer for question {qid} in model {judge.model}")
                 ref_answer = None
-            matches.append(
-                MatchPair(
-                    question=question,
-                    model_1=model_1,
-                    model_2=model_2,
-                    answer_1=answer_1,
-                    answer_2=answer_2,
-                    judge=judge,
-                    ref_answer=ref_answer,
-                )
-            )
-        if num_answers_per_question:
-            matches = matches[:num_answers_per_question]
-        match_groups[f"pairwise:{model_1}_{model_2}"] = matches
-    return match_groups
+            else:
+                ref_answer = ref_answer_list[0]
+        else:
+            judge = judge_default
+            ref_answer = None
+
+        # Get all models that have answers for this question
+        available_models = [model for model, answers in model_answers.items() if qid in answers]
+
+        if baseline_model:
+            if baseline_model not in available_models:
Skipping.") + continue + non_baseline_models = [model for model in available_models if model != baseline_model] + else: + non_baseline_models = available_models + + if num_answers_per_question is not None: + selected_non_baseline_models = non_baseline_models[:num_answers_per_question] + else: + selected_non_baseline_models = non_baseline_models + + if baseline_model: + selected_models = selected_non_baseline_models + [baseline_model] + else: + selected_models = selected_non_baseline_models + # Generate all unique pairs + for model_1, model_2 in combinations(selected_models, 2): + if baseline_model and (model_1 != baseline_model and model_2 != baseline_model): + # In pairwise-baseline mode, only create pairs with the baseline + continue + + pair_key = f"pairwise:{model_1}_{model_2}" + if pair_key not in match_groups: + match_groups[pair_key] = [] + + answers_1 = model_answers[model_1][qid] + answers_2 = model_answers[model_2][qid] + + if num_answers_per_question is not None: + selected_answers_1 = answers_1[:num_answers_per_question] + selected_answers_2 = answers_2[:num_answers_per_question] + else: + selected_answers_1 = answers_1 + selected_answers_2 = answers_2 + + for ans1 in selected_answers_1: + for ans2 in selected_answers_2: + match = MatchPair( + question=question, + model_1=model_1, + model_2=model_2, + answer_1=ans1, + answer_2=ans2, + judge=judge, + ref_answer=ref_answer, + ) + match_groups[pair_key].append(match) + + return match_groups if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -140,7 +180,7 @@ def make_match_groups_pairwise( parser.add_argument( "--judge-model", type=str, - default="gpt-4-0613", + default="gpt-4", choices=["gpt-4", "gpt-4-0613", "gpt-4-1106-preview", "gpt-3.5-turbo"], help="The judge model.", ) @@ -176,7 +216,10 @@ def make_match_groups_pairwise( "--verbose", "-v", action="count", default=0, help="Verbosity level" ) parser.add_argument( - "--num_answers_per_question", type=int, default=None, help="Number of answers to evaluate per question." 
+        "--num_answers_per_question",
+        type=int,
+        default=None,
+        help="Number of answers to evaluate per question.",
     )
 
     args = parser.parse_args()
@@ -220,14 +263,20 @@
     for model in sorted(models):
         answers = load_model_answers(PREDICTION_DIR / model)
         for question in questions:
-            assert question["question_id"] in answers
+            qid = question["question_id"]
+            if qid not in answers:
+                logger.error(f"Question ID {qid} missing in model {model} answers.")
+                raise ValueError(f"Question ID {qid} missing in model {model} answers.")
         model_answers[model] = answers
 
     logger.info("Load reference answers")
     judge_model = args.judge_model
     answers = load_model_answers(REFERENCE_DIR / "gpt-4")
     for question in filter(lambda x: x["category"] in NEED_REF_CATS, questions):
-        assert question["question_id"] in answers
+        qid = question["question_id"]
+        if qid not in answers:
+            logger.error(f"Reference answer for question ID {qid} missing.")
+            raise ValueError(f"Reference answer for question ID {qid} missing.")
     ref_answers = {judge_model: answers}
 
     logger.info("Load judge prompts")
@@ -266,6 +315,8 @@
             num_answers_per_question=args.num_answers_per_question,
         )
         output_dir = JUDGEMENT_DIR / "pairwise" / args.judge_model
+
+    # Filter out existing match_ids if not overwriting
     target_match_ids = set()
     for match_id in match_groups:
         output_file = output_dir / f"{match_id}.jsonl"
@@ -303,16 +354,19 @@
     with ThreadPoolExecutor(args.parallel) as executor:
         futures = [executor.submit(match.play) for match in matches]
         for future in tqdm(futures):
-            results.append(future.result())
+            try:
+                result = future.result()
+                results.append(result)
+            except Exception as e:
+                logger.error(f"Error processing match {match_id}: {e}")
 
     logger.info(f"Write {len(results)} judgments")
     output_file.parent.mkdir(parents=True, exist_ok=True)
-    with open(output_file, "w") as f:
+    with open(output_file, "w", encoding="utf-8") as f:
         for result in results:
             f.write(json.dumps(result, ensure_ascii=False) + "\n")
     logger.info(f"Saved the judgments to {output_file}")
 
     if args.wandb:
         logger.info("Log to wandb")
-        upload_results(args.mode, match_id, results, args.baseline_model)
-
+        upload_results(args.mode, match_id, results, args.baseline_model)
\ No newline at end of file
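[Note] Both `estimate_cost` implementations shown in patches 5 and 6 price gpt-4 at 0.03 USD per 1K input tokens and 0.06 USD per 1K output tokens, with a fixed 200-token output estimate. A worked example of that formula:

```python
# Hypothetical input size; the real count comes from tiktoken over the
# question, answer(s), prompt template, and optional reference answer.
num_input_tokens = 1_500
num_output_tokens = 200  # fixed estimate used by estimate_cost()
cost = (0.03 * num_input_tokens + 0.06 * num_output_tokens) / 1_000
print(f"{cost:.4f} USD per judgment")  # -> 0.0570 USD per judgment
```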