OpenAI version upgrade (latest version) #56

Open: wants to merge 6 commits into `main`.
8 changes: 6 additions & 2 deletions README.md
@@ -26,13 +26,16 @@ python llm_judge/gen_model_answer.py --config <CONFIG-PATH>

Arguments & Options:
- `<CONFIG-PATH>` is the path to a configuration file. Examples are in `configs/`.
- `num_answers_per_question` specifies the number of answers to generate per question (default: all).

For example:

```bash
python llm_judge/gen_model_answer.py --config configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json
```

#### Step 2. Generate GPT-4 judgments

There are several options to use GPT-4 as a judge, such as pairwise win-rate and single-answer grading.
@@ -43,7 +46,8 @@ OPENAI_API_KEY=<YOUR-KEY> python llm_judge/gen_judgment.py \
[--baseline-model <BASELINE-MODEL-ID>] \
[--model-list <LIST-OF-MODEL-IDS>] \
[--yes] \
[--wandb]
[--wandb] \
    [--num_answers_per_question <N>]
```

Arguments & Options:
@@ -55,6 +59,7 @@ Arguments & Options:
- `--model-list <LIST-OF-MODEL-IDS>` is a list of model IDs to be evaluated. If not specified, all models in `data/jp_bench/model_answer` will be evaluated.
- `--yes` is a flag to skip the confirmation prompt.
- `--wandb` is a flag to enable logging to W&B. You can upload the results later to W&B by running `upload_result.py`, as described in the next section.
- `--num_answers_per_question <N>` specifies the number of answers to evaluate per question (default: all).

**Mode: `pairwise-baseline` (Default)**

@@ -157,4 +162,3 @@ If you use our code in your research, please cite our work:
year={2024}
}
```

Deleted files:
- `configs/README.md` (30 lines)
- `configs/cyberagent--calm2-7b-chat.json` (13 lines)
- two further config files (names not shown in this view)
- `configs/openai--text-davinci-003.json` (16 lines)
- `configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json` (16 lines)
- `configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json` (16 lines)
- `configs/tokyotech-llm--Swallow-70b-instruct-hf.json` (13 lines)
40 changes: 29 additions & 11 deletions llm_judge/common.py
@@ -9,17 +9,20 @@
from typing import Optional, Union

import openai
from openai import AzureOpenAI
Review thread on this import:

Member: Please make this an implementation that works with the OpenAI API as well, not just the Azure API.

Author: Currently only the Azure API is available to me, so I cannot verify the OpenAI path. Is that acceptable?

Member: In that case, we will implement and test this part on our side.

Author: Understood.

client = AzureOpenAI(api_key=os.getenv("OPENAI_API_KEY"),
                     api_version=os.getenv("OPENAI_API_VERSION"))
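The review thread above asks for an implementation that works with both the plain OpenAI API and Azure. One way to do that is to decide which client to build from environment variables. The sketch below is illustrative, not part of this PR; the function name and the exact selection rule are assumptions, and it deliberately returns the client kind plus constructor kwargs so it needs nothing beyond the standard library:

```python
import os


def select_backend(env=None):
    """Decide which OpenAI client to build from environment variables.

    Returns ("azure", kwargs) or ("openai", kwargs); the caller then does
    AzureOpenAI(**kwargs) or OpenAI(**kwargs) respectively.
    """
    env = os.environ if env is None else env
    if env.get("OPENAI_API_TYPE") == "azure":
        # AzureOpenAI additionally needs an API version and an endpoint.
        return "azure", {
            "api_key": env.get("OPENAI_API_KEY"),
            "api_version": env.get("OPENAI_API_VERSION"),
            "azure_endpoint": env.get("OPENAI_API_BASE"),
        }
    return "openai", {
        "api_key": env.get("OPENAI_API_KEY"),
        "organization": env.get("OPENAI_ORGANIZATION"),
    }
```

Keeping the decision in one small function means the rest of the module can hold a single `client` object regardless of backend.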
import tiktoken
from dotenv import load_dotenv

logger = logging.getLogger(__name__)

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.organization = os.getenv("OPENAI_ORGANIZATION")
openai.api_type = os.getenv("OPENAI_API_TYPE")
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_version = os.getenv("OPENAI_API_VERSION")
# TODO: The 'openai.organization' option isn't read in the client API. You will need to pass it when you instantiate the client, e.g. 'OpenAI(organization=os.getenv("OPENAI_ORGANIZATION"))'
# openai.organization = os.getenv("OPENAI_ORGANIZATION")
# TODO: The 'openai.api_base' option isn't read in the client API. You will need to pass it when you instantiate the client, e.g. 'OpenAI(base_url=os.getenv("OPENAI_API_BASE"))'
# openai.api_base = os.getenv("OPENAI_API_BASE")

# Data paths
JP_BENCH_DIR = Path(__file__).resolve().parent.parent / "data" / "jp_bench"
@@ -68,9 +71,9 @@ def judge(self, **kwargs):
params["engine"] = self.model
else:
params["model"] = self.model
response = openai.ChatCompletion.create(**params)
return response["choices"][0]["message"]["content"]
except openai.error.OpenAIError as e:
response = client.chat.completions.create(**params)
return response.choices[0].message.content
except openai.OpenAIError as e:
logger.warning(f"OpenAI API error: {e}")
time.sleep(API_RETRY_SLEEP)
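The hunk above makes the two changes the 1.x SDK requires: dict-style access (`response["choices"][0]...`) becomes attribute access, and `openai.error.OpenAIError` becomes `openai.OpenAIError`. The retry loop itself is API-agnostic; a stdlib-only sketch of the same shape, with the exception type and the API call injected so nothing here depends on the `openai` package (constants are illustrative, not the repo's values):

```python
import time

API_RETRY_SLEEP = 1   # seconds between retries (illustrative)
API_MAX_RETRY = 3     # illustrative


def call_with_retry(call, retry_on, max_retry=API_MAX_RETRY, sleep=API_RETRY_SLEEP):
    """Invoke call(), retrying up to max_retry times on retry_on exceptions.

    In the PR, call would be lambda: client.chat.completions.create(**params)
    and retry_on would be openai.OpenAIError.
    """
    for attempt in range(max_retry):
        try:
            return call()
        except retry_on as e:
            print(f"API error (attempt {attempt + 1}): {e}")
            time.sleep(sleep)
    return None  # all retries exhausted
```

Injecting the exception class keeps the retry logic testable without network access or the SDK installed.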

@@ -147,7 +150,6 @@ def get_score(judgment: str) -> int:
return ast.literal_eval(match.groups()[0])
return -1
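`get_score` above evaluates the first regex capture as a Python literal and falls back to -1 when nothing matches. A self-contained version of that pattern follows; the rating regex is an assumption for illustration, since the repo's actual pattern sits outside this hunk:

```python
import ast
import re

# Illustrative pattern: single-answer judgments often end with "Rating: [[7]]".
# The actual regex used by the repository is not shown in this diff.
RATING_RE = re.compile(r"\[\[(\d+(?:\.\d+)?)\]\]")


def get_score(judgment: str):
    """Extract a numeric rating from a judgment string, or -1 if none is found."""
    match = RATING_RE.search(judgment)
    if match:
        # literal_eval turns "7" into 7 and "8.5" into 8.5.
        return ast.literal_eval(match.groups()[0])
    return -1
```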


@dataclasses.dataclass
class MatchPair:
question: dict
@@ -256,6 +258,19 @@ def get_model_list(answer_dir: Union[str, Path]):
return [path.name for path in Path(answer_dir).iterdir()]


# def load_model_answers(answer_dir: Union[str, Path]):
# """Load model answers.

# Args:
# answer_dir (Union[str, Path]): The answer directory.
# """
# answers = {}
# with open(Path(answer_dir) / "results.jsonl", "r") as fin:
# for line in fin:
# answer = json.loads(line)
# answers[answer["question_id"]] = answer
# return answers

def load_model_answers(answer_dir: Union[str, Path]):
"""Load model answers.

@@ -266,7 +281,10 @@ def load_model_answers(answer_dir: Union[str, Path]):
with open(Path(answer_dir) / "results.jsonl", "r") as fin:
for line in fin:
answer = json.loads(line)
answers[answer["question_id"]] = answer
qid = answer["question_id"]
if qid not in answers:
answers[qid] = []
answers[qid].append(answer)
return answers
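The rewritten `load_model_answers` groups multiple answers under each question ID instead of keeping only the last one, which is presumably what the new `--num_answers_per_question` option relies on. The same grouping reads a little more idiomatically with `collections.defaultdict`; a standalone sketch:

```python
import json
from collections import defaultdict
from pathlib import Path
from typing import Union


def load_model_answers(answer_dir: Union[str, Path]) -> dict:
    """Map each question_id in results.jsonl to the list of its answers."""
    answers = defaultdict(list)
    with open(Path(answer_dir) / "results.jsonl") as fin:
        for line in fin:
            answer = json.loads(line)
            answers[answer["question_id"]].append(answer)
    # Return a plain dict so missing keys raise KeyError for callers.
    return dict(answers)
```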


@@ -362,4 +380,4 @@ def filter_pairwise_judgements(
filtered_result_id_results_map[result_id] = results
else:
filtered_result_id_results_map[result_id] = results
return filtered_result_id_results_map
return filtered_result_id_results_map
9 changes: 6 additions & 3 deletions llm_judge/gen_gpt3.5_answer.py
@@ -5,6 +5,9 @@
import time

import openai
from openai import OpenAI

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
import shortuuid
from common import PREDICTION_DIR, QUESTION_FILE, load_questions
from dotenv import load_dotenv
@@ -13,8 +16,8 @@
logger = logging.getLogger(__name__)

load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
openai.organization = os.getenv("OPENAI_ORGANIZATION")
# TODO: The 'openai.organization' option isn't read in the client API. You will need to pass it when you instantiate the client, e.g. 'OpenAI(organization=os.getenv("OPENAI_ORGANIZATION"))'
# openai.organization = os.getenv("OPENAI_ORGANIZATION")


def generate_response(input_text, generation_config) -> str:
Expand All @@ -24,7 +27,7 @@ def generate_response(input_text, generation_config) -> str:
input_text: The input text.
generation_config: The config for the generation.
"""
response = openai.Completion.create(prompt=input_text, **generation_config)
response = client.completions.create(prompt=input_text, **generation_config)
return response.choices[0].text

