diff --git a/README.md b/README.md
index 854eb92..ef336b7 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,7 @@ python llm_judge/gen_model_answer.py --config <config-path>
 
 Arguments & Options:
 - `<config-path>` is the path to a configuration file. Examples are in `configs/`.
+- `--num_answers_per_question <N>` specifies the number of answers to generate per question (default: 1).
 
 For example:
 
@@ -33,6 +34,8 @@ For example:
 python llm_judge/gen_model_answer.py --config configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json
 ```
 
+
+
 #### Step 2. Generate GPT-4 judgments
 
 There are several options to use GPT-4 as a judge, such as pairwise win-rate and single-answer grading.
@@ -43,7 +46,8 @@ OPENAI_API_KEY=<OPENAI-API-KEY> python llm_judge/gen_judgment.py \
     [--baseline-model <MODEL-ID>] \
     [--model-list <LIST-OF-MODEL-IDS>] \
     [--yes] \
-    [--wandb]
+    [--wandb] \
+    [--num_answers_per_question <N>]
 ```
 
 Arguments & Options:
@@ -55,6 +59,7 @@ Arguments & Options:
 - `--model-list <LIST-OF-MODEL-IDS>` is a list of model IDs to be evaluated. If not specified, all models in `data/jp_bench/model_answer` will be evaluated.
 - `--yes` is a flag to skip the confirmation prompt.
 - `--wandb` is a flag to enable logging to W&B. You can upload the results later to W&B by running `upload_result.py`, as described in the next section.
+- `--num_answers_per_question <N>` specifies the number of answers to evaluate per question (default: all available answers).
 
 **Mode: `pairwise-baseline` (Default)**
 
@@ -157,4 +162,3 @@ If you use our code in your research, please cite our work:
   year={2024}
 }
 ```
-
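With `--num_answers_per_question <N>` set above 1, `gen_model_answer.py` simply writes several records with the same `question_id` into `data/jp_bench/model_answer/<model-id>/results.jsonl`. A minimal sketch of that layout (the helper and the sample records below are illustrative, not part of the repository):

```python
import json
from collections import defaultdict

def group_answers(jsonl_lines):
    """Group results.jsonl records by question_id, mirroring load_model_answers in llm_judge/common.py."""
    grouped = defaultdict(list)
    for line in jsonl_lines:
        record = json.loads(line)
        grouped[record["question_id"]].append(record)
    return grouped

# Two answers generated for the same question, e.g. with --num_answers_per_question 2.
sample = [
    '{"question_id": 1, "answer_id": "a1", "model_id": "demo-model"}',
    '{"question_id": 1, "answer_id": "a2", "model_id": "demo-model"}',
]
print({qid: len(records) for qid, records in group_answers(sample).items()})  # -> {1: 2}
```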
"model_name_or_path": "llm-jp/llm-jp-13b-instruct-full-jaster-dolly-oasst-v1.0", - "lora_model_name_or_path": null, - "tokenizer_name_or_path": null, - "prompt_template": "{instruction} ### 回答:", - "generation_config": { - "do_sample": true, - "max_length": 2048, - "temperature": 0.7, - "top_p": 0.95 - }, - "special_token_map": {} -} diff --git a/configs/llm-jp--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0.json b/configs/llm-jp--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0.json deleted file mode 100644 index 73fac46..0000000 --- a/configs/llm-jp--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "model_id": "llm-jp--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0", - "model_name_or_path": "llm-jp/llm-jp-13b-v1.0", - "lora_model_name_or_path": "llm-jp/llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0", - "tokenizer_name_or_path": null, - "prompt_template": "{instruction} ### 回答:", - "generation_config": { - "do_sample": true, - "max_length": 2048, - "temperature": 0.7, - "top_p": 0.95 - }, - "special_token_map": {} -} diff --git a/configs/openai--text-davinci-003.json b/configs/openai--text-davinci-003.json deleted file mode 100644 index a07e3c2..0000000 --- a/configs/openai--text-davinci-003.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "model_id": "openai--text-davinci-003", - "model_name_or_path": null, - "lora_model_name_or_path": null, - "tokenizer_name_or_path": null, - "prompt_template": "{instruction}", - "generation_config": { - "engine": "text-davinci-003", - "temperature": 0.0, - "max_tokens": 2048, - "top_p": 1.0, - "frequency_penalty": 0.0, - "presence_penalty": 0.0 - }, - "special_token_map": {} -} diff --git a/configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json b/configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json deleted file mode 100644 index 147e6a7..0000000 --- a/configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "model_id": "rinna--japanese-gpt-neox-3.6b-instruction-ppo", - "model_name_or_path": "rinna/japanese-gpt-neox-3.6b-instruction-ppo", - "lora_model_name_or_path": null, - "tokenizer_name_or_path": null, - "prompt_template": "ユーザー: {instruction}システム: ", - "generation_config": { - "do_sample": true, - "max_length": 2048, - "temperature": 0.7, - "repetition_penalty": 1.1 - }, - "special_token_map": { - "": "\n" - } -} diff --git a/configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json b/configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json deleted file mode 100644 index d69e5ab..0000000 --- a/configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "model_id": "rinna--japanese-gpt-neox-3.6b-instruction-sft-v2", - "model_name_or_path": "rinna/japanese-gpt-neox-3.6b-instruction-sft-v2", - "lora_model_name_or_path": null, - "tokenizer_name_or_path": null, - "prompt_template": "ユーザー: {instruction}システム: ", - "generation_config": { - "do_sample": true, - "max_length": 2048, - "temperature": 0.7, - "repetition_penalty": 1.1 - }, - "special_token_map": { - "": "\n" - } -} diff --git a/configs/tokyotech-llm--Swallow-70b-instruct-hf.json b/configs/tokyotech-llm--Swallow-70b-instruct-hf.json deleted file mode 100644 index f6b5efa..0000000 --- a/configs/tokyotech-llm--Swallow-70b-instruct-hf.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "model_id": "tokyotech-llm--Swallow-70b-instruct-hf", - "model_name_or_path": "tokyotech-llm/Swallow-70b-instruct-hf", - "lora_model_name_or_path": null, - "tokenizer_name_or_path": null, - 
"prompt_template": "以下に、あるタスクを説明する指示があります。リクエストを適切に完了するための回答を記述してください。\n\n### 指示:\n{instruction}\n\n### 応答:\n", - "generation_config": { - "do_sample": true, - "max_length": 2048, - "temperature": 0.99, - "top_p": 0.95 - } - } diff --git a/llm_judge/common.py b/llm_judge/common.py index 789bf12..e5b6db8 100644 --- a/llm_judge/common.py +++ b/llm_judge/common.py @@ -9,17 +9,20 @@ from typing import Optional, Union import openai +from openai import AzureOpenAI + +client = AzureOpenAI(api_key=os.getenv("OPENAI_API_KEY"), +api_version=os.getenv("OPENAI_API_VERSION")) import tiktoken from dotenv import load_dotenv logger = logging.getLogger(__name__) load_dotenv() -openai.api_key = os.getenv("OPENAI_API_KEY") -openai.organization = os.getenv("OPENAI_ORGANIZATION") -openai.api_type = os.getenv("OPENAI_API_TYPE") -openai.api_base = os.getenv("OPENAI_API_BASE") -openai.api_version = os.getenv("OPENAI_API_VERSION") +# TODO: The 'openai.organization' option isn't read in the client API. You will need to pass it when you instantiate the client, e.g. 'OpenAI(organization=os.getenv("OPENAI_ORGANIZATION"))' +# openai.organization = os.getenv("OPENAI_ORGANIZATION") +# TODO: The 'openai.api_base' option isn't read in the client API. You will need to pass it when you instantiate the client, e.g. 'OpenAI(base_url=os.getenv("OPENAI_API_BASE"))' +# openai.api_base = os.getenv("OPENAI_API_BASE") # Data paths JP_BENCH_DIR = Path(__file__).resolve().parent.parent / "data" / "jp_bench" @@ -68,9 +71,9 @@ def judge(self, **kwargs): params["engine"] = self.model else: params["model"] = self.model - response = openai.ChatCompletion.create(**params) - return response["choices"][0]["message"]["content"] - except openai.error.OpenAIError as e: + response = client.chat.completions.create(**params) + return response.choices[0].message.content + except openai.OpenAIError as e: logger.warning(f"OpenAI API error: {e}") time.sleep(API_RETRY_SLEEP) @@ -147,7 +150,6 @@ def get_score(judgment: str) -> int: return ast.literal_eval(match.groups()[0]) return -1 - @dataclasses.dataclass class MatchPair: question: dict @@ -256,6 +258,19 @@ def get_model_list(answer_dir: Union[str, Path]): return [path.name for path in Path(answer_dir).iterdir()] +# def load_model_answers(answer_dir: Union[str, Path]): +# """Load model answers. + +# Args: +# answer_dir (Union[str, Path]): The answer directory. +# """ +# answers = {} +# with open(Path(answer_dir) / "results.jsonl", "r") as fin: +# for line in fin: +# answer = json.loads(line) +# answers[answer["question_id"]] = answer +# return answers + def load_model_answers(answer_dir: Union[str, Path]): """Load model answers. 
@@ -266,7 +281,10 @@ def load_model_answers(answer_dir: Union[str, Path]): with open(Path(answer_dir) / "results.jsonl", "r") as fin: for line in fin: answer = json.loads(line) - answers[answer["question_id"]] = answer + qid = answer["question_id"] + if qid not in answers: + answers[qid] = [] + answers[qid].append(answer) return answers @@ -362,4 +380,4 @@ def filter_pairwise_judgements( filtered_result_id_results_map[result_id] = results else: filtered_result_id_results_map[result_id] = results - return filtered_result_id_results_map + return filtered_result_id_results_map \ No newline at end of file diff --git a/llm_judge/gen_gpt3.5_answer.py b/llm_judge/gen_gpt3.5_answer.py index a222111..2f8a4fe 100644 --- a/llm_judge/gen_gpt3.5_answer.py +++ b/llm_judge/gen_gpt3.5_answer.py @@ -5,6 +5,9 @@ import time import openai +from openai import OpenAI + +client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) import shortuuid from common import PREDICTION_DIR, QUESTION_FILE, load_questions from dotenv import load_dotenv @@ -13,8 +16,8 @@ logger = logging.getLogger(__name__) load_dotenv() -openai.api_key = os.getenv("OPENAI_API_KEY") -openai.organization = os.getenv("OPENAI_ORGANIZATION") +# TODO: The 'openai.organization' option isn't read in the client API. You will need to pass it when you instantiate the client, e.g. 'OpenAI(organization=os.getenv("OPENAI_ORGANIZATION"))' +# openai.organization = os.getenv("OPENAI_ORGANIZATION") def generate_response(input_text, generation_config) -> str: @@ -24,7 +27,7 @@ def generate_response(input_text, generation_config) -> str: input_text: The input text. generation_config: The config for the generation. """ - response = openai.Completion.create(prompt=input_text, **generation_config) + response = client.completions.create(prompt=input_text, **generation_config) return response.choices[0].text diff --git a/llm_judge/gen_judgment.py b/llm_judge/gen_judgment.py index 6f6379a..4e44f51 100644 --- a/llm_judge/gen_judgment.py +++ b/llm_judge/gen_judgment.py @@ -3,7 +3,7 @@ import logging from concurrent.futures import ThreadPoolExecutor from itertools import combinations -from typing import Optional +from typing import Optional, Dict, List from common import ( JUDGEMENT_DIR, @@ -27,93 +27,141 @@ def make_match_groups_single( - questions: list[dict], - model_answers: dict[str, dict[int, dict]], - ref_answers: dict[str, dict[int, dict]], + questions: List[dict], + model_answers: Dict[str, Dict[int, List[dict]]], + ref_answers: Dict[str, Dict[int, List[dict]]], judge_default: Judge, judge_math: Judge, + num_answers_per_question: Optional[int] = None, ): - """Make match groups for single answer grading. - - Args: - questions (list): A list of questions. - model_answers (dict): A dict of model answers. - ref_answers (dict): A dict of reference answers. - judge_default (Judge): A judge for default questions. - judge_math (Judge): A judge for math questions. 
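The `load_model_answers` hunk above changes the loader's contract: each `question_id` now maps to a list of answer records rather than a single record, so callers must pick an element (as `gen_judgment.py` does with `ref_answer_list[0]`). A small sketch of the new shape (the type alias and sample data are illustrative, not from the repository):

```python
from typing import Dict, List

# New shape: question_id -> list of answer records (previously question_id -> one record).
ModelAnswers = Dict[int, List[dict]]

def first_answer(answers: ModelAnswers, question_id: int) -> dict:
    """Callers that previously wrote `answers[qid]` now select one record, e.g. the first."""
    return answers[question_id][0]

answers: ModelAnswers = {1: [{"question_id": 1, "answer_id": "a1"}, {"question_id": 1, "answer_id": "a2"}]}
print(first_answer(answers, 1)["answer_id"])  # -> "a1"
```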
- """ - match_groups = {} - for model in model_answers: - matches = [] - for question in questions: - qid = question["question_id"] - answer = model_answers[model][qid] - if question["category"] in NEED_REF_CATS: - judge = judge_math - ref_answer = ref_answers[judge.model][qid] - else: - judge = judge_default + """Make match groups for single answer grading.""" + + match_groups = {model: [] for model in model_answers} + + for question in questions: + qid = question["question_id"] + category = question["category"] + + # Determine if reference answer is needed + if category in NEED_REF_CATS: + judge = judge_math + ref_answer_list = ref_answers[judge.model].get(qid) + if not ref_answer_list: + logger.warning(f"No reference answer for question {qid} in model {judge.model}") ref_answer = None - matches.append( - MatchSingle( + else: + ref_answer = ref_answer_list[0] + else: + judge = judge_default + ref_answer = None + # Get all models that have answers for this question + available_models = [model for model, answers in model_answers.items() if qid in answers] + + for model in available_models: + answers = model_answers[model][qid] + if num_answers_per_question is not None: + selected_answers = answers[:num_answers_per_question] + else: + selected_answers = answers + + for answer in selected_answers: + match = MatchSingle( question=question, model=model, answer=answer, judge=judge, ref_answer=ref_answer, ) - ) - match_groups[f"single:{model}"] = matches + match_groups[model].append(match) + return match_groups def make_match_groups_pairwise( - questions: list[dict], - model_answers: dict[str, dict[int, dict]], - ref_answers: dict[str, dict[int, dict]], + questions: List[dict], + model_answers: Dict[str, Dict[int, List[dict]]], + ref_answers: Dict[str, Dict[int, List[dict]]], judge_default: Judge, judge_math: Judge, baseline_model: Optional[str] = None, + num_answers_per_question: Optional[int] = None, ): - """Make match groups for pairwise comparison. - - Args: - questions (list): A list of questions. - model_answers (dict): A dict of model answers. - ref_answers (dict): A dict of reference answers. - judge_default (Judge): A judge for default questions. - judge_math (Judge): A judge for math questions. - baseline_model (Optional[str]): The baseline model. 
- """ + """Make match groups for pairwise comparison.""" + match_groups = {} - for model_1, model_2 in combinations(model_answers, 2): - if baseline_model and baseline_model not in {model_1, model_2}: - continue - matches = [] - for question in questions: - qid = question["question_id"] - answer_1 = model_answers[model_1][qid] - answer_2 = model_answers[model_2][qid] - if question["category"] in NEED_REF_CATS: - judge = judge_math - ref_answer = ref_answers[judge.model][qid] - else: - judge = judge_default + + for question in questions: + qid = question["question_id"] + category = question["category"] + + # Determine if reference answer is needed + if category in NEED_REF_CATS: + judge = judge_math + ref_answer_list = ref_answers[judge.model].get(qid) + if not ref_answer_list: + logger.warning(f"No reference answer for question {qid} in model {judge.model}") ref_answer = None - matches.append( - MatchPair( - question=question, - model_1=model_1, - model_2=model_2, - answer_1=answer_1, - answer_2=answer_2, - judge=judge, - ref_answer=ref_answer, - ) - ) - match_groups[f"pairwise:{model_1}_{model_2}"] = matches - return match_groups + else: + ref_answer = ref_answer_list[0] + else: + judge = judge_default + ref_answer = None + + # Get all models that have answers for this question + available_models = [model for model, answers in model_answers.items() if qid in answers] + + if baseline_model: + if baseline_model not in available_models: + logger.warning(f"Baseline model {baseline_model} does not have an answer for question {qid}. Skipping.") + continue + non_baseline_models = [model for model in available_models if model != baseline_model] + else: + non_baseline_models = available_models + + if num_answers_per_question is not None: + selected_non_baseline_models = non_baseline_models[:num_answers_per_question] + else: + selected_non_baseline_models = non_baseline_models + + if baseline_model: + selected_models = selected_non_baseline_models + [baseline_model] + else: + selected_models = selected_non_baseline_models + + # Generate all unique pairs + for model_1, model_2 in combinations(selected_models, 2): + if baseline_model and (model_1 != baseline_model and model_2 != baseline_model): + # In pairwise-baseline mode, only create pairs with the baseline + continue + + pair_key = f"pairwise:{model_1}_{model_2}" + if pair_key not in match_groups: + match_groups[pair_key] = [] + + answers_1 = model_answers[model_1][qid] + answers_2 = model_answers[model_2][qid] + + if num_answers_per_question is not None: + selected_answers_1 = answers_1[:num_answers_per_question] + selected_answers_2 = answers_2[:num_answers_per_question] + else: + selected_answers_1 = answers_1 + selected_answers_2 = answers_2 + + for ans1 in selected_answers_1: + for ans2 in selected_answers_2: + match = MatchPair( + question=question, + model_1=model_1, + model_2=model_2, + answer_1=ans1, + answer_2=ans2, + judge=judge, + ref_answer=ref_answer, + ) + match_groups[pair_key].append(match) + return match_groups if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -167,6 +215,12 @@ def make_match_groups_pairwise( parser.add_argument( "--verbose", "-v", action="count", default=0, help="Verbosity level" ) + parser.add_argument( + "--num_answers_per_question", + type=int, + default=None, + help="Number of answers to evaluate per question.", + ) args = parser.parse_args() if args.verbose == 0: @@ -206,14 +260,20 @@ def make_match_groups_pairwise( for model in sorted(models): answers = load_model_answers(PREDICTION_DIR / 
model) for question in questions: - assert question["question_id"] in answers + qid = question["question_id"] + if qid not in answers: + logger.error(f"Question ID {qid} missing in model {model} answers.") + raise ValueError(f"Question ID {qid} missing in model {model} answers.") model_answers[model] = answers logger.info("Load reference answers") judge_model = args.judge_model answers = load_model_answers(REFERENCE_DIR / "gpt-4") for question in filter(lambda x: x["category"] in NEED_REF_CATS, questions): - assert question["question_id"] in answers + qid = question["question_id"] + if qid not in answers: + logger.error(f"Reference answer for question ID {qid} missing.") + raise ValueError(f"Reference answer for question ID {qid} missing.") ref_answers = {judge_model: answers} logger.info("Load judge prompts") @@ -227,6 +287,7 @@ def make_match_groups_pairwise( ref_answers=ref_answers, judge_default=Judge(args.judge_model, judge_prompts["single"]), judge_math=Judge(args.judge_model, judge_prompts["single-math"]), + num_answers_per_question=args.num_answers_per_question, ) output_dir = JUDGEMENT_DIR / "single" / args.judge_model else: @@ -242,8 +303,11 @@ def make_match_groups_pairwise( judge_default=Judge(args.judge_model, judge_prompts["pair"]), judge_math=Judge(args.judge_model, judge_prompts["pair-math"]), baseline_model=baseline_model, + num_answers_per_question=args.num_answers_per_question, ) output_dir = JUDGEMENT_DIR / "pairwise" / args.judge_model + + # Filter out existing match_ids if not overwriting target_match_ids = set() for match_id in match_groups: output_file = output_dir / f"{match_id}.jsonl" @@ -278,15 +342,19 @@ def make_match_groups_pairwise( with ThreadPoolExecutor(args.parallel) as executor: futures = [executor.submit(match.play) for match in matches] for future in tqdm(futures): - results.append(future.result()) + try: + result = future.result() + results.append(result) + except Exception as e: + logger.error(f"Error processing match {match_id}: {e}") logger.info(f"Write {len(results)} judgments") output_file.parent.mkdir(parents=True, exist_ok=True) - with open(output_file, "w") as f: + with open(output_file, "w", encoding="utf-8") as f: for result in results: f.write(json.dumps(result, ensure_ascii=False) + "\n") logger.info(f"Saved the judgments to {output_file}") if args.wandb: logger.info("Log to wandb") - upload_results(args.mode, match_id, results, args.baseline_model) + upload_results(args.mode, match_id, results, args.baseline_model) \ No newline at end of file diff --git a/llm_judge/gen_model_answer.py b/llm_judge/gen_model_answer.py index e92de77..3e00e37 100644 --- a/llm_judge/gen_model_answer.py +++ b/llm_judge/gen_model_answer.py @@ -26,7 +26,6 @@ "generic": 0.1, } - def generate_response( input_text, model, tokenizer, generation_config=None, special_token_map=None ): @@ -64,7 +63,6 @@ def generate_response( return output - if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( @@ -79,6 +77,9 @@ def generate_response( parser.add_argument( "--overwrite", action="store_true", help="Overwrite the existing results" ) + parser.add_argument( + "--num_answers_per_question", type=int, default=1, help="Number of answers to generate per question" + ) args = parser.parse_args() if args.verbose == 0: @@ -159,25 +160,26 @@ def generate_response( category = question["category"] generation_config["temperature"] = DEFAULT_TEMPERATURE_MAP[category] - output = generate_response( - input_text=prompt_template.format_map({"instruction": 
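In the rewritten `make_match_groups_pairwise` above, every retained answer of a candidate model is paired with every retained answer of the baseline, so the number of judged pairs grows multiplicatively with `--num_answers_per_question`. A simplified, standalone sketch of that pairing rule for a single question (model names and answer IDs are illustrative):

```python
from itertools import product

def pairwise_baseline_matches(answers_by_model, baseline):
    """Cross each non-baseline model's answers with the baseline's answers for one question."""
    matches = []
    for model, answers in answers_by_model.items():
        if model == baseline:
            continue
        for answer, baseline_answer in product(answers, answers_by_model[baseline]):
            matches.append((model, baseline, answer, baseline_answer))
    return matches

demo = {
    "baseline-model": ["b1", "b2"],
    "candidate-model": ["c1", "c2"],
}
print(len(pairwise_baseline_matches(demo, "baseline-model")))  # -> 4 answer pairs for this question
```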
diff --git a/llm_judge/gen_model_answer.py b/llm_judge/gen_model_answer.py
index e92de77..3e00e37 100644
--- a/llm_judge/gen_model_answer.py
+++ b/llm_judge/gen_model_answer.py
@@ -26,7 +26,6 @@
     "generic": 0.1,
 }
 
-
 def generate_response(
     input_text, model, tokenizer, generation_config=None, special_token_map=None
 ):
@@ -64,7 +63,6 @@ def generate_response(
 
     return output
 
-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
@@ -79,6 +77,9 @@ def generate_response(
     parser.add_argument(
         "--overwrite", action="store_true", help="Overwrite the existing results"
     )
+    parser.add_argument(
+        "--num_answers_per_question", type=int, default=1, help="Number of answers to generate per question"
+    )
     args = parser.parse_args()
 
     if args.verbose == 0:
@@ -159,25 +160,26 @@ def generate_response(
             category = question["category"]
             generation_config["temperature"] = DEFAULT_TEMPERATURE_MAP[category]
 
-        output = generate_response(
-            input_text=prompt_template.format_map({"instruction": instruction}),
-            model=model,
-            tokenizer=tokenizer,
-            generation_config=generation_config,
-            special_token_map=special_token_map,
-        )
-
-        logger.debug(f"{instruction}\n\n{output}")
-
-        results.append(
-            {
-                "question_id": int(question["question_id"]),
-                "answer_id": shortuuid.uuid(),
-                "model_id": model_id,
-                "choices": [{"index": 0, "turns": [output]}],
-                "tstamp": time.time(),
-            }
-        )
+        for _ in range(args.num_answers_per_question):
+            output = generate_response(
+                input_text=prompt_template.format_map({"instruction": instruction}),
+                model=model,
+                tokenizer=tokenizer,
+                generation_config=generation_config,
+                special_token_map=special_token_map,
+            )
+
+            logger.debug(f"{instruction}\n\n{output}")
+
+            results.append(
+                {
+                    "question_id": int(question["question_id"]),
+                    "answer_id": shortuuid.uuid(),
+                    "model_id": model_id,
+                    "choices": [{"index": 0, "turns": [output]}],
+                    "tstamp": time.time(),
+                }
+            )
 
     logger.info("Save the results")
     prediction_dir.mkdir(parents=True, exist_ok=True)
diff --git a/pyproject.toml b/pyproject.toml
index 12f7ebc..2a793de 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,10 +15,10 @@ classifiers = [
 dependencies = [
     "accelerate", "fastapi", "gradio==3.35.2", "httpx", "markdown2[all]", "nh3", "numpy",
     "peft==0.5", "prompt_toolkit>=3.0.0", "pydantic<=2.0", "requests", "rich>=10.0.0", "sentencepiece",
-    "shortuuid", "shortuuid", "tiktoken", "tokenizers>=0.12.1", "torch",
-    "transformers", "uvicorn", "wandb", "openai==0.28.1", "ray", "python-dotenv", "protobuf==3.19",
-    "wandb", "tiktoken"
+    "shortuuid", "tiktoken", "tokenizers>=0.12.1", "torch",
+    "transformers", "uvicorn", "wandb", "openai==1.35.3", "ray", "python-dotenv", "protobuf==3.19"
 ]
 
+
 [tool.setuptools.packages.find]
 exclude = ["*"]
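The dependency bump from `openai==0.28.1` to `openai==1.35.3` in `pyproject.toml` is what drives the client changes in `common.py` and `gen_gpt3.5_answer.py`: the v0 module-level calls map onto v1 client methods roughly as sketched below (model name and prompt are placeholders; this illustrates the 1.x interface, it is not code from the repository):

```python
import os

from openai import OpenAI  # openai>=1.0 exposes client classes instead of module-level configuration

client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    organization=os.getenv("OPENAI_ORGANIZATION"),  # replaces the old `openai.organization = ...`
)

# v0 style: openai.ChatCompletion.create(...)["choices"][0]["message"]["content"]
response = client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "ping"}],
)
print(response.choices[0].message.content)  # v1 responses are typed objects, not dicts
```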