From a6a3d26ed406d23ce6542891750c34628fb415ce Mon Sep 17 00:00:00 2001
From: Sh1gechan
Date: Tue, 25 Jun 2024 02:14:02 +0900
Subject: [PATCH 1/6] OpenAI version upgrade (latest version)

---
 configs/cyberagent--calm2-7b-chat.json                      | 3 +--
 ...p--llm-jp-13b-instruct-full-jaster-dolly-oasst-v1.0.json | 1 -
 ...p--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0.json | 1 -
 configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json  | 1 -
 .../rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json   | 1 -
 configs/tokyotech-llm--Swallow-70b-instruct-hf.json         | 1 -
 pyproject.toml                                              | 6 +++---
 7 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/configs/cyberagent--calm2-7b-chat.json b/configs/cyberagent--calm2-7b-chat.json
index fe5a486..356f184 100644
--- a/configs/cyberagent--calm2-7b-chat.json
+++ b/configs/cyberagent--calm2-7b-chat.json
@@ -6,8 +6,7 @@
   "prompt_template": "USER: {instruction}\nASSISTANT: ",
   "generation_config": {
     "do_sample": true,
-    "max_length": 2048,
-    "temperature": 0.8
+    "max_length": 2048
   },
   "special_token_map": {}
 }
diff --git a/configs/llm-jp--llm-jp-13b-instruct-full-jaster-dolly-oasst-v1.0.json b/configs/llm-jp--llm-jp-13b-instruct-full-jaster-dolly-oasst-v1.0.json
index 7db6c30..86d0642 100644
--- a/configs/llm-jp--llm-jp-13b-instruct-full-jaster-dolly-oasst-v1.0.json
+++ b/configs/llm-jp--llm-jp-13b-instruct-full-jaster-dolly-oasst-v1.0.json
@@ -7,7 +7,6 @@
   "generation_config": {
     "do_sample": true,
     "max_length": 2048,
-    "temperature": 0.7,
     "top_p": 0.95
   },
   "special_token_map": {}
diff --git a/configs/llm-jp--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0.json b/configs/llm-jp--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0.json
index 73fac46..c11c1e6 100644
--- a/configs/llm-jp--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0.json
+++ b/configs/llm-jp--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0.json
@@ -7,7 +7,6 @@
   "generation_config": {
     "do_sample": true,
     "max_length": 2048,
-    "temperature": 0.7,
     "top_p": 0.95
   },
   "special_token_map": {}
diff --git a/configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json b/configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json
index 147e6a7..c83e10c 100644
--- a/configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json
+++ b/configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json
@@ -7,7 +7,6 @@
   "generation_config": {
     "do_sample": true,
     "max_length": 2048,
-    "temperature": 0.7,
     "repetition_penalty": 1.1
   },
   "special_token_map": {
diff --git a/configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json b/configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json
index d69e5ab..f5343a2 100644
--- a/configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json
+++ b/configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json
@@ -7,7 +7,6 @@
   "generation_config": {
     "do_sample": true,
     "max_length": 2048,
-    "temperature": 0.7,
     "repetition_penalty": 1.1
   },
   "special_token_map": {
diff --git a/configs/tokyotech-llm--Swallow-70b-instruct-hf.json b/configs/tokyotech-llm--Swallow-70b-instruct-hf.json
index f6b5efa..2fbe0e3 100644
--- a/configs/tokyotech-llm--Swallow-70b-instruct-hf.json
+++ b/configs/tokyotech-llm--Swallow-70b-instruct-hf.json
@@ -7,7 +7,6 @@
   "generation_config": {
     "do_sample": true,
     "max_length": 2048,
-    "temperature": 0.99,
     "top_p": 0.95
   }
  }
diff --git a/pyproject.toml b/pyproject.toml
index 12f7ebc..2a793de 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,10 +15,10 @@ classifiers = [
 dependencies = [
     "accelerate", "fastapi", "gradio==3.35.2", "httpx", "markdown2[all]", "nh3", "numpy",
     "peft==0.5", "prompt_toolkit>=3.0.0", "pydantic<=2.0", "requests", "rich>=10.0.0", "sentencepiece",
-    "shortuuid", "shortuuid", "tiktoken", "tokenizers>=0.12.1", "torch",
-    "transformers", "uvicorn", "wandb", "openai==0.28.1", "ray", "python-dotenv", "protobuf==3.19",
-    "wandb", "tiktoken"
+    "shortuuid", "tiktoken", "tokenizers>=0.12.1", "torch",
+    "transformers", "uvicorn", "wandb", "openai==1.35.3", "ray", "python-dotenv", "protobuf==3.19"
 ]
+
 [tool.setuptools.packages.find]
 exclude = ["*"]
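[Note] The jump from `openai==0.28.1` to `openai==1.35.3` crosses the v1 rewrite of the openai-python SDK, which is why the later patches in this series have to touch every call site. A minimal sketch of the before/after calling convention (the model name and message below are placeholders, not part of this series):

```python
import os

from openai import OpenAI

# openai<1.0 configured the SDK through module-level globals and returned dicts:
#   openai.api_key = os.getenv("OPENAI_API_KEY")
#   response = openai.ChatCompletion.create(model="gpt-4", messages=messages)
#   content = response["choices"][0]["message"]["content"]
# openai>=1.0 uses an explicit client and typed response objects instead.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
messages = [{"role": "user", "content": "Hello"}]
response = client.chat.completions.create(model="gpt-4", messages=messages)
print(response.choices[0].message.content)  # attribute access, not dict keys
```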
"peft==0.5", "prompt_toolkit>=3.0.0", "pydantic<=2.0", "requests", "rich>=10.0.0", "sentencepiece", - "shortuuid", "shortuuid", "tiktoken", "tokenizers>=0.12.1", "torch", - "transformers", "uvicorn", "wandb", "openai==0.28.1", "ray", "python-dotenv", "protobuf==3.19", - "wandb", "tiktoken" + "shortuuid", "tiktoken", "tokenizers>=0.12.1", "torch", + "transformers", "uvicorn", "wandb", "openai==1.35.3", "ray", "python-dotenv", "protobuf==3.19" ] + [tool.setuptools.packages.find] exclude = ["*"] From b2ab6c3d9dbcd175e8cd543e7ebd87bca7c5cecb Mon Sep 17 00:00:00 2001 From: Sh1gechan Date: Tue, 25 Jun 2024 02:31:01 +0900 Subject: [PATCH 2/6] Add --num_answers_per_question option to gen_model_answer.py and gen_judgment.py --- README.md | 9 +++++-- llm_judge/common.py | 24 ++++++++++++------- llm_judge/gen_judgment.py | 16 ++++++++++++- llm_judge/gen_model_answer.py | 44 ++++++++++++++++++----------------- 4 files changed, 61 insertions(+), 32 deletions(-) diff --git a/README.md b/README.md index 854eb92..5d05154 100644 --- a/README.md +++ b/README.md @@ -26,13 +26,16 @@ python llm_judge/gen_model_answer.py --config Arguments & Options: - `` is the path to a configuration file. Examples are in `configs/`. + - `num_answers_per_question` specifies how many to generate (default: all) For example: ```bash -python llm_judge/gen_model_answer.py --config configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json +python llm_judge/gen_model_answer.py --config configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json --num_answers_per_question ``` + + #### Step 2. Generate GPT-4 judgments There are several options to use GPT-4 as a judge, such as pairwise win-rate and single-answer grading. @@ -43,7 +46,8 @@ OPENAI_API_KEY= python llm_judge/gen_judgment.py \ [--baseline-model ] \ [--model-list ] \ [--yes] \ - [--wandb] + [--wandb] \ + [--num_answers_per_question] ``` Arguments & Options: @@ -55,6 +59,7 @@ Arguments & Options: - `--model-list ` is a list of model IDs to be evaluated. If not specified, all models in `data/jp_bench/model_answer` will be evaluated. - `--yes` is a flag to skip the confirmation prompt. - `--wandb` is a flag to enable logging to W&B. You can upload the results later to W&B by running `upload_result.py`, as described in the next section. +- `num_answers_per_question` : Number of answers to evaluate per question **Mode: `pairwise-baseline` (Default)** diff --git a/llm_judge/common.py b/llm_judge/common.py index 789bf12..d783c2d 100644 --- a/llm_judge/common.py +++ b/llm_judge/common.py @@ -9,17 +9,20 @@ from typing import Optional, Union import openai +from openai import AzureOpenAI + +client = AzureOpenAI(api_key=os.getenv("OPENAI_API_KEY"), +api_version=os.getenv("OPENAI_API_VERSION")) import tiktoken from dotenv import load_dotenv logger = logging.getLogger(__name__) load_dotenv() -openai.api_key = os.getenv("OPENAI_API_KEY") -openai.organization = os.getenv("OPENAI_ORGANIZATION") -openai.api_type = os.getenv("OPENAI_API_TYPE") -openai.api_base = os.getenv("OPENAI_API_BASE") -openai.api_version = os.getenv("OPENAI_API_VERSION") +# TODO: The 'openai.organization' option isn't read in the client API. You will need to pass it when you instantiate the client, e.g. 'OpenAI(organization=os.getenv("OPENAI_ORGANIZATION"))' +# openai.organization = os.getenv("OPENAI_ORGANIZATION") +# TODO: The 'openai.api_base' option isn't read in the client API. You will need to pass it when you instantiate the client, e.g. 
From 21de29b6850bbcd9137935b22826244eaf85c9fa Mon Sep 17 00:00:00 2001
From: Sh1gechan
Date: Tue, 25 Jun 2024 02:51:04 +0900
Subject: [PATCH 3/6] Remove configs folder from the repository

---
 configs/README.md                                 | 30 -------------------
 configs/cyberagent--calm2-7b-chat.json            | 12 --------
 ...instruct-full-jaster-dolly-oasst-v1.0.json     | 13 --------
 ...instruct-lora-jaster-dolly-oasst-v1.0.json     | 13 --------
 configs/openai--text-davinci-003.json             | 16 ----------
 ...apanese-gpt-neox-3.6b-instruction-ppo.json     | 15 ---------
 ...nese-gpt-neox-3.6b-instruction-sft-v2.json     | 15 ---------
 ...okyotech-llm--Swallow-70b-instruct-hf.json     | 12 --------
 8 files changed, 126 deletions(-)
 delete mode 100644 configs/README.md
 delete mode 100644 configs/cyberagent--calm2-7b-chat.json
 delete mode 100644 configs/llm-jp--llm-jp-13b-instruct-full-jaster-dolly-oasst-v1.0.json
 delete mode 100644 configs/llm-jp--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0.json
 delete mode 100644 configs/openai--text-davinci-003.json
 delete mode 100644 configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json
 delete mode 100644 configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json
 delete mode 100644 configs/tokyotech-llm--Swallow-70b-instruct-hf.json

diff --git a/configs/README.md b/configs/README.md
deleted file mode 100644
index 889f306..0000000
--- a/configs/README.md
+++ /dev/null
@@ -1,30 +0,0 @@
-# Configuration files
-
-Each configuration file is a JSON file with the following structure:
-
-```json5
-// rinna--japanese-gpt-neox-3.6b-instruction-ppo.json
-{
-  // The ID of the model
-  "model_id": "rinna--japanese-gpt-neox-3.6b-instruction-ppo",
-  // The name of the model
-  "model_name_or_path": "rinna/japanese-gpt-neox-3.6b-instruction-ppo",
-  // The name of the lora model (optional)
-  "lora_model_name_or_path": null,
-  // The name of the tokenizer (optional)
-  "tokenizer_name_or_path": null,
-  // The prompt template
-  "prompt_template": "ユーザー: {instruction}システム: ",
-  // The generation configuration (optional)
-  // NOTE: `temperature` will be set to a default value for each task category if left empty
-  "generation_config": {
-    "do_sample": true,
-    "max_new_tokens": 512,
-    "repetition_penalty": 1.1
-  },
-  // The special token map (optional); this is used to replace special tokens in the output
-  "special_token_map": {
-    "": "\n"
-  }
-}
-```
diff --git a/configs/cyberagent--calm2-7b-chat.json b/configs/cyberagent--calm2-7b-chat.json
deleted file mode 100644
index 356f184..0000000
--- a/configs/cyberagent--calm2-7b-chat.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
-  "model_id": "cyberagent--calm2-7b-chat",
-  "model_name_or_path": "cyberagent/calm2-7b-chat",
-  "lora_model_name_or_path": null,
-  "tokenizer_name_or_path": null,
-  "prompt_template": "USER: {instruction}\nASSISTANT: ",
-  "generation_config": {
-    "do_sample": true,
-    "max_length": 2048
-  },
-  "special_token_map": {}
-}
diff --git a/configs/llm-jp--llm-jp-13b-instruct-full-jaster-dolly-oasst-v1.0.json b/configs/llm-jp--llm-jp-13b-instruct-full-jaster-dolly-oasst-v1.0.json
deleted file mode 100644
index 86d0642..0000000
--- a/configs/llm-jp--llm-jp-13b-instruct-full-jaster-dolly-oasst-v1.0.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-  "model_id": "llm-jp--llm-jp-13b-instruct-full-jaster-dolly-oasst-v1.0",
-  "model_name_or_path": "llm-jp/llm-jp-13b-instruct-full-jaster-dolly-oasst-v1.0",
-  "lora_model_name_or_path": null,
-  "tokenizer_name_or_path": null,
-  "prompt_template": "{instruction} ### 回答:",
-  "generation_config": {
-    "do_sample": true,
-    "max_length": 2048,
-    "top_p": 0.95
-  },
-  "special_token_map": {}
-}
diff --git a/configs/llm-jp--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0.json b/configs/llm-jp--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0.json
deleted file mode 100644
index c11c1e6..0000000
--- a/configs/llm-jp--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0.json
+++ /dev/null
@@ -1,13 +0,0 @@
-{
-  "model_id": "llm-jp--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0",
-  "model_name_or_path": "llm-jp/llm-jp-13b-v1.0",
-  "lora_model_name_or_path": "llm-jp/llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0",
-  "tokenizer_name_or_path": null,
-  "prompt_template": "{instruction} ### 回答:",
-  "generation_config": {
-    "do_sample": true,
-    "max_length": 2048,
-    "top_p": 0.95
-  },
-  "special_token_map": {}
-}
diff --git a/configs/openai--text-davinci-003.json b/configs/openai--text-davinci-003.json
deleted file mode 100644
index a07e3c2..0000000
--- a/configs/openai--text-davinci-003.json
+++ /dev/null
@@ -1,16 +0,0 @@
-{
-  "model_id": "openai--text-davinci-003",
-  "model_name_or_path": null,
-  "lora_model_name_or_path": null,
-  "tokenizer_name_or_path": null,
-  "prompt_template": "{instruction}",
-  "generation_config": {
-    "engine": "text-davinci-003",
-    "temperature": 0.0,
-    "max_tokens": 2048,
-    "top_p": 1.0,
-    "frequency_penalty": 0.0,
-    "presence_penalty": 0.0
-  },
-  "special_token_map": {}
-}
diff --git a/configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json b/configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json
deleted file mode 100644
index c83e10c..0000000
--- a/configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json
+++ /dev/null
@@ -1,15 +0,0 @@
-{
-  "model_id": "rinna--japanese-gpt-neox-3.6b-instruction-ppo",
-  "model_name_or_path": "rinna/japanese-gpt-neox-3.6b-instruction-ppo",
-  "lora_model_name_or_path": null,
-  "tokenizer_name_or_path": null,
-  "prompt_template": "ユーザー: {instruction}システム: ",
-  "generation_config": {
-    "do_sample": true,
-    "max_length": 2048,
-    "repetition_penalty": 1.1
-  },
-  "special_token_map": {
-    "": "\n"
-  }
-}
diff --git a/configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json b/configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json
deleted file mode 100644
index f5343a2..0000000
--- a/configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json
+++ /dev/null
@@ -1,15 +0,0 @@
-{
-  "model_id": "rinna--japanese-gpt-neox-3.6b-instruction-sft-v2",
-  "model_name_or_path": "rinna/japanese-gpt-neox-3.6b-instruction-sft-v2",
-  "lora_model_name_or_path": null,
-  "tokenizer_name_or_path": null,
-  "prompt_template": "ユーザー: {instruction}システム: ",
-  "generation_config": {
-    "do_sample": true,
-    "max_length": 2048,
-    "repetition_penalty": 1.1
-  },
-  "special_token_map": {
-    "": "\n"
-  }
-}
diff --git a/configs/tokyotech-llm--Swallow-70b-instruct-hf.json b/configs/tokyotech-llm--Swallow-70b-instruct-hf.json
deleted file mode 100644
index 2fbe0e3..0000000
--- a/configs/tokyotech-llm--Swallow-70b-instruct-hf.json
+++ /dev/null
@@ -1,12 +0,0 @@
-{
-  "model_id": "tokyotech-llm--Swallow-70b-instruct-hf",
-  "model_name_or_path": "tokyotech-llm/Swallow-70b-instruct-hf",
-  "lora_model_name_or_path": null,
-  "tokenizer_name_or_path": null,
-  "prompt_template": "以下に、あるタスクを説明する指示があります。リクエストを適切に完了するための回答を記述してください。\n\n### 指示:\n{instruction}\n\n### 応答:\n",
-  "generation_config": {
-    "do_sample": true,
-    "max_length": 2048,
-    "top_p": 0.95
-  }
- }
From 01e60431c0135c2086a8dfa5591ba197070ba5ae Mon Sep 17 00:00:00 2001
From: Sh1gechan
Date: Tue, 25 Jun 2024 17:11:46 +0900
Subject: [PATCH 4/6] fix README

---
 README.md | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 5d05154..ef336b7 100644
--- a/README.md
+++ b/README.md
@@ -26,12 +26,12 @@ python llm_judge/gen_model_answer.py --config
 
 Arguments & Options:
 - `` is the path to a configuration file. Examples are in `configs/`.
-  - `num_answers_per_question` specifies how many to generate (default: all)
+  - `--num_answers_per_question` specifies the number of answers to generate per question (default: 1)
 
 For example:
 
 ```bash
-python llm_judge/gen_model_answer.py --config configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json --num_answers_per_question
+python llm_judge/gen_model_answer.py --config configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json
 ```
 
 
@@ -59,7 +59,7 @@ Arguments & Options:
 - `--model-list ` is a list of model IDs to be evaluated. If not specified, all models in `data/jp_bench/model_answer` will be evaluated.
 - `--yes` is a flag to skip the confirmation prompt.
 - `--wandb` is a flag to enable logging to W&B. You can upload the results later to W&B by running `upload_result.py`, as described in the next section.
-- `num_answers_per_question` : Number of answers to evaluate per question
+- `--num_answers_per_question` specifies the number of answers to evaluate per question (default: all)
 
 **Mode: `pairwise-baseline` (Default)**
 
@@ -162,4 +162,3 @@ If you use our code in your research, please cite our work:
   year={2024}
 }
 ```
-
From ec5a5e0d9ff5ddd160336bc0ca584c736b96084a Mon Sep 17 00:00:00 2001
From: Sh1gechan
Date: Thu, 1 Aug 2024 04:11:28 +0900
Subject: [PATCH 5/6] fix common.py

---
 llm_judge/common.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/llm_judge/common.py b/llm_judge/common.py
index d783c2d..9e6c971 100644
--- a/llm_judge/common.py
+++ b/llm_judge/common.py
@@ -127,9 +127,11 @@ def estimate_cost(self) -> float:
             + len(enc.encode(self.judge.prompt_template["prompt_template"]))
         )
         if self.ref_answer:
-            num_input_tokens += len(
-                enc.encode(self.ref_answer["choices"][0]["turns"][0])
-            )
+            if isinstance(self.ref_answer, list):
+                ref_answer_text = self.ref_answer[0]["choices"][0]["turns"][0]
+            else:
+                ref_answer_text = self.ref_answer["choices"][0]["turns"][0]
+            num_input_tokens += len(enc.encode(ref_answer_text))
         num_output_tokens = 200  # Estimated from a few samples
         if self.judge.model in {"gpt-4", "gpt-4-0613"}:
             return (0.03 * num_input_tokens + 0.06 * num_output_tokens) / 1_000
@@ -217,9 +219,11 @@ def estimate_cost(self) -> float:
             + len(enc.encode(self.judge.prompt_template["prompt_template"]))
         )
         if self.ref_answer:
-            num_input_tokens += len(
-                enc.encode(self.ref_answer["choices"][0]["turns"][0])
-            )
+            if isinstance(self.ref_answer, list):
+                ref_answer_text = self.ref_answer[0]["choices"][0]["turns"][0]
+            else:
+                ref_answer_text = self.ref_answer["choices"][0]["turns"][0]
+            num_input_tokens += len(enc.encode(ref_answer_text))
         num_output_tokens = 200  # Estimated from a few samples
         if self.judge.model in {"gpt-4", "gpt-4-0613"}:
             return (0.03 * num_input_tokens + 0.06 * num_output_tokens) / 1_000
回答:", - "generation_config": { - "do_sample": true, - "max_length": 2048, - "top_p": 0.95 - }, - "special_token_map": {} -} diff --git a/configs/openai--text-davinci-003.json b/configs/openai--text-davinci-003.json deleted file mode 100644 index a07e3c2..0000000 --- a/configs/openai--text-davinci-003.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "model_id": "openai--text-davinci-003", - "model_name_or_path": null, - "lora_model_name_or_path": null, - "tokenizer_name_or_path": null, - "prompt_template": "{instruction}", - "generation_config": { - "engine": "text-davinci-003", - "temperature": 0.0, - "max_tokens": 2048, - "top_p": 1.0, - "frequency_penalty": 0.0, - "presence_penalty": 0.0 - }, - "special_token_map": {} -} diff --git a/configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json b/configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json deleted file mode 100644 index c83e10c..0000000 --- a/configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "model_id": "rinna--japanese-gpt-neox-3.6b-instruction-ppo", - "model_name_or_path": "rinna/japanese-gpt-neox-3.6b-instruction-ppo", - "lora_model_name_or_path": null, - "tokenizer_name_or_path": null, - "prompt_template": "ユーザー: {instruction}システム: ", - "generation_config": { - "do_sample": true, - "max_length": 2048, - "repetition_penalty": 1.1 - }, - "special_token_map": { - "": "\n" - } -} diff --git a/configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json b/configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json deleted file mode 100644 index f5343a2..0000000 --- a/configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json +++ /dev/null @@ -1,15 +0,0 @@ -{ - "model_id": "rinna--japanese-gpt-neox-3.6b-instruction-sft-v2", - "model_name_or_path": "rinna/japanese-gpt-neox-3.6b-instruction-sft-v2", - "lora_model_name_or_path": null, - "tokenizer_name_or_path": null, - "prompt_template": "ユーザー: {instruction}システム: ", - "generation_config": { - "do_sample": true, - "max_length": 2048, - "repetition_penalty": 1.1 - }, - "special_token_map": { - "": "\n" - } -} diff --git a/configs/tokyotech-llm--Swallow-70b-instruct-hf.json b/configs/tokyotech-llm--Swallow-70b-instruct-hf.json deleted file mode 100644 index 2fbe0e3..0000000 --- a/configs/tokyotech-llm--Swallow-70b-instruct-hf.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "model_id": "tokyotech-llm--Swallow-70b-instruct-hf", - "model_name_or_path": "tokyotech-llm/Swallow-70b-instruct-hf", - "lora_model_name_or_path": null, - "tokenizer_name_or_path": null, - "prompt_template": "以下に、あるタスクを説明する指示があります。リクエストを適切に完了するための回答を記述してください。\n\n### 指示:\n{instruction}\n\n### 応答:\n", - "generation_config": { - "do_sample": true, - "max_length": 2048, - "top_p": 0.95 - } - } From 01e60431c0135c2086a8dfa5591ba197070ba5ae Mon Sep 17 00:00:00 2001 From: Sh1gechan Date: Tue, 25 Jun 2024 17:11:46 +0900 Subject: [PATCH 4/6] fix READEME --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 5d05154..ef336b7 100644 --- a/README.md +++ b/README.md @@ -26,12 +26,12 @@ python llm_judge/gen_model_answer.py --config Arguments & Options: - `` is the path to a configuration file. Examples are in `configs/`. 
diff --git a/llm_judge/gen_gpt3.5_answer.py b/llm_judge/gen_gpt3.5_answer.py
index a222111..2f8a4fe 100644
--- a/llm_judge/gen_gpt3.5_answer.py
+++ b/llm_judge/gen_gpt3.5_answer.py
@@ -5,6 +5,9 @@
 import time
 
 import openai
+from openai import OpenAI
+
+client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
 import shortuuid
 from common import PREDICTION_DIR, QUESTION_FILE, load_questions
 from dotenv import load_dotenv
@@ -13,8 +16,8 @@
 logger = logging.getLogger(__name__)
 
 load_dotenv()
-openai.api_key = os.getenv("OPENAI_API_KEY")
-openai.organization = os.getenv("OPENAI_ORGANIZATION")
+# TODO: The 'openai.organization' option isn't read in the client API. You will need to pass it when you instantiate the client, e.g. 'OpenAI(organization=os.getenv("OPENAI_ORGANIZATION"))'
+# openai.organization = os.getenv("OPENAI_ORGANIZATION")
 
 
 def generate_response(input_text, generation_config) -> str:
@@ -24,7 +27,7 @@ def generate_response(input_text, generation_config) -> str:
     Args:
         input_text: The input text.
         generation_config: The config for the generation.
    """
-    response = openai.Completion.create(prompt=input_text, **generation_config)
+    response = client.completions.create(prompt=input_text, **generation_config)
     return response.choices[0].text
 
self.ref_answer["choices"][0]["turns"][0] - num_input_tokens += len(enc.encode(ref_answer_text)) + num_input_tokens += len( + enc.encode(self.ref_answer["choices"][0]["turns"][0]) + ) num_output_tokens = 200 # Estimated from a few samples if self.judge.model in {"gpt-4", "gpt-4-0613"}: return (0.03 * num_input_tokens + 0.06 * num_output_tokens) / 1_000 @@ -152,7 +150,6 @@ def get_score(judgment: str) -> int: return ast.literal_eval(match.groups()[0]) return -1 - @dataclasses.dataclass class MatchPair: question: dict @@ -219,11 +216,9 @@ def estimate_cost(self) -> float: + len(enc.encode(self.judge.prompt_template["prompt_template"])) ) if self.ref_answer: - if isinstance(self.ref_answer, list): - ref_answer_text = self.ref_answer[0]["choices"][0]["turns"][0] - else: - ref_answer_text = self.ref_answer["choices"][0]["turns"][0] - num_input_tokens += len(enc.encode(ref_answer_text)) + num_input_tokens += len( + enc.encode(self.ref_answer["choices"][0]["turns"][0]) + ) num_output_tokens = 200 # Estimated from a few samples if self.judge.model in {"gpt-4", "gpt-4-0613"}: return (0.03 * num_input_tokens + 0.06 * num_output_tokens) / 1_000 @@ -263,6 +258,19 @@ def get_model_list(answer_dir: Union[str, Path]): return [path.name for path in Path(answer_dir).iterdir()] +# def load_model_answers(answer_dir: Union[str, Path]): +# """Load model answers. + +# Args: +# answer_dir (Union[str, Path]): The answer directory. +# """ +# answers = {} +# with open(Path(answer_dir) / "results.jsonl", "r") as fin: +# for line in fin: +# answer = json.loads(line) +# answers[answer["question_id"]] = answer +# return answers + def load_model_answers(answer_dir: Union[str, Path]): """Load model answers. @@ -273,7 +281,10 @@ def load_model_answers(answer_dir: Union[str, Path]): with open(Path(answer_dir) / "results.jsonl", "r") as fin: for line in fin: answer = json.loads(line) - answers[answer["question_id"]] = answer + qid = answer["question_id"] + if qid not in answers: + answers[qid] = [] + answers[qid].append(answer) return answers @@ -369,9 +380,4 @@ def filter_pairwise_judgements( filtered_result_id_results_map[result_id] = results else: filtered_result_id_results_map[result_id] = results - return filtered_result_id_results_map - - - - - + return filtered_result_id_results_map \ No newline at end of file diff --git a/llm_judge/gen_gpt3.5_answer.py b/llm_judge/gen_gpt3.5_answer.py index a222111..2f8a4fe 100644 --- a/llm_judge/gen_gpt3.5_answer.py +++ b/llm_judge/gen_gpt3.5_answer.py @@ -5,6 +5,9 @@ import time import openai +from openai import OpenAI + +client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) import shortuuid from common import PREDICTION_DIR, QUESTION_FILE, load_questions from dotenv import load_dotenv @@ -13,8 +16,8 @@ logger = logging.getLogger(__name__) load_dotenv() -openai.api_key = os.getenv("OPENAI_API_KEY") -openai.organization = os.getenv("OPENAI_ORGANIZATION") +# TODO: The 'openai.organization' option isn't read in the client API. You will need to pass it when you instantiate the client, e.g. 'OpenAI(organization=os.getenv("OPENAI_ORGANIZATION"))' +# openai.organization = os.getenv("OPENAI_ORGANIZATION") def generate_response(input_text, generation_config) -> str: @@ -24,7 +27,7 @@ def generate_response(input_text, generation_config) -> str: input_text: The input text. generation_config: The config for the generation. 
""" - response = openai.Completion.create(prompt=input_text, **generation_config) + response = client.completions.create(prompt=input_text, **generation_config) return response.choices[0].text diff --git a/llm_judge/gen_judgment.py b/llm_judge/gen_judgment.py index 51688fa..4e44f51 100644 --- a/llm_judge/gen_judgment.py +++ b/llm_judge/gen_judgment.py @@ -3,7 +3,7 @@ import logging from concurrent.futures import ThreadPoolExecutor from itertools import combinations -from typing import Optional +from typing import Optional, Dict, List from common import ( JUDGEMENT_DIR, @@ -27,101 +27,141 @@ def make_match_groups_single( - questions: list[dict], - model_answers: dict[str, dict[int, dict]], - ref_answers: dict[str, dict[int, dict]], + questions: List[dict], + model_answers: Dict[str, Dict[int, List[dict]]], + ref_answers: Dict[str, Dict[int, List[dict]]], judge_default: Judge, judge_math: Judge, num_answers_per_question: Optional[int] = None, ): - """Make match groups for single answer grading. - - Args: - questions (list): A list of questions. - model_answers (dict): A dict of model answers. - ref_answers (dict): A dict of reference answers. - judge_default (Judge): A judge for default questions. - judge_math (Judge): A judge for math questions. - num_answers_per_question (Optional[int]): Number of answers to evaluate per question. - """ - match_groups = {} - for model in model_answers: - matches = [] - for question in questions: - qid = question["question_id"] - answer = model_answers[model][qid] - if question["category"] in NEED_REF_CATS: - judge = judge_math - ref_answer = ref_answers[judge.model][qid] - else: - judge = judge_default + """Make match groups for single answer grading.""" + + match_groups = {model: [] for model in model_answers} + + for question in questions: + qid = question["question_id"] + category = question["category"] + + # Determine if reference answer is needed + if category in NEED_REF_CATS: + judge = judge_math + ref_answer_list = ref_answers[judge.model].get(qid) + if not ref_answer_list: + logger.warning(f"No reference answer for question {qid} in model {judge.model}") ref_answer = None - matches.append( - MatchSingle( + else: + ref_answer = ref_answer_list[0] + else: + judge = judge_default + ref_answer = None + # Get all models that have answers for this question + available_models = [model for model, answers in model_answers.items() if qid in answers] + + for model in available_models: + answers = model_answers[model][qid] + if num_answers_per_question is not None: + selected_answers = answers[:num_answers_per_question] + else: + selected_answers = answers + + for answer in selected_answers: + match = MatchSingle( question=question, model=model, answer=answer, judge=judge, ref_answer=ref_answer, ) - ) - if num_answers_per_question: - matches = matches[:num_answers_per_question] - match_groups[f"single:{model}"] = matches + match_groups[model].append(match) + return match_groups def make_match_groups_pairwise( - questions: list[dict], - model_answers: dict[str, dict[int, dict]], - ref_answers: dict[str, dict[int, dict]], + questions: List[dict], + model_answers: Dict[str, Dict[int, List[dict]]], + ref_answers: Dict[str, Dict[int, List[dict]]], judge_default: Judge, judge_math: Judge, baseline_model: Optional[str] = None, num_answers_per_question: Optional[int] = None, ): - """Make match groups for pairwise comparison. - - Args: - questions (list): A list of questions. - model_answers (dict): A dict of model answers. 
-        ref_answers (dict): A dict of reference answers.
-        judge_default (Judge): A judge for default questions.
-        judge_math (Judge): A judge for math questions.
-        baseline_model (Optional[str]): The baseline model.
-        num_answers_per_question (Optional[int]): Number of answers to evaluate per question.
-    """
+    """Make match groups for pairwise comparison."""
+
     match_groups = {}
-    for model_1, model_2 in combinations(model_answers, 2):
-        if baseline_model and baseline_model not in {model_1, model_2}:
-            continue
-        matches = []
-        for question in questions:
-            qid = question["question_id"]
-            answer_1 = model_answers[model_1][qid]
-            answer_2 = model_answers[model_2][qid]
-            if question["category"] in NEED_REF_CATS:
-                judge = judge_math
-                ref_answer = ref_answers[judge.model][qid]
-            else:
-                judge = judge_default
+
+    for question in questions:
+        qid = question["question_id"]
+        category = question["category"]
+
+        # Determine if reference answer is needed
+        if category in NEED_REF_CATS:
+            judge = judge_math
+            ref_answer_list = ref_answers[judge.model].get(qid)
+            if not ref_answer_list:
+                logger.warning(f"No reference answer for question {qid} in model {judge.model}")
                 ref_answer = None
-            matches.append(
-                MatchPair(
-                    question=question,
-                    model_1=model_1,
-                    model_2=model_2,
-                    answer_1=answer_1,
-                    answer_2=answer_2,
-                    judge=judge,
-                    ref_answer=ref_answer,
-                )
-            )
-        if num_answers_per_question:
-            matches = matches[:num_answers_per_question]
-        match_groups[f"pairwise:{model_1}_{model_2}"] = matches
-    return match_groups
+            else:
+                ref_answer = ref_answer_list[0]
+        else:
+            judge = judge_default
+            ref_answer = None
+
+        # Get all models that have answers for this question
+        available_models = [model for model, answers in model_answers.items() if qid in answers]
+
+        if baseline_model:
+            if baseline_model not in available_models:
Skipping.") + continue + non_baseline_models = [model for model in available_models if model != baseline_model] + else: + non_baseline_models = available_models + + if num_answers_per_question is not None: + selected_non_baseline_models = non_baseline_models[:num_answers_per_question] + else: + selected_non_baseline_models = non_baseline_models + + if baseline_model: + selected_models = selected_non_baseline_models + [baseline_model] + else: + selected_models = selected_non_baseline_models + # Generate all unique pairs + for model_1, model_2 in combinations(selected_models, 2): + if baseline_model and (model_1 != baseline_model and model_2 != baseline_model): + # In pairwise-baseline mode, only create pairs with the baseline + continue + + pair_key = f"pairwise:{model_1}_{model_2}" + if pair_key not in match_groups: + match_groups[pair_key] = [] + + answers_1 = model_answers[model_1][qid] + answers_2 = model_answers[model_2][qid] + + if num_answers_per_question is not None: + selected_answers_1 = answers_1[:num_answers_per_question] + selected_answers_2 = answers_2[:num_answers_per_question] + else: + selected_answers_1 = answers_1 + selected_answers_2 = answers_2 + + for ans1 in selected_answers_1: + for ans2 in selected_answers_2: + match = MatchPair( + question=question, + model_1=model_1, + model_2=model_2, + answer_1=ans1, + answer_2=ans2, + judge=judge, + ref_answer=ref_answer, + ) + match_groups[pair_key].append(match) + + return match_groups if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -140,7 +180,7 @@ def make_match_groups_pairwise( parser.add_argument( "--judge-model", type=str, - default="gpt-4-0613", + default="gpt-4", choices=["gpt-4", "gpt-4-0613", "gpt-4-1106-preview", "gpt-3.5-turbo"], help="The judge model.", ) @@ -176,7 +216,10 @@ def make_match_groups_pairwise( "--verbose", "-v", action="count", default=0, help="Verbosity level" ) parser.add_argument( - "--num_answers_per_question", type=int, default=None, help="Number of answers to evaluate per question." 
+        "--num_answers_per_question",
+        type=int,
+        default=None,
+        help="Number of answers to evaluate per question.",
     )
 
     args = parser.parse_args()
@@ -220,14 +263,20 @@
     for model in sorted(models):
         answers = load_model_answers(PREDICTION_DIR / model)
         for question in questions:
-            assert question["question_id"] in answers
+            qid = question["question_id"]
+            if qid not in answers:
+                logger.error(f"Question ID {qid} missing in model {model} answers.")
+                raise ValueError(f"Question ID {qid} missing in model {model} answers.")
         model_answers[model] = answers
 
     logger.info("Load reference answers")
     judge_model = args.judge_model
     answers = load_model_answers(REFERENCE_DIR / "gpt-4")
     for question in filter(lambda x: x["category"] in NEED_REF_CATS, questions):
-        assert question["question_id"] in answers
+        qid = question["question_id"]
+        if qid not in answers:
+            logger.error(f"Reference answer for question ID {qid} missing.")
+            raise ValueError(f"Reference answer for question ID {qid} missing.")
     ref_answers = {judge_model: answers}
 
     logger.info("Load judge prompts")
@@ -266,6 +315,8 @@
             num_answers_per_question=args.num_answers_per_question,
         )
         output_dir = JUDGEMENT_DIR / "pairwise" / args.judge_model
+
+    # Filter out existing match_ids if not overwriting
     target_match_ids = set()
     for match_id in match_groups:
         output_file = output_dir / f"{match_id}.jsonl"
@@ -303,16 +354,19 @@
     with ThreadPoolExecutor(args.parallel) as executor:
         futures = [executor.submit(match.play) for match in matches]
         for future in tqdm(futures):
-            results.append(future.result())
+            try:
+                result = future.result()
+                results.append(result)
+            except Exception as e:
+                logger.error(f"Error processing match {match_id}: {e}")
 
     logger.info(f"Write {len(results)} judgments")
     output_file.parent.mkdir(parents=True, exist_ok=True)
-    with open(output_file, "w") as f:
+    with open(output_file, "w", encoding="utf-8") as f:
         for result in results:
             f.write(json.dumps(result, ensure_ascii=False) + "\n")
     logger.info(f"Saved the judgments to {output_file}")
 
     if args.wandb:
         logger.info("Log to wandb")
-        upload_results(args.mode, match_id, results, args.baseline_model)
-
+        upload_results(args.mode, match_id, results, args.baseline_model)
\ No newline at end of file
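[Note] Both `estimate_cost` implementations shown in patches 5 and 6 price gpt-4 at 0.03 USD per 1K input tokens and 0.06 USD per 1K output tokens, with a fixed 200-token output estimate. A worked example of that formula:

```python
# Hypothetical input size; the real count comes from tiktoken over the
# question, answer(s), prompt template, and optional reference answer.
num_input_tokens = 1_500
num_output_tokens = 200  # fixed estimate used by estimate_cost()
cost = (0.03 * num_input_tokens + 0.06 * num_output_tokens) / 1_000
print(f"{cost:.4f} USD per judgment")  # -> 0.0570 USD per judgment
```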