From cda14bd01ef5511c1fb521091013d3fd40262932 Mon Sep 17 00:00:00 2001
From: Hirokazu Kiyomaru
Date: Fri, 9 Feb 2024 10:20:10 +0900
Subject: [PATCH 1/4] load API_TYPE etc.

---
 llm_judge/common.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/llm_judge/common.py b/llm_judge/common.py
index 8bc2c94..c4d97a4 100644
--- a/llm_judge/common.py
+++ b/llm_judge/common.py
@@ -17,6 +17,9 @@
 load_dotenv()
 openai.api_key = os.getenv("OPENAI_API_KEY")
 openai.organization = os.getenv("OPENAI_ORGANIZATION")
+openai.api_type = os.getenv("OPENAI_API_TYPE")
+openai.api_base = os.getenv("OPENAI_API_BASE")
+openai.api_version = os.getenv("OPENAI_API_VERSION")
 
 # Data paths
 JP_BENCH_DIR = Path(__file__).resolve().parent.parent / "data" / "jp_bench"

From dc7e00dbf0dc4419316b79545acb36c611b95d1f Mon Sep 17 00:00:00 2001
From: Hirokazu Kiyomaru
Date: Fri, 9 Feb 2024 10:41:43 +0900
Subject: [PATCH 2/4] add models

---
 llm_judge/common.py       | 14 +++++++++-----
 llm_judge/gen_judgment.py |  2 +-
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/llm_judge/common.py b/llm_judge/common.py
index c4d97a4..d4e8ded 100644
--- a/llm_judge/common.py
+++ b/llm_judge/common.py
@@ -124,10 +124,12 @@ def estimate_cost(self) -> float:
             enc.encode(self.ref_answer["choices"][0]["turns"][0])
         )
         num_output_tokens = 200  # Estimated from a few samples
-        if self.judge.model == "gpt-4":
+        if self.judge.model in {"gpt-4", "gpt-4-0613"}:
             return (0.03 * num_input_tokens + 0.06 * num_output_tokens) / 1_000
+        elif self.judge.model == "gpt-4-1106-preview":
+            return (0.01 * num_input_tokens + 0.03 * num_output_tokens) / 1_000
         elif self.judge.model == "gpt-3.5-turbo":
-            return (0.001 * num_input_tokens + 0.002 * num_output_tokens) / 1_000
+            return (0.0005 * num_input_tokens + 0.0015 * num_output_tokens) / 1_000
         raise AssertionError
 
     @staticmethod
@@ -212,10 +214,12 @@ def estimate_cost(self) -> float:
             enc.encode(self.ref_answer["choices"][0]["turns"][0])
         )
         num_output_tokens = 200  # Estimated from a few samples
-        if self.judge.model == "gpt-4":
-            return 2 * (0.03 * num_input_tokens + 0.06 * num_output_tokens) / 1_000
+        if self.judge.model in {"gpt-4", "gpt-4-0613"}:
+            return (0.03 * num_input_tokens + 0.06 * num_output_tokens) / 1_000
+        elif self.judge.model == "gpt-4-1106-preview":
+            return (0.01 * num_input_tokens + 0.03 * num_output_tokens) / 1_000
         elif self.judge.model == "gpt-3.5-turbo":
-            return 2 * (0.001 * num_input_tokens + 0.002 * num_output_tokens) / 1_000
+            return (0.0005 * num_input_tokens + 0.0015 * num_output_tokens) / 1_000
         raise AssertionError
 
     @staticmethod
diff --git a/llm_judge/gen_judgment.py b/llm_judge/gen_judgment.py
index 9a9de37..9c18766 100644
--- a/llm_judge/gen_judgment.py
+++ b/llm_judge/gen_judgment.py
@@ -133,7 +133,7 @@ def make_match_groups_pairwise(
         "--judge-model",
         type=str,
         default="gpt-4",
-        choices=["gpt-4", "gpt-3.5-turbo"],
+        choices=["gpt-4", "gpt-4-0613", "gpt-4-1106-preview", "gpt-3.5-turbo"],
         help="The judge model.",
     )
     parser.add_argument(

From 94c6c7fdbeb619e06b70cbc55a373c53f7072cda Mon Sep 17 00:00:00 2001
From: Hirokazu Kiyomaru
Date: Fri, 9 Feb 2024 10:51:18 +0900
Subject: [PATCH 3/4] use reference answers generated by gpt-4

---
 llm_judge/gen_judgment.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llm_judge/gen_judgment.py b/llm_judge/gen_judgment.py
index 9c18766..6f6379a 100644
--- a/llm_judge/gen_judgment.py
+++ b/llm_judge/gen_judgment.py
@@ -211,7 +211,7 @@ def make_match_groups_pairwise(
 
     logger.info("Load reference answers")
     judge_model = args.judge_model
-    answers = load_model_answers(REFERENCE_DIR / judge_model)
+    answers = load_model_answers(REFERENCE_DIR / "gpt-4")
     for question in filter(lambda x: x["category"] in NEED_REF_CATS, questions):
         assert question["question_id"] in answers
     ref_answers = {judge_model: answers}

From 46f3a987590280c67db5f578f1018269d55dd77e Mon Sep 17 00:00:00 2001
From: Hirokazu Kiyomaru
Date: Fri, 9 Feb 2024 10:56:21 +0900
Subject: [PATCH 4/4] use engine when api type is azure

---
 llm_judge/common.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/llm_judge/common.py b/llm_judge/common.py
index d4e8ded..789bf12 100644
--- a/llm_judge/common.py
+++ b/llm_judge/common.py
@@ -59,12 +59,16 @@ def judge(self, **kwargs):
         ]
         for _ in range(API_MAX_RETRY):
             try:
-                response = openai.ChatCompletion.create(
-                    model=self.model,
-                    messages=messages,
-                    temperature=0,
-                    max_tokens=2048,
-                )
+                params = {
+                    "messages": messages,
+                    "temperature": 0,
+                    "max_tokens": 2048,
+                }
+                if openai.api_type == "azure":
+                    params["engine"] = self.model
+                else:
+                    params["model"] = self.model
+                response = openai.ChatCompletion.create(**params)
                 return response["choices"][0]["message"]["content"]
             except openai.error.OpenAIError as e:
                 logger.warning(f"OpenAI API error: {e}")