diff --git a/llm_judge/common.py b/llm_judge/common.py
index 8bc2c94..789bf12 100644
--- a/llm_judge/common.py
+++ b/llm_judge/common.py
@@ -17,6 +17,9 @@
 load_dotenv()
 openai.api_key = os.getenv("OPENAI_API_KEY")
 openai.organization = os.getenv("OPENAI_ORGANIZATION")
+openai.api_type = os.getenv("OPENAI_API_TYPE")
+openai.api_base = os.getenv("OPENAI_API_BASE")
+openai.api_version = os.getenv("OPENAI_API_VERSION")
 
 # Data paths
 JP_BENCH_DIR = Path(__file__).resolve().parent.parent / "data" / "jp_bench"
@@ -56,12 +59,16 @@ def judge(self, **kwargs):
         ]
         for _ in range(API_MAX_RETRY):
             try:
-                response = openai.ChatCompletion.create(
-                    model=self.model,
-                    messages=messages,
-                    temperature=0,
-                    max_tokens=2048,
-                )
+                params = {
+                    "messages": messages,
+                    "temperature": 0,
+                    "max_tokens": 2048,
+                }
+                if openai.api_type == "azure":
+                    params["engine"] = self.model
+                else:
+                    params["model"] = self.model
+                response = openai.ChatCompletion.create(**params)
                 return response["choices"][0]["message"]["content"]
             except openai.error.OpenAIError as e:
                 logger.warning(f"OpenAI API error: {e}")
@@ -121,10 +128,12 @@ def estimate_cost(self) -> float:
             enc.encode(self.ref_answer["choices"][0]["turns"][0])
         )
         num_output_tokens = 200  # Estimated from a few samples
-        if self.judge.model == "gpt-4":
+        if self.judge.model in {"gpt-4", "gpt-4-0613"}:
             return (0.03 * num_input_tokens + 0.06 * num_output_tokens) / 1_000
+        elif self.judge.model == "gpt-4-1106-preview":
+            return (0.01 * num_input_tokens + 0.03 * num_output_tokens) / 1_000
         elif self.judge.model == "gpt-3.5-turbo":
-            return (0.001 * num_input_tokens + 0.002 * num_output_tokens) / 1_000
+            return (0.0005 * num_input_tokens + 0.0015 * num_output_tokens) / 1_000
         raise AssertionError
 
     @staticmethod
@@ -209,10 +218,12 @@ def estimate_cost(self) -> float:
             enc.encode(self.ref_answer["choices"][0]["turns"][0])
         )
         num_output_tokens = 200  # Estimated from a few samples
-        if self.judge.model == "gpt-4":
-            return 2 * (0.03 * num_input_tokens + 0.06 * num_output_tokens) / 1_000
+        if self.judge.model in {"gpt-4", "gpt-4-0613"}:
+            return (0.03 * num_input_tokens + 0.06 * num_output_tokens) / 1_000
+        elif self.judge.model == "gpt-4-1106-preview":
+            return (0.01 * num_input_tokens + 0.03 * num_output_tokens) / 1_000
         elif self.judge.model == "gpt-3.5-turbo":
-            return 2 * (0.001 * num_input_tokens + 0.002 * num_output_tokens) / 1_000
+            return (0.0005 * num_input_tokens + 0.0015 * num_output_tokens) / 1_000
         raise AssertionError
 
     @staticmethod
diff --git a/llm_judge/gen_judgment.py b/llm_judge/gen_judgment.py
index 9a9de37..6f6379a 100644
--- a/llm_judge/gen_judgment.py
+++ b/llm_judge/gen_judgment.py
@@ -133,7 +133,7 @@ def make_match_groups_pairwise(
         "--judge-model",
         type=str,
         default="gpt-4",
-        choices=["gpt-4", "gpt-3.5-turbo"],
+        choices=["gpt-4", "gpt-4-0613", "gpt-4-1106-preview", "gpt-3.5-turbo"],
         help="The judge model.",
     )
     parser.add_argument(
@@ -211,7 +211,7 @@ def make_match_groups_pairwise(
 
     logger.info("Load reference answers")
     judge_model = args.judge_model
-    answers = load_model_answers(REFERENCE_DIR / judge_model)
+    answers = load_model_answers(REFERENCE_DIR / "gpt-4")
     for question in filter(lambda x: x["category"] in NEED_REF_CATS, questions):
         assert question["question_id"] in answers
     ref_answers = {judge_model: answers}