diff --git a/README.md b/README.md
index 854eb92..ef336b7 100644
--- a/README.md
+++ b/README.md
@@ -26,6 +26,7 @@ python llm_judge/gen_model_answer.py --config <config-path>
 
 Arguments & Options:
 - `<config-path>` is the path to a configuration file. Examples are in `configs/`.
+- `--num_answers_per_question <N>` specifies the number of answers to generate per question (default: 1).
 
 For example:
 
@@ -33,6 +34,8 @@ For example:
 python llm_judge/gen_model_answer.py --config configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json
 ```
 
+
+
 #### Step 2. Generate GPT-4 judgments
 
 There are several options to use GPT-4 as a judge, such as pairwise win-rate and single-answer grading.
@@ -43,7 +46,8 @@ OPENAI_API_KEY=<OPENAI-API-KEY> python llm_judge/gen_judgment.py \
     [--baseline-model <MODEL-ID>] \
     [--model-list <LIST-OF-MODEL-IDS>] \
     [--yes] \
-    [--wandb]
+    [--wandb] \
+    [--num_answers_per_question <N>]
 ```
 
 Arguments & Options:
@@ -55,6 +59,7 @@ Arguments & Options:
 - `--model-list <LIST-OF-MODEL-IDS>` is a list of model IDs to be evaluated. If not specified, all models in `data/jp_bench/model_answer` will be evaluated.
 - `--yes` is a flag to skip the confirmation prompt.
 - `--wandb` is a flag to enable logging to W&B. You can upload the results later to W&B by running `upload_result.py`, as described in the next section.
+- `--num_answers_per_question <N>` specifies the number of answers to evaluate per question (default: all available answers).
 
 **Mode: `pairwise-baseline` (Default)**
 
@@ -157,4 +162,3 @@ If you use our code in your research, please cite our work:
   year={2024}
 }
 ```
-
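With `--num_answers_per_question <N>` set above 1, `gen_model_answer.py` simply writes several records with the same `question_id` into `data/jp_bench/model_answer/<model-id>/results.jsonl`. A minimal sketch of that layout (the helper and the sample records below are illustrative, not part of the repository):

```python
import json
from collections import defaultdict

def group_answers(jsonl_lines):
    """Group results.jsonl records by question_id, mirroring load_model_answers in llm_judge/common.py."""
    grouped = defaultdict(list)
    for line in jsonl_lines:
        record = json.loads(line)
        grouped[record["question_id"]].append(record)
    return grouped

# Two answers generated for the same question, e.g. with --num_answers_per_question 2.
sample = [
    '{"question_id": 1, "answer_id": "a1", "model_id": "demo-model"}',
    '{"question_id": 1, "answer_id": "a2", "model_id": "demo-model"}',
]
print({qid: len(records) for qid, records in group_answers(sample).items()})  # -> {1: 2}
```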
"model_name_or_path": "llm-jp/llm-jp-13b-instruct-full-jaster-dolly-oasst-v1.0", - "lora_model_name_or_path": null, - "tokenizer_name_or_path": null, - "prompt_template": "{instruction} ### 回答:", - "generation_config": { - "do_sample": true, - "max_length": 2048, - "temperature": 0.7, - "top_p": 0.95 - }, - "special_token_map": {} -} diff --git a/configs/llm-jp--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0.json b/configs/llm-jp--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0.json deleted file mode 100644 index 73fac46..0000000 --- a/configs/llm-jp--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "model_id": "llm-jp--llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0", - "model_name_or_path": "llm-jp/llm-jp-13b-v1.0", - "lora_model_name_or_path": "llm-jp/llm-jp-13b-instruct-lora-jaster-dolly-oasst-v1.0", - "tokenizer_name_or_path": null, - "prompt_template": "{instruction} ### 回答:", - "generation_config": { - "do_sample": true, - "max_length": 2048, - "temperature": 0.7, - "top_p": 0.95 - }, - "special_token_map": {} -} diff --git a/configs/openai--text-davinci-003.json b/configs/openai--text-davinci-003.json deleted file mode 100644 index a07e3c2..0000000 --- a/configs/openai--text-davinci-003.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "model_id": "openai--text-davinci-003", - "model_name_or_path": null, - "lora_model_name_or_path": null, - "tokenizer_name_or_path": null, - "prompt_template": "{instruction}", - "generation_config": { - "engine": "text-davinci-003", - "temperature": 0.0, - "max_tokens": 2048, - "top_p": 1.0, - "frequency_penalty": 0.0, - "presence_penalty": 0.0 - }, - "special_token_map": {} -} diff --git a/configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json b/configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json deleted file mode 100644 index 147e6a7..0000000 --- a/configs/rinna--japanese-gpt-neox-3.6b-instruction-ppo.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "model_id": "rinna--japanese-gpt-neox-3.6b-instruction-ppo", - "model_name_or_path": "rinna/japanese-gpt-neox-3.6b-instruction-ppo", - "lora_model_name_or_path": null, - "tokenizer_name_or_path": null, - "prompt_template": "ユーザー: {instruction}システム: ", - "generation_config": { - "do_sample": true, - "max_length": 2048, - "temperature": 0.7, - "repetition_penalty": 1.1 - }, - "special_token_map": { - "": "\n" - } -} diff --git a/configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json b/configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json deleted file mode 100644 index d69e5ab..0000000 --- a/configs/rinna--japanese-gpt-neox-3.6b-instruction-sft-v2.json +++ /dev/null @@ -1,16 +0,0 @@ -{ - "model_id": "rinna--japanese-gpt-neox-3.6b-instruction-sft-v2", - "model_name_or_path": "rinna/japanese-gpt-neox-3.6b-instruction-sft-v2", - "lora_model_name_or_path": null, - "tokenizer_name_or_path": null, - "prompt_template": "ユーザー: {instruction}システム: ", - "generation_config": { - "do_sample": true, - "max_length": 2048, - "temperature": 0.7, - "repetition_penalty": 1.1 - }, - "special_token_map": { - "": "\n" - } -} diff --git a/configs/tokyotech-llm--Swallow-70b-instruct-hf.json b/configs/tokyotech-llm--Swallow-70b-instruct-hf.json deleted file mode 100644 index f6b5efa..0000000 --- a/configs/tokyotech-llm--Swallow-70b-instruct-hf.json +++ /dev/null @@ -1,13 +0,0 @@ -{ - "model_id": "tokyotech-llm--Swallow-70b-instruct-hf", - "model_name_or_path": "tokyotech-llm/Swallow-70b-instruct-hf", - "lora_model_name_or_path": null, - "tokenizer_name_or_path": null, - 
"prompt_template": "以下に、あるタスクを説明する指示があります。リクエストを適切に完了するための回答を記述してください。\n\n### 指示:\n{instruction}\n\n### 応答:\n", - "generation_config": { - "do_sample": true, - "max_length": 2048, - "temperature": 0.99, - "top_p": 0.95 - } - } diff --git a/llm_judge/common.py b/llm_judge/common.py index 789bf12..e5b6db8 100644 --- a/llm_judge/common.py +++ b/llm_judge/common.py @@ -9,17 +9,20 @@ from typing import Optional, Union import openai +from openai import AzureOpenAI + +client = AzureOpenAI(api_key=os.getenv("OPENAI_API_KEY"), +api_version=os.getenv("OPENAI_API_VERSION")) import tiktoken from dotenv import load_dotenv logger = logging.getLogger(__name__) load_dotenv() -openai.api_key = os.getenv("OPENAI_API_KEY") -openai.organization = os.getenv("OPENAI_ORGANIZATION") -openai.api_type = os.getenv("OPENAI_API_TYPE") -openai.api_base = os.getenv("OPENAI_API_BASE") -openai.api_version = os.getenv("OPENAI_API_VERSION") +# TODO: The 'openai.organization' option isn't read in the client API. You will need to pass it when you instantiate the client, e.g. 'OpenAI(organization=os.getenv("OPENAI_ORGANIZATION"))' +# openai.organization = os.getenv("OPENAI_ORGANIZATION") +# TODO: The 'openai.api_base' option isn't read in the client API. You will need to pass it when you instantiate the client, e.g. 'OpenAI(base_url=os.getenv("OPENAI_API_BASE"))' +# openai.api_base = os.getenv("OPENAI_API_BASE") # Data paths JP_BENCH_DIR = Path(__file__).resolve().parent.parent / "data" / "jp_bench" @@ -68,9 +71,9 @@ def judge(self, **kwargs): params["engine"] = self.model else: params["model"] = self.model - response = openai.ChatCompletion.create(**params) - return response["choices"][0]["message"]["content"] - except openai.error.OpenAIError as e: + response = client.chat.completions.create(**params) + return response.choices[0].message.content + except openai.OpenAIError as e: logger.warning(f"OpenAI API error: {e}") time.sleep(API_RETRY_SLEEP) @@ -147,7 +150,6 @@ def get_score(judgment: str) -> int: return ast.literal_eval(match.groups()[0]) return -1 - @dataclasses.dataclass class MatchPair: question: dict @@ -256,6 +258,19 @@ def get_model_list(answer_dir: Union[str, Path]): return [path.name for path in Path(answer_dir).iterdir()] +# def load_model_answers(answer_dir: Union[str, Path]): +# """Load model answers. + +# Args: +# answer_dir (Union[str, Path]): The answer directory. +# """ +# answers = {} +# with open(Path(answer_dir) / "results.jsonl", "r") as fin: +# for line in fin: +# answer = json.loads(line) +# answers[answer["question_id"]] = answer +# return answers + def load_model_answers(answer_dir: Union[str, Path]): """Load model answers. 
@@ -266,7 +281,10 @@ def load_model_answers(answer_dir: Union[str, Path]): with open(Path(answer_dir) / "results.jsonl", "r") as fin: for line in fin: answer = json.loads(line) - answers[answer["question_id"]] = answer + qid = answer["question_id"] + if qid not in answers: + answers[qid] = [] + answers[qid].append(answer) return answers @@ -362,4 +380,4 @@ def filter_pairwise_judgements( filtered_result_id_results_map[result_id] = results else: filtered_result_id_results_map[result_id] = results - return filtered_result_id_results_map + return filtered_result_id_results_map \ No newline at end of file diff --git a/llm_judge/gen_gpt3.5_answer.py b/llm_judge/gen_gpt3.5_answer.py index a222111..2f8a4fe 100644 --- a/llm_judge/gen_gpt3.5_answer.py +++ b/llm_judge/gen_gpt3.5_answer.py @@ -5,6 +5,9 @@ import time import openai +from openai import OpenAI + +client = OpenAI(api_key=os.getenv("OPENAI_API_KEY")) import shortuuid from common import PREDICTION_DIR, QUESTION_FILE, load_questions from dotenv import load_dotenv @@ -13,8 +16,8 @@ logger = logging.getLogger(__name__) load_dotenv() -openai.api_key = os.getenv("OPENAI_API_KEY") -openai.organization = os.getenv("OPENAI_ORGANIZATION") +# TODO: The 'openai.organization' option isn't read in the client API. You will need to pass it when you instantiate the client, e.g. 'OpenAI(organization=os.getenv("OPENAI_ORGANIZATION"))' +# openai.organization = os.getenv("OPENAI_ORGANIZATION") def generate_response(input_text, generation_config) -> str: @@ -24,7 +27,7 @@ def generate_response(input_text, generation_config) -> str: input_text: The input text. generation_config: The config for the generation. """ - response = openai.Completion.create(prompt=input_text, **generation_config) + response = client.completions.create(prompt=input_text, **generation_config) return response.choices[0].text diff --git a/llm_judge/gen_judgment.py b/llm_judge/gen_judgment.py index 6f6379a..4e44f51 100644 --- a/llm_judge/gen_judgment.py +++ b/llm_judge/gen_judgment.py @@ -3,7 +3,7 @@ import logging from concurrent.futures import ThreadPoolExecutor from itertools import combinations -from typing import Optional +from typing import Optional, Dict, List from common import ( JUDGEMENT_DIR, @@ -27,93 +27,141 @@ def make_match_groups_single( - questions: list[dict], - model_answers: dict[str, dict[int, dict]], - ref_answers: dict[str, dict[int, dict]], + questions: List[dict], + model_answers: Dict[str, Dict[int, List[dict]]], + ref_answers: Dict[str, Dict[int, List[dict]]], judge_default: Judge, judge_math: Judge, + num_answers_per_question: Optional[int] = None, ): - """Make match groups for single answer grading. - - Args: - questions (list): A list of questions. - model_answers (dict): A dict of model answers. - ref_answers (dict): A dict of reference answers. - judge_default (Judge): A judge for default questions. - judge_math (Judge): A judge for math questions. 
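The `load_model_answers` hunk above changes the loader's contract: each `question_id` now maps to a list of answer records rather than a single record, so callers must pick an element (as `gen_judgment.py` does with `ref_answer_list[0]`). A small sketch of the new shape (the type alias and sample data are illustrative, not from the repository):

```python
from typing import Dict, List

# New shape: question_id -> list of answer records (previously question_id -> one record).
ModelAnswers = Dict[int, List[dict]]

def first_answer(answers: ModelAnswers, question_id: int) -> dict:
    """Callers that previously wrote `answers[qid]` now select one record, e.g. the first."""
    return answers[question_id][0]

answers: ModelAnswers = {1: [{"question_id": 1, "answer_id": "a1"}, {"question_id": 1, "answer_id": "a2"}]}
print(first_answer(answers, 1)["answer_id"])  # -> "a1"
```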
- """ - match_groups = {} - for model in model_answers: - matches = [] - for question in questions: - qid = question["question_id"] - answer = model_answers[model][qid] - if question["category"] in NEED_REF_CATS: - judge = judge_math - ref_answer = ref_answers[judge.model][qid] - else: - judge = judge_default + """Make match groups for single answer grading.""" + + match_groups = {model: [] for model in model_answers} + + for question in questions: + qid = question["question_id"] + category = question["category"] + + # Determine if reference answer is needed + if category in NEED_REF_CATS: + judge = judge_math + ref_answer_list = ref_answers[judge.model].get(qid) + if not ref_answer_list: + logger.warning(f"No reference answer for question {qid} in model {judge.model}") ref_answer = None - matches.append( - MatchSingle( + else: + ref_answer = ref_answer_list[0] + else: + judge = judge_default + ref_answer = None + # Get all models that have answers for this question + available_models = [model for model, answers in model_answers.items() if qid in answers] + + for model in available_models: + answers = model_answers[model][qid] + if num_answers_per_question is not None: + selected_answers = answers[:num_answers_per_question] + else: + selected_answers = answers + + for answer in selected_answers: + match = MatchSingle( question=question, model=model, answer=answer, judge=judge, ref_answer=ref_answer, ) - ) - match_groups[f"single:{model}"] = matches + match_groups[model].append(match) + return match_groups def make_match_groups_pairwise( - questions: list[dict], - model_answers: dict[str, dict[int, dict]], - ref_answers: dict[str, dict[int, dict]], + questions: List[dict], + model_answers: Dict[str, Dict[int, List[dict]]], + ref_answers: Dict[str, Dict[int, List[dict]]], judge_default: Judge, judge_math: Judge, baseline_model: Optional[str] = None, + num_answers_per_question: Optional[int] = None, ): - """Make match groups for pairwise comparison. - - Args: - questions (list): A list of questions. - model_answers (dict): A dict of model answers. - ref_answers (dict): A dict of reference answers. - judge_default (Judge): A judge for default questions. - judge_math (Judge): A judge for math questions. - baseline_model (Optional[str]): The baseline model. 
- """ + """Make match groups for pairwise comparison.""" + match_groups = {} - for model_1, model_2 in combinations(model_answers, 2): - if baseline_model and baseline_model not in {model_1, model_2}: - continue - matches = [] - for question in questions: - qid = question["question_id"] - answer_1 = model_answers[model_1][qid] - answer_2 = model_answers[model_2][qid] - if question["category"] in NEED_REF_CATS: - judge = judge_math - ref_answer = ref_answers[judge.model][qid] - else: - judge = judge_default + + for question in questions: + qid = question["question_id"] + category = question["category"] + + # Determine if reference answer is needed + if category in NEED_REF_CATS: + judge = judge_math + ref_answer_list = ref_answers[judge.model].get(qid) + if not ref_answer_list: + logger.warning(f"No reference answer for question {qid} in model {judge.model}") ref_answer = None - matches.append( - MatchPair( - question=question, - model_1=model_1, - model_2=model_2, - answer_1=answer_1, - answer_2=answer_2, - judge=judge, - ref_answer=ref_answer, - ) - ) - match_groups[f"pairwise:{model_1}_{model_2}"] = matches - return match_groups + else: + ref_answer = ref_answer_list[0] + else: + judge = judge_default + ref_answer = None + + # Get all models that have answers for this question + available_models = [model for model, answers in model_answers.items() if qid in answers] + + if baseline_model: + if baseline_model not in available_models: + logger.warning(f"Baseline model {baseline_model} does not have an answer for question {qid}. Skipping.") + continue + non_baseline_models = [model for model in available_models if model != baseline_model] + else: + non_baseline_models = available_models + + if num_answers_per_question is not None: + selected_non_baseline_models = non_baseline_models[:num_answers_per_question] + else: + selected_non_baseline_models = non_baseline_models + + if baseline_model: + selected_models = selected_non_baseline_models + [baseline_model] + else: + selected_models = selected_non_baseline_models + + # Generate all unique pairs + for model_1, model_2 in combinations(selected_models, 2): + if baseline_model and (model_1 != baseline_model and model_2 != baseline_model): + # In pairwise-baseline mode, only create pairs with the baseline + continue + + pair_key = f"pairwise:{model_1}_{model_2}" + if pair_key not in match_groups: + match_groups[pair_key] = [] + + answers_1 = model_answers[model_1][qid] + answers_2 = model_answers[model_2][qid] + + if num_answers_per_question is not None: + selected_answers_1 = answers_1[:num_answers_per_question] + selected_answers_2 = answers_2[:num_answers_per_question] + else: + selected_answers_1 = answers_1 + selected_answers_2 = answers_2 + + for ans1 in selected_answers_1: + for ans2 in selected_answers_2: + match = MatchPair( + question=question, + model_1=model_1, + model_2=model_2, + answer_1=ans1, + answer_2=ans2, + judge=judge, + ref_answer=ref_answer, + ) + match_groups[pair_key].append(match) + return match_groups if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -167,6 +215,12 @@ def make_match_groups_pairwise( parser.add_argument( "--verbose", "-v", action="count", default=0, help="Verbosity level" ) + parser.add_argument( + "--num_answers_per_question", + type=int, + default=None, + help="Number of answers to evaluate per question.", + ) args = parser.parse_args() if args.verbose == 0: @@ -206,14 +260,20 @@ def make_match_groups_pairwise( for model in sorted(models): answers = load_model_answers(PREDICTION_DIR / 
model) for question in questions: - assert question["question_id"] in answers + qid = question["question_id"] + if qid not in answers: + logger.error(f"Question ID {qid} missing in model {model} answers.") + raise ValueError(f"Question ID {qid} missing in model {model} answers.") model_answers[model] = answers logger.info("Load reference answers") judge_model = args.judge_model answers = load_model_answers(REFERENCE_DIR / "gpt-4") for question in filter(lambda x: x["category"] in NEED_REF_CATS, questions): - assert question["question_id"] in answers + qid = question["question_id"] + if qid not in answers: + logger.error(f"Reference answer for question ID {qid} missing.") + raise ValueError(f"Reference answer for question ID {qid} missing.") ref_answers = {judge_model: answers} logger.info("Load judge prompts") @@ -227,6 +287,7 @@ def make_match_groups_pairwise( ref_answers=ref_answers, judge_default=Judge(args.judge_model, judge_prompts["single"]), judge_math=Judge(args.judge_model, judge_prompts["single-math"]), + num_answers_per_question=args.num_answers_per_question, ) output_dir = JUDGEMENT_DIR / "single" / args.judge_model else: @@ -242,8 +303,11 @@ def make_match_groups_pairwise( judge_default=Judge(args.judge_model, judge_prompts["pair"]), judge_math=Judge(args.judge_model, judge_prompts["pair-math"]), baseline_model=baseline_model, + num_answers_per_question=args.num_answers_per_question, ) output_dir = JUDGEMENT_DIR / "pairwise" / args.judge_model + + # Filter out existing match_ids if not overwriting target_match_ids = set() for match_id in match_groups: output_file = output_dir / f"{match_id}.jsonl" @@ -278,15 +342,19 @@ def make_match_groups_pairwise( with ThreadPoolExecutor(args.parallel) as executor: futures = [executor.submit(match.play) for match in matches] for future in tqdm(futures): - results.append(future.result()) + try: + result = future.result() + results.append(result) + except Exception as e: + logger.error(f"Error processing match {match_id}: {e}") logger.info(f"Write {len(results)} judgments") output_file.parent.mkdir(parents=True, exist_ok=True) - with open(output_file, "w") as f: + with open(output_file, "w", encoding="utf-8") as f: for result in results: f.write(json.dumps(result, ensure_ascii=False) + "\n") logger.info(f"Saved the judgments to {output_file}") if args.wandb: logger.info("Log to wandb") - upload_results(args.mode, match_id, results, args.baseline_model) + upload_results(args.mode, match_id, results, args.baseline_model) \ No newline at end of file diff --git a/llm_judge/gen_model_answer.py b/llm_judge/gen_model_answer.py index e92de77..3e00e37 100644 --- a/llm_judge/gen_model_answer.py +++ b/llm_judge/gen_model_answer.py @@ -26,7 +26,6 @@ "generic": 0.1, } - def generate_response( input_text, model, tokenizer, generation_config=None, special_token_map=None ): @@ -64,7 +63,6 @@ def generate_response( return output - if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( @@ -79,6 +77,9 @@ def generate_response( parser.add_argument( "--overwrite", action="store_true", help="Overwrite the existing results" ) + parser.add_argument( + "--num_answers_per_question", type=int, default=1, help="Number of answers to generate per question" + ) args = parser.parse_args() if args.verbose == 0: @@ -159,25 +160,26 @@ def generate_response( category = question["category"] generation_config["temperature"] = DEFAULT_TEMPERATURE_MAP[category] - output = generate_response( - input_text=prompt_template.format_map({"instruction": 
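In the rewritten `make_match_groups_pairwise` above, every retained answer of a candidate model is paired with every retained answer of the baseline, so the number of judged pairs grows multiplicatively with `--num_answers_per_question`. A simplified, standalone sketch of that pairing rule for a single question (model names and answer IDs are illustrative):

```python
from itertools import product

def pairwise_baseline_matches(answers_by_model, baseline):
    """Cross each non-baseline model's answers with the baseline's answers for one question."""
    matches = []
    for model, answers in answers_by_model.items():
        if model == baseline:
            continue
        for answer, baseline_answer in product(answers, answers_by_model[baseline]):
            matches.append((model, baseline, answer, baseline_answer))
    return matches

demo = {
    "baseline-model": ["b1", "b2"],
    "candidate-model": ["c1", "c2"],
}
print(len(pairwise_baseline_matches(demo, "baseline-model")))  # -> 4 answer pairs for this question
```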
diff --git a/llm_judge/gen_model_answer.py b/llm_judge/gen_model_answer.py
index e92de77..3e00e37 100644
--- a/llm_judge/gen_model_answer.py
+++ b/llm_judge/gen_model_answer.py
@@ -26,7 +26,6 @@
     "generic": 0.1,
 }
 
-
 def generate_response(
     input_text, model, tokenizer, generation_config=None, special_token_map=None
 ):
@@ -64,7 +63,6 @@ def generate_response(
 
     return output
 
-
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument(
@@ -79,6 +77,9 @@ def generate_response(
     parser.add_argument(
         "--overwrite", action="store_true", help="Overwrite the existing results"
     )
+    parser.add_argument(
+        "--num_answers_per_question", type=int, default=1, help="Number of answers to generate per question"
+    )
     args = parser.parse_args()
 
     if args.verbose == 0:
@@ -159,25 +160,26 @@ def generate_response(
             category = question["category"]
             generation_config["temperature"] = DEFAULT_TEMPERATURE_MAP[category]
 
-        output = generate_response(
-            input_text=prompt_template.format_map({"instruction": instruction}),
-            model=model,
-            tokenizer=tokenizer,
-            generation_config=generation_config,
-            special_token_map=special_token_map,
-        )
-
-        logger.debug(f"{instruction}\n\n{output}")
-
-        results.append(
-            {
-                "question_id": int(question["question_id"]),
-                "answer_id": shortuuid.uuid(),
-                "model_id": model_id,
-                "choices": [{"index": 0, "turns": [output]}],
-                "tstamp": time.time(),
-            }
-        )
+        for _ in range(args.num_answers_per_question):
+            output = generate_response(
+                input_text=prompt_template.format_map({"instruction": instruction}),
+                model=model,
+                tokenizer=tokenizer,
+                generation_config=generation_config,
+                special_token_map=special_token_map,
+            )
+
+            logger.debug(f"{instruction}\n\n{output}")
+
+            results.append(
+                {
+                    "question_id": int(question["question_id"]),
+                    "answer_id": shortuuid.uuid(),
+                    "model_id": model_id,
+                    "choices": [{"index": 0, "turns": [output]}],
+                    "tstamp": time.time(),
+                }
+            )
 
     logger.info("Save the results")
     prediction_dir.mkdir(parents=True, exist_ok=True)
diff --git a/pyproject.toml b/pyproject.toml
index 12f7ebc..2a793de 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -15,10 +15,10 @@ classifiers = [
 dependencies = [
     "accelerate", "fastapi", "gradio==3.35.2", "httpx", "markdown2[all]", "nh3", "numpy",
     "peft==0.5", "prompt_toolkit>=3.0.0", "pydantic<=2.0", "requests", "rich>=10.0.0", "sentencepiece",
-    "shortuuid", "shortuuid", "tiktoken", "tokenizers>=0.12.1", "torch",
-    "transformers", "uvicorn", "wandb", "openai==0.28.1", "ray", "python-dotenv", "protobuf==3.19",
-    "wandb", "tiktoken"
+    "shortuuid", "tiktoken", "tokenizers>=0.12.1", "torch",
+    "transformers", "uvicorn", "wandb", "openai==1.35.3", "ray", "python-dotenv", "protobuf==3.19"
 ]
 
+
 [tool.setuptools.packages.find]
 exclude = ["*"]
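The dependency bump from `openai==0.28.1` to `openai==1.35.3` in `pyproject.toml` is what drives the client changes in `common.py` and `gen_gpt3.5_answer.py`: the v0 module-level calls map onto v1 client methods roughly as sketched below (model name and prompt are placeholders; this illustrates the 1.x interface, it is not code from the repository):

```python
import os

from openai import OpenAI  # openai>=1.0 exposes client classes instead of module-level configuration

client = OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    organization=os.getenv("OPENAI_ORGANIZATION"),  # replaces the old `openai.organization = ...`
)

# v0 style: openai.ChatCompletion.create(...)["choices"][0]["message"]["content"]
response = client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": "ping"}],
)
print(response.choices[0].message.content)  # v1 responses are typed objects, not dicts
```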