diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 9e06523..3402f23 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -9,4 +9,3 @@ ## Related issue number (if applicable) - diff --git a/.gitignore b/.gitignore index 94230b2..10f3929 100644 --- a/.gitignore +++ b/.gitignore @@ -176,20 +176,12 @@ pyrightconfig.json # End of https://www.toptal.com/developers/gitignore/api/python # folder -fastchat/llm_judge/data/vicuna_jp -llm_judge/data +data llm_judge/judge.sh llm_judge/LLMs llm_judge/gen_model_answer copy.py llm_judge/gen_model_answer_ori.py -# files -fastchat/llm_judge/data/judge_prompts_jp.jsonl -fastchat/llm_judge/data/question_Japanese_ver.jsonl -fastchat/llm_judge/data/jp_bench/question_Japanese_ver.jsonl -fastchat/llm_judge/data/jp_bench/question_Japanese_ver2.jsonl -fastchat/llm_judge/run_eval.sh - # Log *.log *.log.* @@ -214,4 +206,3 @@ tests/state_of_the_union.txt # Build build - diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6de8d5e..29ff389 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,4 +11,4 @@ repos: hooks: - id: ruff args: [--fix, --exit-non-zero-on-fix] - - id: ruff-format \ No newline at end of file + - id: ruff-format diff --git a/README.md b/README.md index de188ee..d98a912 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Japanese Vicuna QA Benchmark - We released Japanese Vicuna QA Benchmark for measuring comprehensive capabilities of Japanese LLMs, which consists of 80 diverse questions in 10 categories (generic, coding, roleplay, writing, etc.) + We released Japanese Vicuna QA Benchmark for measuring comprehensive capabilities of Japanese LLMs, which consists of 80 diverse questions in 10 categories (generic, coding, roleplay, writing, etc.) You can leverage this package to evaluate the answers of your Japanese LLM models in a reference-free manner with LLM-as-a-judge. To automate the evaluation process, we prompt strong LLMs like GPT-4 to act as judges and assess the quality of the models' responses. @@ -138,13 +138,13 @@ python show_result.py \ ## Sample Outputs -#### Question: +#### Question: > 植物性タンパク源と動物性タンパク源の違いは何ですか? -#### Answer of rinna-3.6b: +#### Answer of rinna-3.6b: > 栄養素の含有量に大きな違いがあります。 -#### Answer of rinna-3.6b-sft-v2: +#### Answer of rinna-3.6b-sft-v2: > 植物性タンパク源は肉や魚など、いくつかの種類の食品を指します。一方、動物性タンパク源には卵、乳製品、大豆、ナッツ、種などが含まれます。 -#### Answer of rinna-3.6b-ppo: +#### Answer of rinna-3.6b-ppo: > 植物性タンパク源とは、主に大豆や豆腐などの大豆製品を生産する植物によって作られるタンパク質で、卵、牛乳、肉などの動物性タンパク源よりも栄養価が高く、低カロリーです。一方、動物性タンパク源には、牛肉、豚肉、鶏肉などが含まれます。 #### Answer of japanese-alpaca-lora-7b: > 動物タンポク源(ATP)は、動物によって生産されるエネルギー源である。ATPは、細胞におけるATP認識システムの活性化により生成されています。動物のATP源は、遺伝的に組み込まれたATP生成機構であり、これは、ATPを生成するために使用され、経験的にATPの量を増加させることができる。 @@ -160,12 +160,12 @@ python show_result.py \ | rinna-3.6b-sft-v2 | 70 | 35 | 58 | 0.429448 | 0.214724 | 0.607362 | | japanese-alpaca-lora-7b | 15 | 112 | 38 | 0.090909 | 0.678788 | 0.206061 | -The GPT4 judgments is placed in `data/jp_bench/model_judgment/gpt-4_pair.jsonl`. +The GPT4 judgments is placed in `data/jp_bench/model_judgment/gpt-4_pair.jsonl`. To be noticed, `pairwise-all` might become very inefficient when evaluating more LLMs, as it evaluates combinations of each two of them. In such cases, we recommend using the `pairwise-baseline` mode, allowing all models to be compared against a fixed baseline such as ChatGPT. 
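As a worked example of the table above: the three rate columns follow directly from the win/loss/tie counts, with a tie credited as half a win in the last column. The sketch below reproduces those numbers; it assumes the per-game outcomes for one model have already been extracted from the judgment file, and `summarize_pairwise` with the `"win"`/`"loss"`/`"tie"` labels is illustrative, not the actual `show_result.py` interface.

```python
from collections import Counter


def summarize_pairwise(outcomes):
    """Turn a list of per-game outcomes ('win', 'loss', 'tie') for one model
    into the columns shown in the win-rate table."""
    counts = Counter(outcomes)
    win, loss, tie = counts["win"], counts["loss"], counts["tie"]
    total = win + loss + tie
    return {
        "win": win,
        "loss": loss,
        "tie": tie,
        "win_rate": win / total,
        "loss_rate": loss / total,
        # A tie is worth half a win, which reproduces the last column:
        # (70 + 0.5 * 58) / 163 ≈ 0.607 for rinna-3.6b-sft-v2.
        "win_rate_adjusted": (win + 0.5 * tie) / total,
    }


# Example with the rinna-3.6b-sft-v2 row from the table above.
row = summarize_pairwise(["win"] * 70 + ["loss"] * 35 + ["tie"] * 58)
print(row["win_rate"], row["loss_rate"], row["win_rate_adjusted"])
# -> 0.4294..., 0.2147..., 0.6073...
```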
## Supported baseline Models -To make it more convenient for users to utilize pairwise comparisons with existing Japanese LLMs, we offer the prediction of the following four baselines in `fastchat/llm_judge/data/jp_bench/model_answer`. +To make it more convenient for users to utilize pairwise comparisons with existing Japanese LLMs, we offer the prediction of the following four baselines in `fastchat/llm_judge/data/jp_bench/model_answer`. - [Rinna-3.6B](https://huggingface.co/rinna/japanese-gpt-neox-3.6b) - [Rinna-3.6B-sft-v2](https://huggingface.co/rinna/japanese-gpt-neox-3.6b-instruction-sft-v2) diff --git a/llm_judge/clean_judgment.py b/llm_judge/clean_judgment.py deleted file mode 100644 index d139ed7..0000000 --- a/llm_judge/clean_judgment.py +++ /dev/null @@ -1,93 +0,0 @@ -""" -Clean model judgment files. -""" -import argparse -import json - -selected_models = [ - "alpaca-13b", - "baize-v2-13b", - "chatglm-6b", - "claude-instant-v1", - "claude-v1", - "dolly-v2-12b", - "falcon-40b-instruct", - "fastchat-t5-3b", - "gpt-3.5-turbo", - "gpt-4", - "gpt4all-13b-snoozy", - "guanaco-33b", - "guanaco-65b", - "h2ogpt-oasst-open-llama-13b", - "koala-13b", - "llama-13b", - "mpt-30b-chat", - "mpt-30b-instruct", - "mpt-7b-chat", - "nous-hermes-13b", - "oasst-sft-4-pythia-12b", - "oasst-sft-7-llama-30b", - "palm-2-chat-bison-001", - "rwkv-4-raven-14b", - "stablelm-tuned-alpha-7b", - "tulu-30b", - "vicuna-13b-v1.3", - "vicuna-33b-v1.3", - "vicuna-7b-v1.3", - "wizardlm-13b", - "wizardlm-30b", -] - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--infile", type=str) - args = parser.parse_args() - - infile = args.infile - outfile = infile.replace(".jsonl", "_clean.jsonl") - - raw_lines = open(infile).readlines() - rets = [] - models = set() - visited = set() - for line in raw_lines: - obj = json.loads(line) - - if "model_1" in obj: # pair - model = obj["model_1"] - key = ( - obj["model_1"], - obj["model_2"], - obj["question_id"], - tuple(obj["judge"]), - ) - else: # single - model = obj["model"] - key = (obj["model"], obj["question_id"], tuple(obj["judge"])) - - if key in visited: - continue - visited.add(key) - - if model not in selected_models: - continue - models.add(model) - rets.append(obj) - - models = sorted(list(models)) - missing_models = [x for x in selected_models if x not in models] - print(f"in models: {models}, number: {len(models)}") - print(f"missing models: {missing_models}") - print(f"#in: {len(raw_lines)}, #out: {len(rets)}") - rets.sort( - key=lambda x: ( - x["model"] if "model" in x else x["model_1"], - x["question_id"], - x["turn"], - ) - ) - - with open(outfile, "w") as fout: - for x in rets: - fout.write(json.dumps(x) + "\n") diff --git a/llm_judge/common.py b/llm_judge/common.py index 0fe15db..4d20f52 100644 --- a/llm_judge/common.py +++ b/llm_judge/common.py @@ -11,9 +11,13 @@ import time from typing import Optional import openai -import anthropic +from dotenv import load_dotenv -from model.model_adapter import get_conversation_template +from model_adapter import get_conversation_template + +load_dotenv() # Load environment variables from .env file + +openai.api_key = os.getenv("OPENAI_API_KEY") # API setting constants API_MAX_RETRY = 16 @@ -174,10 +178,6 @@ def run_judge_single(question, answer, judge, ref_answer, multi_turn=False): if model in ["gpt-3.5-turbo", "gpt-4"]: judgment = chat_compeletion_openai(model, conv, temperature=0, max_tokens=2048) - elif model in ["claude-v1", "claude-instant-v1"]: - judgment = 
chat_compeletion_anthropic( - model, conv, temperature=0, max_tokens=1024 - ) else: raise ValueError(f"Invalid judge model name: {model}") @@ -282,13 +282,8 @@ def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=F if model in ["gpt-3.5-turbo", "gpt-4"]: conv.system = system_prompt judgment = chat_compeletion_openai(model, conv, temperature=0, max_tokens=2048) - elif model in ["claude-v1", "claude-instant-v1"]: - if system_prompt != "You are a helpful assistant.": - user_prompt = "[Instruction]\n" + system_prompt + "\n\n" + user_prompt - conv.messages[0][1] = user_prompt - judgment = chat_compeletion_anthropic( - model, conv, temperature=0, max_tokens=1024 - ) + print(judgment) + assert False else: raise ValueError(f"Invalid judge model name: {model}") @@ -422,6 +417,7 @@ def chat_compeletion_openai(model, conv, temperature, max_tokens): for _ in range(API_MAX_RETRY): try: messages = conv.to_openai_api_messages() + print(messages) response = openai.ChatCompletion.create( model=model, messages=messages, @@ -438,53 +434,6 @@ def chat_compeletion_openai(model, conv, temperature, max_tokens): return output -def chat_compeletion_anthropic(model, conv, temperature, max_tokens): - output = API_ERROR_OUTPUT - for _ in range(API_MAX_RETRY): - try: - c = anthropic.Client(os.environ["ANTHROPIC_API_KEY"]) - prompt = conv.get_prompt() - response = c.completion( - model=model, - prompt=prompt, - stop_sequences=[anthropic.HUMAN_PROMPT], - max_tokens_to_sample=max_tokens, - temperature=temperature, - ) - output = response["completion"] - break - except anthropic.ApiException as e: - print(type(e), e) - time.sleep(API_RETRY_SLEEP) - return output.strip() - - -def chat_compeletion_palm(chat_state, model, conv, temperature, max_tokens): - from fastchat.serve.api_provider import init_palm_chat - - assert model == "palm-2-chat-bison-001" - - if chat_state is None: - chat_state = init_palm_chat("chat-bison@001") - - parameters = { - "temperature": temperature, - "top_p": 0.8, - "top_k": 40, - "max_output_tokens": max_tokens, - } - output = API_ERROR_OUTPUT - for _ in range(API_MAX_RETRY): - try: - response = chat_state.send_message(conv.messages[-2][1], **parameters) - output = response.text - break - except Exception as e: - print(type(e), e) - time.sleep(API_RETRY_SLEEP) - return chat_state, output - - def normalize_game_key_single(gamekey, result): """Make the model names sorted in a game key.""" qid, model_1, model_2 = gamekey diff --git a/llm_judge/compute_agreement.py b/llm_judge/compute_agreement.py deleted file mode 100644 index e579c46..0000000 --- a/llm_judge/compute_agreement.py +++ /dev/null @@ -1,137 +0,0 @@ -""" -Compute agreement among judges. 
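The `llm_judge/common.py` hunk above removes the Anthropic judge path and instead loads `OPENAI_API_KEY` from a `.env` file, keeping the existing retry loop around the OpenAI call. A minimal sketch of that pattern, assuming the legacy (pre-1.0) `openai` SDK that the file imports; `judge_one` and the retry-sleep value are illustrative, not the repository's exact code:

```python
import os
import time

import openai
from dotenv import load_dotenv

load_dotenv()  # read OPENAI_API_KEY from a .env file
openai.api_key = os.getenv("OPENAI_API_KEY")

API_MAX_RETRY = 16        # as in common.py
API_RETRY_SLEEP = 10      # illustrative value
API_ERROR_OUTPUT = "$ERROR$"


def judge_one(system_prompt, user_prompt, model="gpt-4"):
    """Call the judge model once, retrying on transient API errors."""
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    output = API_ERROR_OUTPUT
    for _ in range(API_MAX_RETRY):
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=messages,
                temperature=0,      # deterministic judgments
                max_tokens=2048,
            )
            output = response["choices"][0]["message"]["content"]
            break
        except openai.error.OpenAIError as e:  # rate limits, timeouts, ...
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)
    return output
```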
- -Usage: -python compute_agreement.py --judges gpt4-pair human --votefiles human_judgments.json gpt4_pair_judgments.json -python compute_agreement.py --judges human human --votefiles human_judgments.json -""" -import argparse -import json - - -def get_judge_name(judge): - if isinstance(judge, list) and judge[0] == "gpt-4" and judge[1].startswith("pair"): - return "gpt4-pair" - if judge.startswith("expert"): - return "human" - if judge.startswith("author"): - return "author" - - -def revert(vote): - if vote == "model_a": - return "model_b" - elif vote == "model_b": - return "model_a" - return vote - - -def get_mt_bench_votes_data(raw_votes): - data = [{}, {}] - - for judge_votes in raw_votes: - for vote in judge_votes: - turn = vote["turn"] - 1 - if vote["model_a"] < vote["model_b"]: - key = (vote["question_id"], vote["model_a"], vote["model_b"]) - winner = vote["winner"] - else: - key = (vote["question_id"], vote["model_b"], vote["model_a"]) - winner = revert(vote["winner"]) - judge = get_judge_name(vote["judge"]) - if key not in data[turn]: - data[turn][key] = {} - if judge not in data[turn][key]: - data[turn][key][judge] = [] - data[turn][key][judge].append(winner) - - return data - - -def convertvote(vote): - if "tie" in vote: - return "tie" - return vote - - -def equalvote(vote1, vote2): - if "tie" in vote1 and "tie" in vote2: - return True - return vote1 == vote2 - - -# data: Dict[qid -> List[vote]] -def get_mt_bench_agreement(data, judge1, judge2, ban): - if judge1.startswith("gpt4") and judge2 == "human": - stats = [0, 0] - for votes in data.values(): - if judge1 not in votes or judge2 not in votes: - continue - assert len(votes[judge1]) == 1 - if convertvote(votes[judge1][0]) in ban: - continue - for v in votes[judge2]: - if convertvote(v) in ban: - continue - stats[1] += 1 - stats[0] += equalvote(votes[judge1][0], v) - return stats[0], stats[1] - elif judge1 == "human" and judge2 == "human": - stats = [0, 0] - for votes in data.values(): - if "human" not in votes: - continue - for i in range(len(votes["human"]) - 1): - for j in range(i + 1, len(votes["human"])): - if ( - convertvote(votes["human"][i]) in ban - or convertvote(votes["human"][j]) in ban - ): - continue - stats[1] += 1 - stats[0] += equalvote(votes["human"][i], votes["human"][j]) - return stats[0], stats[1] - else: - raise Exception("Unsupported judges.") - - -def run_mt_bench_agreement(judges, votefiles): - # votes[i]: List of votes - votes = [] - for filename in votefiles: - with open(filename, "r") as f: - data = json.load(f) - votes.append(data) - - data = get_mt_bench_votes_data(votes) - - agree, total = get_mt_bench_agreement(data[0], judges[0], judges[1], ban=[]) - print( - f"turn 1 with tie. #total: {total}, #agree: {agree}, ratio: {agree/total:.2f}" - ) - agree, total = get_mt_bench_agreement(data[0], judges[0], judges[1], ban=["tie"]) - print( - f"turn 1 without tie. #total: {total}, #agree: {agree}, ratio: {agree/total:.2f}" - ) - agree, total = get_mt_bench_agreement(data[1], judges[0], judges[1], ban=[]) - print( - f"turn 2 with tie. #total: {total}, #agree: {agree}, ratio: {agree/total:.2f}" - ) - agree, total = get_mt_bench_agreement(data[1], judges[0], judges[1], ban=["tie"]) - print( - f"turn 2 without tie. 
#total: {total}, #agree: {agree}, ratio: {agree/total:.2f}" - ) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--judges", nargs=2, type=str, default=["gpt4-pair", "human"]) - parser.add_argument( - "--votefiles", - nargs="+", - type=str, - default=["gpt4_judgments.json", "human_judgments.json"], - ) - args = parser.parse_args() - - run_mt_bench_agreement(args.judges, args.votefiles) diff --git a/llm_judge/constants.py b/llm_judge/constants.py deleted file mode 100644 index 52109c9..0000000 --- a/llm_judge/constants.py +++ /dev/null @@ -1,58 +0,0 @@ -from enum import IntEnum -import os - -REPO_PATH = os.path.dirname(os.path.dirname(__file__)) - -##### For the gradio web server -SERVER_ERROR_MSG = ( - "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**" -) -MODERATION_MSG = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE FIX YOUR INPUT AND TRY AGAIN." -CONVERSATION_LIMIT_MSG = "YOU HAVE REACHED THE CONVERSATION LENGTH LIMIT. PLEASE CLEAR HISTORY AND START A NEW CONVERSATION." -INACTIVE_MSG = "THIS SESSION HAS BEEN INACTIVE FOR TOO LONG. PLEASE REFRESH THIS PAGE." -# Maximum input length -INPUT_CHAR_LEN_LIMIT = int(os.getenv("FASTCHAT_INPUT_CHAR_LEN_LIMIT", 2560)) -# Maximum conversation turns -CONVERSATION_TURN_LIMIT = 50 -# Session expiration time -SESSION_EXPIRATION_TIME = 3600 -# The output dir of log files -LOGDIR = "." - - -##### For the controller and workers (could be overwritten through ENV variables.) -CONTROLLER_HEART_BEAT_EXPIRATION = int( - os.getenv("FASTCHAT_CONTROLLER_HEART_BEAT_EXPIRATION", 90) -) -WORKER_HEART_BEAT_INTERVAL = int(os.getenv("FASTCHAT_WORKER_HEART_BEAT_INTERVAL", 45)) -WORKER_API_TIMEOUT = int(os.getenv("FASTCHAT_WORKER_API_TIMEOUT", 100)) -WORKER_API_EMBEDDING_BATCH_SIZE = int( - os.getenv("FASTCHAT_WORKER_API_EMBEDDING_BATCH_SIZE", 4) -) - - -class ErrorCode(IntEnum): - """ - https://platform.openai.com/docs/guides/error-codes/api-errors - """ - - VALIDATION_TYPE_ERROR = 40001 - - INVALID_AUTH_KEY = 40101 - INCORRECT_AUTH_KEY = 40102 - NO_PERMISSION = 40103 - - INVALID_MODEL = 40301 - PARAM_OUT_OF_RANGE = 40302 - CONTEXT_OVERFLOW = 40303 - - RATE_LIMIT = 42901 - QUOTA_EXCEEDED = 42902 - ENGINE_OVERLOADED = 42903 - - INTERNAL_ERROR = 50001 - CUDA_OUT_OF_MEMORY = 50002 - GRADIO_REQUEST_ERROR = 50003 - GRADIO_STREAM_UNKNOWN_ERROR = 50004 - CONTROLLER_NO_WORKER = 50005 - CONTROLLER_WORKER_TIMEOUT = 50006 diff --git a/llm_judge/gen_gpt3.5_answer.py b/llm_judge/gen_gpt3.5_answer.py index 9696876..b2e3def 100644 --- a/llm_judge/gen_gpt3.5_answer.py +++ b/llm_judge/gen_gpt3.5_answer.py @@ -4,10 +4,68 @@ from tqdm import tqdm import time import shortuuid + +from typing import List + + +class GPT3_Demo(object): + def __init__( + self, + engine, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + best_of, + logprobs, + ): + self.engine = engine + self.temperature = temperature + self.max_tokens = max_tokens + self.top_p = top_p + self.frequency_penalty = frequency_penalty + self.presence_penalty = presence_penalty + self.best_of = best_of + self.logprobs = logprobs + + def get_multiple_sample(self, prompt_list: List[str]): + response = openai.Completion.create( + engine=self.engine, + prompt=prompt_list, + temperature=self.temperature, + max_tokens=self.max_tokens, + top_p=self.top_p, + frequency_penalty=self.frequency_penalty, + presence_penalty=self.presence_penalty, + best_of=self.best_of, + logprobs=self.logprobs, + ) + results = [choice.text for 
choice in response.choices] + return results + + +def run_gpt3(prompt_list): + demo = GPT3_Demo( + engine="text-davinci-003", # text-davinci-003: best, text-ada-001: lowest price + temperature=0, # control randomness: lowring results in less random completion (0 ~ 1.0) + max_tokens=8, # max number of tokens to generate (1 ~ 4,000) + top_p=1, # control diversity (0 ~ 1.0) + frequency_penalty=0, # how to penalize new tokens based on their existing frequency (0 ~ 2.0) + presence_penalty=0, # 这个是对于词是否已经出现过的惩罚,文档上说这个值调高可以增大谈论新topic的概率 (0 ~ 2.0) + best_of=3, # 这个是说从多少个里选最好的,如果这里是10,就会生成10个然后选最好的,但是这样会更贵(1 ~ 20) + logprobs=1, + ) + results = demo.get_multiple_sample(prompt_list) + return results + + # use following codes to activate azure: # use openai api with group PRISM + + class Chat_Demo(object): def __init__( self, @@ -93,17 +151,17 @@ def run_chatgpt(user_prompt_list): examples = instruction_list results = [] for index, example in tqdm(enumerate(examples)): - response = run_chatgpt(example) + response = run_gpt3(example) results.append( { "question_id": question[index]["question_id"], "answer_id": shortuuid.uuid(), - "model_id": "gpt-4", + "model_id": "gpt-3.5-davinci", "choices": [{"index": 0, "turns": [response]}], "tstamp": time.time(), } ) - predictions_file = "./data/jp_bench/model_answer/GPT-4_reference.jsonl" + predictions_file = "./data/jp_bench/model_answer/gpt-3.5-davinci.jsonl" dirname = os.path.dirname(predictions_file) os.makedirs(dirname, exist_ok=True) with open(predictions_file, "w") as f: diff --git a/llm_judge/gen_judgment.py b/llm_judge/gen_judgment.py index 0a59112..855f1ac 100644 --- a/llm_judge/gen_judgment.py +++ b/llm_judge/gen_judgment.py @@ -5,8 +5,10 @@ import argparse from concurrent.futures import ThreadPoolExecutor import json +import os import numpy as np from tqdm import tqdm +from dotenv import load_dotenv from common import ( load_questions, @@ -23,6 +25,11 @@ ) +load_dotenv() # Load environment variables from .env file + +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") + + def make_match( questions, models, diff --git a/llm_judge/gen_model_answer.py b/llm_judge/gen_model_answer.py index 9912a58..bfdfc4b 100644 --- a/llm_judge/gen_model_answer.py +++ b/llm_judge/gen_model_answer.py @@ -4,6 +4,7 @@ import shortuuid import time from tqdm import tqdm + import torch from transformers import ( GenerationConfig, diff --git a/llm_judge/model/__init__.py b/llm_judge/model/__init__.py deleted file mode 100644 index d61b79d..0000000 --- a/llm_judge/model/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from model.model_adapter import ( - load_model, - get_conversation_template, - add_model_args, -) diff --git a/llm_judge/model/apply_delta.py b/llm_judge/model/apply_delta.py deleted file mode 100644 index ba1c06d..0000000 --- a/llm_judge/model/apply_delta.py +++ /dev/null @@ -1,165 +0,0 @@ -""" -Apply the delta weights on top of a base model. 
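For readers who don't read Chinese, the two Chinese comments in the `gen_gpt3.5_answer.py` hunk above say, roughly: `presence_penalty` penalizes tokens that have already appeared, and raising it makes the model more likely to move on to new topics; `best_of` controls how many completions are sampled server-side before the best one is returned (e.g. 10 means generate ten and keep the best, at proportionally higher cost). Below is a restatement of that call with English comments, under the same legacy Completions API the file uses; parameter values are copied verbatim from the hunk.

```python
import openai


def run_gpt3(prompt_list):
    """Batch completion call, mirroring the settings in gen_gpt3.5_answer.py."""
    response = openai.Completion.create(
        engine="text-davinci-003",  # strongest legacy completion engine; text-ada-001 is the cheapest
        prompt=prompt_list,
        temperature=0,              # 0 ~ 1.0; lower means less random completions
        max_tokens=8,               # 1 ~ 4,000 tokens to generate
        top_p=1,                    # 0 ~ 1.0; nucleus-sampling diversity
        frequency_penalty=0,        # 0 ~ 2.0; penalize tokens by how often they already occurred
        presence_penalty=0,         # 0 ~ 2.0; penalize tokens that appeared at all (encourages new topics)
        best_of=3,                  # sample 3 completions server-side, return the best (costs more)
        logprobs=1,
    )
    return [choice.text for choice in response.choices]
```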
- -Usage: -python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta-v1.1 -""" -import argparse -import gc -import glob -import json -import os -import shutil -import tempfile - -from huggingface_hub import snapshot_download -import torch -from torch import nn -from tqdm import tqdm -from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig - - -GB = 1 << 30 - - -def split_files(model_path, tmp_path, split_size): - if not os.path.exists(model_path): - model_path = snapshot_download(repo_id=model_path) - if not os.path.exists(tmp_path): - os.makedirs(tmp_path) - - file_pattern = os.path.join(model_path, "pytorch_model-*.bin") - files = glob.glob(file_pattern) - - part = 0 - try: - for file_path in tqdm(files): - state_dict = torch.load(file_path) - new_state_dict = {} - - current_size = 0 - for name, param in state_dict.items(): - param_size = param.numel() * param.element_size() - - if current_size + param_size > split_size: - new_file_name = f"pytorch_model-{part}.bin" - new_file_path = os.path.join(tmp_path, new_file_name) - torch.save(new_state_dict, new_file_path) - current_size = 0 - new_state_dict = None - gc.collect() - new_state_dict = {} - part += 1 - - new_state_dict[name] = param - current_size += param_size - - new_file_name = f"pytorch_model-{part}.bin" - new_file_path = os.path.join(tmp_path, new_file_name) - torch.save(new_state_dict, new_file_path) - new_state_dict = None - gc.collect() - new_state_dict = {} - part += 1 - except Exception as e: - print(f"An error occurred during split_files: {e}") - shutil.rmtree(tmp_path) - raise - - -def apply_delta_low_cpu_mem(base_model_path, target_model_path, delta_path): - delta_tokenizer = AutoTokenizer.from_pretrained(delta_path, use_fast=False) - delta_config = AutoConfig.from_pretrained(delta_path) - - if os.path.exists(target_model_path): - shutil.rmtree(target_model_path) - os.makedirs(target_model_path) - - split_size = 4 * GB - - with tempfile.TemporaryDirectory() as tmp_base_path, tempfile.TemporaryDirectory() as tmp_delta_path: - print(f"Split files for the base model to {tmp_base_path}") - split_files(base_model_path, tmp_base_path, split_size) - print(f"Split files for the delta weights to {tmp_delta_path}") - split_files(delta_path, tmp_delta_path, split_size) - - base_pattern = os.path.join(tmp_base_path, "pytorch_model-*.bin") - base_files = glob.glob(base_pattern) - delta_pattern = os.path.join(tmp_delta_path, "pytorch_model-*.bin") - delta_files = glob.glob(delta_pattern) - delta_state_dict = torch.load(delta_files[0]) - - print("Applying the delta") - weight_map = {} - total_size = 0 - - for i, base_file in tqdm(enumerate(base_files)): - state_dict = torch.load(base_file) - file_name = f"pytorch_model-{i}.bin" - for name, param in state_dict.items(): - if name not in delta_state_dict: - for delta_file in delta_files: - delta_state_dict = torch.load(delta_file) - gc.collect() - if name in delta_state_dict: - break - - state_dict[name] += delta_state_dict[name] - weight_map[name] = file_name - total_size += param.numel() * param.element_size() - gc.collect() - torch.save(state_dict, os.path.join(target_model_path, file_name)) - - with open( - os.path.join(target_model_path, "pytorch_model.bin.index.json"), "w" - ) as f: - json.dump( - {"weight_map": weight_map, "metadata": {"total_size": total_size}}, f - ) - - print(f"Saving the target model to {target_model_path}") - delta_tokenizer.save_pretrained(target_model_path) - 
delta_config.save_pretrained(target_model_path) - - -def apply_delta(base_model_path, target_model_path, delta_path): - print(f"Loading the delta weights from {delta_path}") - delta_tokenizer = AutoTokenizer.from_pretrained(delta_path, use_fast=False) - delta = AutoModelForCausalLM.from_pretrained( - delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True - ) - - print(f"Loading the base model from {base_model_path}") - base = AutoModelForCausalLM.from_pretrained( - base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True - ) - - print("Applying the delta") - for name, param in tqdm(base.state_dict().items(), desc="Applying delta"): - assert name in delta.state_dict() - param.data += delta.state_dict()[name] - - print(f"Saving the target model to {target_model_path}") - base.save_pretrained(target_model_path) - delta_tokenizer.save_pretrained(target_model_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--base-model-path", type=str, required=True) - parser.add_argument("--target-model-path", type=str, required=True) - parser.add_argument("--delta-path", type=str, required=True) - parser.add_argument( - "--low-cpu-mem", - action="store_true", - help="Lower the cpu memory usage. This will split large files and use " - "disk as swap to reduce the memory usage below 10GB.", - ) - args = parser.parse_args() - - if args.low_cpu_mem: - apply_delta_low_cpu_mem( - args.base_model_path, args.target_model_path, args.delta_path - ) - else: - apply_delta(args.base_model_path, args.target_model_path, args.delta_path) diff --git a/llm_judge/model/apply_lora.py b/llm_judge/model/apply_lora.py deleted file mode 100644 index d74bc70..0000000 --- a/llm_judge/model/apply_lora.py +++ /dev/null @@ -1,48 +0,0 @@ -""" -Apply the LoRA weights on top of a base model. 
- -Usage: -python3 -m fastchat.model.apply_lora --base ~/model_weights/llama-7b --target ~/model_weights/baize-7b --lora project-baize/baize-lora-7B - -Dependency: -pip3 install git+https://github.com/huggingface/peft.git@2822398fbe896f25d4dac5e468624dc5fd65a51b -""" -import argparse - -import torch -from peft import PeftModel -from transformers import LlamaTokenizer, AutoTokenizer, AutoModelForCausalLM - - -def apply_lora(base_model_path, target_model_path, lora_path): - print(f"Loading the base model from {base_model_path}") - base = AutoModelForCausalLM.from_pretrained( - base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True - ) - base_tokenizer = LlamaTokenizer.from_pretrained(base_model_path, use_fast=False) - - print(f"Loading the LoRA adapter from {lora_path}") - - lora_model = PeftModel.from_pretrained( - base, - lora_path, - # torch_dtype=torch.float16 - ) - - print("Applying the LoRA") - model = lora_model.merge_and_unload() - - print(f"Saving the target model to {target_model_path}") - model.save_pretrained(target_model_path) - base_tokenizer.save_pretrained(target_model_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--base-model-path", type=str, required=True) - parser.add_argument("--target-model-path", type=str, required=True) - parser.add_argument("--lora-path", type=str, required=True) - - args = parser.parse_args() - - apply_lora(args.base_model_path, args.target_model_path, args.lora_path) diff --git a/llm_judge/model/compression.py b/llm_judge/model/compression.py deleted file mode 100644 index 7c1dcb3..0000000 --- a/llm_judge/model/compression.py +++ /dev/null @@ -1,244 +0,0 @@ -import dataclasses -import gc -import glob -import os - -from accelerate import init_empty_weights -from accelerate.utils import set_module_tensor_to_device -from huggingface_hub import snapshot_download -import torch -from torch import Tensor -from torch.nn import functional as F -import torch.nn as nn -from tqdm import tqdm -from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer - - -@dataclasses.dataclass -class CompressionConfig: - """Group-wise quantization.""" - - num_bits: int - group_size: int - group_dim: int - symmetric: bool - enabled: bool = True - - -default_compression_config = CompressionConfig( - num_bits=8, group_size=256, group_dim=1, symmetric=True, enabled=True -) - - -class CLinear(nn.Module): - """Compressed Linear Layer.""" - - def __init__(self, weight=None, bias=None, device=None): - super().__init__() - if weight is None: - self.weight = None - elif isinstance(weight, Tensor): - self.weight = compress(weight.data.to(device), default_compression_config) - else: - self.weight = weight - self.bias = bias - - def forward(self, input: Tensor) -> Tensor: - weight = decompress(self.weight, default_compression_config) - if self.bias is None: - return F.linear(input.to(weight.dtype), weight) - return F.linear(input.to(weight.dtype), weight, self.bias.to(weight.dtype)) - - -def compress_module(module, target_device): - for attr_str in dir(module): - target_attr = getattr(module, attr_str) - if type(target_attr) == torch.nn.Linear: - setattr( - module, - attr_str, - CLinear(target_attr.weight, target_attr.bias, target_device), - ) - for name, child in module.named_children(): - compress_module(child, target_device) - - -def get_compressed_list(module, prefix=""): - compressed_list = [] - for attr_str in dir(module): - target_attr = getattr(module, attr_str) - if type(target_attr) == torch.nn.Linear: 
- full_name = ( - f"{prefix}.{attr_str}.weight" if prefix else f"{attr_str}.weight" - ) - compressed_list.append(full_name) - for name, child in module.named_children(): - child_prefix = f"{prefix}.{name}" if prefix else name - for each in get_compressed_list(child, child_prefix): - compressed_list.append(each) - return compressed_list - - -def apply_compressed_weight(module, compressed_state_dict, target_device, prefix=""): - for attr_str in dir(module): - target_attr = getattr(module, attr_str) - if type(target_attr) == torch.nn.Linear: - full_name = ( - f"{prefix}.{attr_str}.weight" if prefix else f"{attr_str}.weight" - ) - setattr( - module, - attr_str, - CLinear( - compressed_state_dict[full_name], target_attr.bias, target_device - ), - ) - for name, child in module.named_children(): - child_prefix = f"{prefix}.{name}" if prefix else name - apply_compressed_weight( - child, compressed_state_dict, target_device, child_prefix - ) - - -def load_compress_model(model_path, device, torch_dtype, use_fast, revision="main"): - # partially load model - tokenizer = AutoTokenizer.from_pretrained( - model_path, use_fast=use_fast, revision=revision - ) - - with init_empty_weights(): - config = AutoConfig.from_pretrained( - model_path, - low_cpu_mem_usage=True, - torch_dtype=torch_dtype, - revision=revision, - ) - model = AutoModelForCausalLM.from_config(config) - linear_weights = get_compressed_list(model) - - if os.path.exists(model_path): - # `model_path` is a local folder - base_pattern = os.path.join(model_path, "pytorch_model*.bin") - else: - # `model_path` is a cached Hugging Face repo - model_path = snapshot_download(model_path, revision=revision) - base_pattern = os.path.join(model_path, "pytorch_model*.bin") - - files = glob.glob(base_pattern) - - compressed_state_dict = {} - - for filename in tqdm(files): - tmp_state_dict = torch.load(filename) - for name in tmp_state_dict: - if name in linear_weights: - tensor = tmp_state_dict[name].to(device).data.to(torch_dtype) - compressed_state_dict[name] = compress( - tensor, default_compression_config - ) - else: - compressed_state_dict[name] = tmp_state_dict[name].to(device) - tmp_state_dict[name] = None - tensor = None - gc.collect() - torch.cuda.empty_cache() - - for name in model.state_dict(): - if name not in linear_weights: - set_module_tensor_to_device( - model, name, device, value=compressed_state_dict[name] - ) - apply_compressed_weight(model, compressed_state_dict, device) - - model.to(device) - - return model, tokenizer - - -def compress(tensor, config): - """Simulate group-wise quantization.""" - if not config.enabled: - return tensor - - group_size, num_bits, group_dim, symmetric = ( - config.group_size, - config.num_bits, - config.group_dim, - config.symmetric, - ) - assert num_bits <= 8 - - original_shape = tensor.shape - num_groups = (original_shape[group_dim] + group_size - 1) // group_size - new_shape = ( - original_shape[:group_dim] - + (num_groups, group_size) - + original_shape[group_dim + 1 :] - ) - - # Pad - pad_len = (group_size - original_shape[group_dim] % group_size) % group_size - if pad_len != 0: - pad_shape = ( - original_shape[:group_dim] + (pad_len,) + original_shape[group_dim + 1 :] - ) - tensor = torch.cat( - [tensor, torch.zeros(pad_shape, dtype=tensor.dtype, device=tensor.device)], - dim=group_dim, - ) - data = tensor.view(new_shape) - - # Quantize - if symmetric: - B = 2 ** (num_bits - 1) - 1 - scale = B / torch.max(data.abs(), dim=group_dim + 1, keepdim=True)[0] - data = data * scale - data = data.clamp_(-B, 
B).round_().to(torch.int8) - return data, scale, original_shape - else: - B = 2**num_bits - 1 - mn = torch.min(data, dim=group_dim + 1, keepdim=True)[0] - mx = torch.max(data, dim=group_dim + 1, keepdim=True)[0] - - scale = B / (mx - mn) - data = data - mn - data.mul_(scale) - - data = data.clamp_(0, B).round_().to(torch.uint8) - return data, mn, scale, original_shape - - -def decompress(packed_data, config): - """Simulate group-wise dequantization.""" - if not config.enabled: - return packed_data - - group_size, num_bits, group_dim, symmetric = ( - config.group_size, - config.num_bits, - config.group_dim, - config.symmetric, - ) - - # Dequantize - if symmetric: - data, scale, original_shape = packed_data - data = data / scale - else: - data, mn, scale, original_shape = packed_data - data = data / scale - data.add_(mn) - - # Unpad - pad_len = (group_size - original_shape[group_dim] % group_size) % group_size - if pad_len: - padded_original_shape = ( - original_shape[:group_dim] - + (original_shape[group_dim] + pad_len,) - + original_shape[group_dim + 1 :] - ) - data = data.reshape(padded_original_shape) - indices = [slice(0, x) for x in original_shape] - return data[indices].contiguous() - else: - return data.view(original_shape) diff --git a/llm_judge/model/convert_fp16.py b/llm_judge/model/convert_fp16.py deleted file mode 100644 index efc40aa..0000000 --- a/llm_judge/model/convert_fp16.py +++ /dev/null @@ -1,26 +0,0 @@ -""" -Usage: -python3 -m fastchat.model.convert_fp16 --in in-folder --out out-folder -""" -import argparse - -from transformers import AutoTokenizer, AutoModelForCausalLM -import torch - - -def convert_fp16(in_checkpoint, out_checkpoint): - tokenizer = AutoTokenizer.from_pretrained(in_checkpoint, use_fast=False) - model = AutoModelForCausalLM.from_pretrained( - in_checkpoint, torch_dtype=torch.float16, low_cpu_mem_usage=True - ) - model.save_pretrained(out_checkpoint) - tokenizer.save_pretrained(out_checkpoint) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--in-checkpoint", type=str, help="Path to the model") - parser.add_argument("--out-checkpoint", type=str, help="Path to the output model") - args = parser.parse_args() - - convert_fp16(args.in_checkpoint, args.out_checkpoint) diff --git a/llm_judge/model/llama_condense_monkey_patch.py b/llm_judge/model/llama_condense_monkey_patch.py deleted file mode 100644 index cb45a8b..0000000 --- a/llm_judge/model/llama_condense_monkey_patch.py +++ /dev/null @@ -1,71 +0,0 @@ -# Code adapted from https://huggingface.co/kaiokendev/superhot-13b-8k-no-rlhf-test/blob/main/llama_rope_scaled_monkey_patch.py - -from functools import partial - -import torch -import transformers -import transformers.models.llama.modeling_llama - - -class CondenseRotaryEmbedding(torch.nn.Module): - def __init__( - self, dim, ratio, max_position_embeddings=2048, base=10000, device=None - ): - super().__init__() - inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float().to(device) / dim)) - self.register_buffer("inv_freq", inv_freq) - - # Build here to make `torch.jit.trace` work. 
- self.ratio = ratio - max_position_embeddings *= ratio - self.max_seq_len_cached = max_position_embeddings - # print(f"Monkey Patching condense ratio {ratio}") - t = ( - torch.arange( - self.max_seq_len_cached, - device=self.inv_freq.device, - dtype=self.inv_freq.dtype, - ) - / ratio - ) - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1) - dtype = torch.get_default_dtype() - self.register_buffer( - "cos_cached", emb.cos()[None, None, :, :].to(dtype), persistent=False - ) - self.register_buffer( - "sin_cached", emb.sin()[None, None, :, :].to(dtype), persistent=False - ) - - def forward(self, x, seq_len=None): - # x: [bs, num_attention_heads, seq_len, head_size] - # This `if` block is unlikely to be run after we build sin/cos in `__init__`. Keep the logic here just in case. - if seq_len > self.max_seq_len_cached: - self.max_seq_len_cached = seq_len - t = ( - torch.arange( - self.max_seq_len_cached, device=x.device, dtype=self.inv_freq.dtype - ) - / self.ratio - ) - freqs = torch.einsum("i,j->ij", t, self.inv_freq) - # Different from paper, but it uses a different permutation in order to obtain the same calculation - emb = torch.cat((freqs, freqs), dim=-1).to(x.device) - self.register_buffer( - "cos_cached", emb.cos()[None, None, :, :].to(x.dtype), persistent=False - ) - self.register_buffer( - "sin_cached", emb.sin()[None, None, :, :].to(x.dtype), persistent=False - ) - return ( - self.cos_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - self.sin_cached[:, :, :seq_len, ...].to(dtype=x.dtype), - ) - - -def replace_llama_with_condense(ratio): - transformers.models.llama.modeling_llama.LlamaRotaryEmbedding = partial( - CondenseRotaryEmbedding, ratio=ratio - ) diff --git a/llm_judge/model/make_delta.py b/llm_judge/model/make_delta.py deleted file mode 100644 index 480ba8f..0000000 --- a/llm_judge/model/make_delta.py +++ /dev/null @@ -1,48 +0,0 @@ -""" -Make the delta weights by subtracting base weights. 
- -Usage: -python3 -m fastchat.model.make_delta --base ~/model_weights/llama-13b --target ~/model_weights/vicuna-13b --delta ~/model_weights/vicuna-13b-delta --hub-repo-id lmsys/vicuna-13b-delta-v1.1 -""" -import argparse - -import torch -from tqdm import tqdm -from transformers import AutoTokenizer, AutoModelForCausalLM - - -def make_delta(base_model_path, target_model_path, delta_path): - print(f"Loading the base model from {base_model_path}") - base = AutoModelForCausalLM.from_pretrained( - base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True - ) - - print(f"Loading the target model from {target_model_path}") - target = AutoModelForCausalLM.from_pretrained( - target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True - ) - target_tokenizer = AutoTokenizer.from_pretrained(target_model_path, use_fast=False) - - print("Calculating the delta") - for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"): - assert name in base.state_dict() - param.data -= base.state_dict()[name] - - print(f"Saving the delta to {delta_path}") - if args.hub_repo_id: - kwargs = {"push_to_hub": True, "repo_id": args.hub_repo_id} - else: - kwargs = {} - target.save_pretrained(delta_path, **kwargs) - target_tokenizer.save_pretrained(delta_path, **kwargs) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--base-model-path", type=str, required=True) - parser.add_argument("--target-model-path", type=str, required=True) - parser.add_argument("--delta-path", type=str, required=True) - parser.add_argument("--hub-repo-id", type=str) - args = parser.parse_args() - - make_delta(args.base_model_path, args.target_model_path, args.delta_path) diff --git a/llm_judge/model/merge.sh b/llm_judge/model/merge.sh deleted file mode 100644 index 01f0ef4..0000000 --- a/llm_judge/model/merge.sh +++ /dev/null @@ -1,4 +0,0 @@ -python3 -m apply_lora \ - --base decapoda-research/llama-13b-hf \ - --target ~/models/alpaca-lora-13b \ - --lora ~/alpaca-lora/lora-alpaca-13b-bf16 diff --git a/llm_judge/model/model_adapter.py b/llm_judge/model/model_adapter.py deleted file mode 100644 index 542829b..0000000 --- a/llm_judge/model/model_adapter.py +++ /dev/null @@ -1,1103 +0,0 @@ -"""Model adapter registration.""" - -import math -import sys -from typing import List, Optional -import warnings - -if sys.version_info >= (3, 9): - from functools import cache -else: - from functools import lru_cache as cache - -import accelerate -import psutil -import torch -from transformers import ( - AutoConfig, - AutoModel, - AutoModelForCausalLM, - AutoModelForSeq2SeqLM, - AutoTokenizer, - LlamaTokenizer, - LlamaForCausalLM, - T5Tokenizer, -) - -from modules.gptq import GptqConfig, load_gptq_quantized -from conversation import Conversation, get_conv_template -from model.compression import load_compress_model -from model.model_chatglm import generate_stream_chatglm -from model.model_codet5p import generate_stream_codet5p -from model.model_falcon import generate_stream_falcon -from model.monkey_patch_non_inplace import ( - replace_llama_attn_with_non_inplace_operations, -) -from utils import get_gpu_memory - - -class BaseModelAdapter: - """The base and the default model adapter.""" - - use_fast_tokenizer = True - - def match(self, model_path: str): - return True - - def load_model(self, model_path: str, from_pretrained_kwargs: dict): - revision = from_pretrained_kwargs.get("revision", "main") - try: - tokenizer = AutoTokenizer.from_pretrained( - model_path, - 
use_fast=self.use_fast_tokenizer, - revision=revision, - ) - except TypeError: - tokenizer = AutoTokenizer.from_pretrained( - model_path, - use_fast=False, - revision=revision, - ) - - model = AutoModelForCausalLM.from_pretrained( - model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs - ) - return model, tokenizer - - def load_compress_model(self, model_path, device, torch_dtype, revision="main"): - return load_compress_model( - model_path, - device, - torch_dtype, - use_fast=self.use_fast_tokenizer, - revision=revision, - ) - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("one_shot") - - -# A global registry for all model adapters -# TODO (lmzheng): make it a priority queue. -model_adapters: List[BaseModelAdapter] = [] - - -def register_model_adapter(cls): - """Register a model adapter.""" - model_adapters.append(cls()) - - -@cache -def get_model_adapter(model_path: str) -> BaseModelAdapter: - """Get a model adapter for a model_path.""" - for adapter in model_adapters: - if adapter.match(model_path): - return adapter - raise ValueError(f"No valid model adapter for {model_path}") - - -def raise_warning_for_incompatible_cpu_offloading_configuration( - device: str, load_8bit: bool, cpu_offloading: bool -): - if cpu_offloading: - if not load_8bit: - warnings.warn( - "The cpu-offloading feature can only be used while also using 8-bit-quantization.\n" - "Use '--load-8bit' to enable 8-bit-quantization\n" - "Continuing without cpu-offloading enabled\n" - ) - return False - if not "linux" in sys.platform: - warnings.warn( - "CPU-offloading is only supported on linux-systems due to the limited compatability with the bitsandbytes-package\n" - "Continuing without cpu-offloading enabled\n" - ) - return False - if device != "cuda": - warnings.warn( - "CPU-offloading is only enabled when using CUDA-devices\n" - "Continuing without cpu-offloading enabled\n" - ) - return False - return cpu_offloading - - -def load_model( - model_path: str, - device: str, - num_gpus: int, - max_gpu_memory: Optional[str] = None, - load_8bit: bool = False, - cpu_offloading: bool = False, - gptq_config: Optional[GptqConfig] = None, - revision: str = "main", - debug: bool = False, -): - """Load a model from Hugging Face.""" - - # get model adapter - adapter = get_model_adapter(model_path) - - # Handle device mapping - cpu_offloading = raise_warning_for_incompatible_cpu_offloading_configuration( - device, load_8bit, cpu_offloading - ) - if device == "cpu": - kwargs = {"torch_dtype": torch.float32} - elif device == "cuda": - kwargs = {"torch_dtype": torch.float16} - if num_gpus != 1: - kwargs["device_map"] = "auto" - if max_gpu_memory is None: - kwargs[ - "device_map" - ] = "sequential" # This is important for not the same VRAM sizes - available_gpu_memory = get_gpu_memory(num_gpus) - kwargs["max_memory"] = { - i: str(int(available_gpu_memory[i] * 0.85)) + "GiB" - for i in range(num_gpus) - } - else: - kwargs["max_memory"] = {i: max_gpu_memory for i in range(num_gpus)} - elif device == "mps": - kwargs = {"torch_dtype": torch.float16} - # Avoid bugs in mps backend by not using in-place operations. - replace_llama_attn_with_non_inplace_operations() - elif device == "xpu": - kwargs = {"torch_dtype": torch.bfloat16} - # Try to load ipex, while it looks unused, it links into torch for xpu support - try: - import intel_extension_for_pytorch as ipex - except ImportError: - warnings.warn( - "Intel Extension for PyTorch is not installed, but is required for xpu inference." 
- ) - else: - raise ValueError(f"Invalid device: {device}") - - if cpu_offloading: - # raises an error on incompatible platforms - from transformers import BitsAndBytesConfig - - if "max_memory" in kwargs: - kwargs["max_memory"]["cpu"] = ( - str(math.floor(psutil.virtual_memory().available / 2**20)) + "Mib" - ) - kwargs["quantization_config"] = BitsAndBytesConfig( - load_in_8bit_fp32_cpu_offload=cpu_offloading - ) - kwargs["load_in_8bit"] = load_8bit - elif load_8bit: - if num_gpus != 1: - warnings.warn( - "8-bit quantization is not supported for multi-gpu inference." - ) - else: - return adapter.load_compress_model( - model_path=model_path, - device=device, - torch_dtype=kwargs["torch_dtype"], - revision=revision, - ) - elif gptq_config and gptq_config.wbits < 16: - model, tokenizer = load_gptq_quantized(model_path, gptq_config) - if num_gpus != 1: - device_map = accelerate.infer_auto_device_map( - model, - max_memory=kwargs["max_memory"], - no_split_module_classes=["LlamaDecoderLayer"], - ) - model = accelerate.dispatch_model( - model, device_map=device_map, offload_buffers=True - ) - else: - model.to(device) - return model, tokenizer - kwargs["revision"] = revision - - # Load model - adapter = get_model_adapter(model_path) - model, tokenizer = adapter.load_model(model_path, kwargs) - - if (device == "cuda" and num_gpus == 1 and not cpu_offloading) or device == "mps": - model.to(device) - - elif device == "xpu": - model.eval() - model = model.to("xpu") - model = torch.xpu.optimize(model, dtype=torch.bfloat16, inplace=True) - - if debug: - print(model) - - return model, tokenizer - - -def get_conversation_template(model_path: str) -> Conversation: - """Get the default conversation template.""" - adapter = get_model_adapter(model_path) - return adapter.get_default_conv_template(model_path) - - -def get_generate_stream_function(model: torch.nn.Module, model_path: str): - """Get the generate_stream function for inference.""" - from fastchat.serve.inference import generate_stream - - model_type = str(type(model)).lower() - is_chatglm = "chatglm" in model_type - is_falcon = "rwforcausallm" in model_type - is_codet5p = "codet5p" in model_type - - if is_chatglm: - return generate_stream_chatglm - elif is_falcon: - return generate_stream_falcon - elif is_codet5p: - return generate_stream_codet5p - else: - return generate_stream - - -def add_model_args(parser): - parser.add_argument( - "--model-path", - type=str, - default="lmsys/vicuna-7b-v1.3", - help="The path to the weights. This can be a local folder or a Hugging Face repo ID.", - ) - parser.add_argument( - "--revision", - type=str, - default="main", - help="Hugging Face Hub model revision identifier", - ) - parser.add_argument( - "--device", - type=str, - choices=["cpu", "cuda", "mps", "xpu"], - default="cuda", - help="The device type", - ) - parser.add_argument( - "--gpus", - type=str, - default=None, - help="A single GPU like 1 or multiple GPUs like 0,2", - ) - parser.add_argument("--num-gpus", type=int, default=1) - parser.add_argument( - "--max-gpu-memory", - type=str, - help="The maximum memory per gpu. Use a string like '13Gib'", - ) - parser.add_argument( - "--load-8bit", action="store_true", help="Use 8-bit quantization" - ) - parser.add_argument( - "--cpu-offloading", - action="store_true", - help="Only when using 8-bit quantization: Offload excess weights to the CPU that don't fit on the GPU", - ) - parser.add_argument( - "--gptq-ckpt", - type=str, - default=None, - help="Load quantized model. 
The path to the local GPTQ checkpoint.", - ) - parser.add_argument( - "--gptq-wbits", - type=int, - default=16, - choices=[2, 3, 4, 8, 16], - help="#bits to use for quantization", - ) - parser.add_argument( - "--gptq-groupsize", - type=int, - default=-1, - help="Groupsize to use for quantization; default uses full row.", - ) - parser.add_argument( - "--gptq-act-order", - action="store_true", - help="Whether to apply the activation order GPTQ heuristic", - ) - - -def remove_parent_directory_name(model_path): - """Remove parent directory name.""" - if model_path[-1] == "/": - model_path = model_path[:-1] - return model_path.split("/")[-1] - - -class PeftModelAdapter: - """Loads any "peft" model and it's base model.""" - - def match(self, model_path: str): - """Accepts any model path with "peft" in the name""" - return "peft" in model_path - - def load_model(self, model_path: str, from_pretrained_kwargs: dict): - """Loads the base model then the (peft) adapter weights""" - from peft import PeftConfig, PeftModel - - config = PeftConfig.from_pretrained(model_path) - base_model_path = config.base_model_name_or_path - if "peft" in base_model_path: - raise ValueError( - f"PeftModelAdapter cannot load a base model with 'peft' in the name: {config.base_model_name_or_path}" - ) - - base_adapter = get_model_adapter(base_model_path) - base_model, tokenizer = base_adapter.load_model( - base_model_path, from_pretrained_kwargs - ) - model = PeftModel.from_pretrained(base_model, model_path) - - return model, tokenizer - - def get_default_conv_template(self, model_path: str) -> Conversation: - """Uses the conv template of the base model""" - from peft import PeftConfig, PeftModel - - config = PeftConfig.from_pretrained(model_path) - if "peft" in config.base_model_name_or_path: - raise ValueError( - f"PeftModelAdapter cannot load a base model with 'peft' in the name: {config.base_model_name_or_path}" - ) - base_model_path = config.base_model_name_or_path - base_adapter = get_model_adapter(base_model_path) - return base_adapter.get_default_conv_template(config.base_model_name_or_path) - - -class VicunaAdapter(BaseModelAdapter): - "Model adapater for Vicuna models (e.g., lmsys/vicuna-7b-v1.3)" "" - - use_fast_tokenizer = False - - def match(self, model_path: str): - return "vicuna" in model_path - - def load_model(self, model_path: str, from_pretrained_kwargs: dict): - revision = from_pretrained_kwargs.get("revision", "main") - tokenizer = AutoTokenizer.from_pretrained( - model_path, use_fast=self.use_fast_tokenizer, revision=revision - ) - model = AutoModelForCausalLM.from_pretrained( - model_path, - low_cpu_mem_usage=True, - **from_pretrained_kwargs, - ) - self.raise_warning_for_old_weights(model) - return model, tokenizer - - def get_default_conv_template(self, model_path: str) -> Conversation: - if "v0" in remove_parent_directory_name(model_path): - return get_conv_template("one_shot") - return get_conv_template("vicuna_v1.1") - - def raise_warning_for_old_weights(self, model): - if isinstance(model, LlamaForCausalLM) and model.model.vocab_size > 32000: - warnings.warn( - "\nYou are probably using the old Vicuna-v0 model, " - "which will generate unexpected results with the " - "current fastchat.\nYou can try one of the following methods:\n" - "1. Upgrade your weights to the new Vicuna-v1.3: https://github.com/lm-sys/FastChat#vicuna-weights.\n" - "2. Use the old conversation template by `python3 -m fastchat.serve.cli --model-path /path/to/vicuna-v0 --conv-template conv_one_shot`\n" - "3. 
Downgrade fschat to fschat==0.1.10 (Not recommonded).\n" - ) - - -class AiroborosAdapter(BaseModelAdapter): - """The model adapter for jondurbin/airoboros-*""" - - def match(self, model_path: str): - return "airoboros" in model_path - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("airoboros_v1") - - def load_model(self, model_path: str, from_pretrained_kwargs: dict): - if "mpt" not in model_path: - return super().load_model(model_path, from_pretrained_kwargs) - model = AutoModelForCausalLM.from_pretrained( - model_path, - low_cpu_mem_usage=True, - trust_remote_code=True, - max_seq_len=8192, - **from_pretrained_kwargs, - ) - tokenizer = AutoTokenizer.from_pretrained( - model_path, trust_remote_code=True, use_fast=True - ) - return model, tokenizer - - -class LongChatAdapter(BaseModelAdapter): - "Model adapater for LongChat models (e.g., lmsys/longchat-7b-16k)." - - use_fast_tokenizer = False - - def match(self, model_path: str): - return "longchat" in model_path - - def load_model(self, model_path: str, from_pretrained_kwargs: dict): - revision = from_pretrained_kwargs.get("revision", "main") - config = AutoConfig.from_pretrained(model_path, revision=revision) - - # Apply monkey patch, TODO(Dacheng): Add flash attention support - from fastchat.model.llama_condense_monkey_patch import ( - replace_llama_with_condense, - ) - - replace_llama_with_condense(config.rope_condense_ratio) - - tokenizer = AutoTokenizer.from_pretrained( - model_path, use_fast=self.use_fast_tokenizer, revision=revision - ) - model = AutoModelForCausalLM.from_pretrained( - model_path, - low_cpu_mem_usage=True, - **from_pretrained_kwargs, - ) - return model, tokenizer - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("vicuna_v1.1") - - -class CodeT5pAdapter(BaseModelAdapter): - """The model adapter for Salesforce/codet5p-6b""" - - def match(self, model_path: str): - return "codet5p" in model_path - - def load_model(self, model_path: str, from_pretrained_kwargs: dict): - revision = from_pretrained_kwargs.get("revision", "main") - tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision) - model = AutoModelForSeq2SeqLM.from_pretrained( - model_path, - low_cpu_mem_usage=True, - trust_remote_code=True, - **from_pretrained_kwargs, - ) - return model, tokenizer - - -class T5Adapter(BaseModelAdapter): - """The model adapter for lmsys/fastchat-t5-3b-v1.0""" - - def match(self, model_path: str): - return "t5" in model_path - - def load_model(self, model_path: str, from_pretrained_kwargs: dict): - revision = from_pretrained_kwargs.get("revision", "main") - tokenizer = T5Tokenizer.from_pretrained(model_path, revision=revision) - model = AutoModelForSeq2SeqLM.from_pretrained( - model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs - ) - return model, tokenizer - - -class KoalaAdapter(BaseModelAdapter): - """The model adapter for koala""" - - use_fast_tokenizer = False - - def match(self, model_path: str): - return "koala" in model_path - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("koala_v1") - - -class AlpacaAdapter(BaseModelAdapter): - """The model adapter for alpaca""" - - use_fast_tokenizer = False - - def match(self, model_path: str): - return "alpaca" in model_path.lower() - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("alpaca") - - -class ChatGLMAdapter(BaseModelAdapter): - """The model adapter 
for THUDM/chatglm-6b, THUDM/chatglm2-6b""" - - def match(self, model_path: str): - return "chatglm" in model_path.lower() - - def load_model(self, model_path: str, from_pretrained_kwargs: dict): - revision = from_pretrained_kwargs.get("revision", "main") - tokenizer = AutoTokenizer.from_pretrained( - model_path, trust_remote_code=True, revision=revision - ) - model = AutoModel.from_pretrained( - model_path, trust_remote_code=True, **from_pretrained_kwargs - ) - return model, tokenizer - - def get_default_conv_template(self, model_path: str) -> Conversation: - model_path = model_path.lower() - if "chatglm2" in model_path: - return get_conv_template("chatglm2") - return get_conv_template("chatglm") - - -class DollyV2Adapter(BaseModelAdapter): - """The model adapter for databricks/dolly-v2-12b""" - - def match(self, model_path: str): - return "dolly-v2" in model_path - - def load_model(self, model_path: str, from_pretrained_kwargs: dict): - revision = from_pretrained_kwargs.get("revision", "main") - tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision) - model = AutoModelForCausalLM.from_pretrained( - model_path, - low_cpu_mem_usage=True, - **from_pretrained_kwargs, - ) - # 50277 means "### End" - tokenizer.eos_token_id = 50277 - model.config.eos_token_id = tokenizer.eos_token_id - model.config.pad_token_id = tokenizer.pad_token_id - return model, tokenizer - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("dolly_v2") - - -class OasstPythiaAdapter(BaseModelAdapter): - """The model adapter for OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5""" - - def match(self, model_path: str): - return "oasst" in model_path and "pythia" in model_path - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("oasst_pythia") - - def load_model(self, model_path: str, from_pretrained_kwargs: dict): - model, tokenizer = super().load_model(model_path, from_pretrained_kwargs) - model.config.eos_token_id = tokenizer.eos_token_id - model.config.pad_token_id = tokenizer.pad_token_id - return model, tokenizer - - -class OasstLLaMAAdapter(BaseModelAdapter): - """The model adapter for OpenAssistant/oasst-sft-7-llama-30b""" - - use_fast_tokenizer = False - - def match(self, model_path: str): - if "OpenAssistant-SFT-7-Llama-30B-HF" in model_path: - return True - return "oasst" in model_path and "pythia" not in model_path - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("oasst_llama") - - -class PythiaAdapter(BaseModelAdapter): - """The model adapter for any EleutherAI/pythia model""" - - def match(self, model_path: str): - return "pythia" in model_path - - def load_model(self, model_path: str, from_pretrained_kwargs: dict): - model, tokenizer = super().load_model(model_path, from_pretrained_kwargs) - model.config.eos_token_id = tokenizer.eos_token_id - model.config.pad_token_id = tokenizer.pad_token_id - return model, tokenizer - - -class StableLMAdapter(BaseModelAdapter): - """The model adapter for StabilityAI/stablelm-tuned-alpha-7b""" - - def match(self, model_path: str): - return "stablelm" in model_path - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("stablelm") - - -class MPTAdapter(BaseModelAdapter): - """The model adapter for MPT series (mosaicml/mpt-7b-chat, mosaicml/mpt-30b-chat)""" - - def match(self, model_path: str): - return "mpt" in model_path and not "airoboros" in model_path - - def 
load_model(self, model_path: str, from_pretrained_kwargs: dict): - revision = from_pretrained_kwargs.get("revision", "main") - model = AutoModelForCausalLM.from_pretrained( - model_path, - low_cpu_mem_usage=True, - trust_remote_code=True, - max_seq_len=8192, - **from_pretrained_kwargs, - ) - tokenizer = AutoTokenizer.from_pretrained( - model_path, trust_remote_code=True, revision=revision - ) - model.config.eos_token_id = tokenizer.eos_token_id - model.config.pad_token_id = tokenizer.pad_token_id - return model, tokenizer - - def get_default_conv_template(self, model_path: str) -> Conversation: - if "mpt-7b-chat" in model_path: - return get_conv_template("mpt-7b-chat") - elif "mpt-30b-chat" in model_path: - return get_conv_template("mpt-30b-chat") - elif "mpt-30b-instruct" in model_path: - return get_conv_template("mpt-30b-instruct") - else: - print( - "Warning: Loading base MPT model with `zero_shot` conversation configuration. " - "If this is not desired, inspect model configurations and names." - ) - return get_conv_template("zero_shot") - - -class BaizeAdapter(BaseModelAdapter): - """The model adapter for project-baize/baize-v2-7b""" - - use_fast_tokenizer = False - - def match(self, model_path: str): - return "baize" in model_path - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("baize") - - -class RwkvAdapter(BaseModelAdapter): - """The model adapter for BlinkDL/RWKV-4-Raven""" - - def match(self, model_path: str): - return "RWKV-4" in model_path - - def load_model(self, model_path: str, from_pretrained_kwargs: dict): - from fastchat.model.rwkv_model import RwkvModel - - model = RwkvModel(model_path) - revision = from_pretrained_kwargs.get("revision", "main") - tokenizer = AutoTokenizer.from_pretrained( - "EleutherAI/pythia-160m", revision=revision - ) - return model, tokenizer - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("rwkv") - - -class OpenBuddyAdapter(BaseModelAdapter): - """The model adapter for OpenBuddy/openbuddy-7b-v1.1-bf16-enc""" - - use_fast_tokenizer = False - - def match(self, model_path: str): - return "openbuddy" in model_path - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("openbuddy") - - -class PhoenixAdapter(BaseModelAdapter): - """The model adapter for FreedomIntelligence/phoenix-inst-chat-7b""" - - def match(self, model_path: str): - return "phoenix" in model_path - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("phoenix") - - -class ChatGPTAdapter(BaseModelAdapter): - """The model adapter for ChatGPT""" - - def match(self, model_path: str): - return model_path in ("gpt-3.5-turbo", "gpt-4") - - def load_model(self, model_path: str, from_pretrained_kwargs: dict): - raise NotImplementedError() - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("chatgpt") - - -class ClaudeAdapter(BaseModelAdapter): - """The model adapter for Claude""" - - def match(self, model_path: str): - return model_path in ["claude-v1", "claude-instant-v1"] - - def load_model(self, model_path: str, from_pretrained_kwargs: dict): - raise NotImplementedError() - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("claude") - - -class BardAdapter(BaseModelAdapter): - """The model adapter for Bard""" - - def match(self, model_path: str): - return model_path == "bard" - - def load_model(self, 
model_path: str, from_pretrained_kwargs: dict): - raise NotImplementedError() - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("bard") - - -class PaLM2Adapter(BaseModelAdapter): - """The model adapter for PaLM2""" - - def match(self, model_path: str): - return model_path == "palm-2" - - def load_model(self, model_path: str, from_pretrained_kwargs: dict): - raise NotImplementedError() - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("bard") - - -class BiLLaAdapter(BaseModelAdapter): - """The model adapter for Neutralzz/BiLLa-7B-SFT""" - - def match(self, model_path: str): - return "billa" in model_path.lower() - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("billa") - - -class RedPajamaINCITEAdapter(BaseModelAdapter): - """The model adapter for togethercomputer/RedPajama-INCITE-7B-Chat""" - - def match(self, model_path: str): - return "redpajama-incite" in model_path.lower() - - def load_model(self, model_path: str, from_pretrained_kwargs: dict): - revision = from_pretrained_kwargs.get("revision", "main") - tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision) - model = AutoModelForCausalLM.from_pretrained( - model_path, - low_cpu_mem_usage=True, - **from_pretrained_kwargs, - ) - return model, tokenizer - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("redpajama-incite") - - -class H2OGPTAdapter(BaseModelAdapter): - """The model adapter for h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b""" - - use_fast_tokenizer = False - - def match(self, model_path: str): - return "h2ogpt" in model_path.lower() - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("h2ogpt") - - -class RobinAdapter(BaseModelAdapter): - """The model adapter for LMFlow/Full-Robin-7b-v2""" - - use_fast_tokenizer = False - - def match(self, model_path: str): - return "Robin" in model_path - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("Robin") - - -class SnoozyAdapter(BaseModelAdapter): - """The model adapter for nomic-ai/gpt4all-13b-snoozy""" - - use_fast_tokenizer = False - - def match(self, model_path: str): - return "gpt4all" in model_path and "snoozy" in model_path - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("snoozy") - - -class WizardLMAdapter(BaseModelAdapter): - """The model adapter for WizardLM/WizardLM-13B-V1.0""" - - use_fast_tokenizer = False - - def match(self, model_path: str): - return "wizardlm" in model_path.lower() - - def get_default_conv_template(self, model_path: str) -> Conversation: - model_path = model_path.lower() - if "13b" in model_path or "30b" in model_path: - return get_conv_template("vicuna_v1.1") - else: - # TODO: use the recommended template for 7B - # (https://huggingface.co/WizardLM/WizardLM-13B-V1.0) - return get_conv_template("one_shot") - - -class ManticoreAdapter(BaseModelAdapter): - """The model adapter for openaccess-ai-collective/manticore-13b-chat-pyg""" - - use_fast_tokenizer = False - - def match(self, model_path: str): - return "manticore" in model_path.lower() - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("manticore") - - -class GuanacoAdapter(BaseModelAdapter): - """The model adapter for timdettmers/guanaco-33b-merged""" - - use_fast_tokenizer = 
False - - def match(self, model_path: str): - return "guanaco" in model_path - - def load_model(self, model_path: str, from_pretrained_kwargs: dict): - revision = from_pretrained_kwargs.get("revision", "main") - tokenizer = AutoTokenizer.from_pretrained( - model_path, use_fast=self.use_fast_tokenizer, revision=revision - ) - model = AutoModelForCausalLM.from_pretrained( - model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs - ) - # Fix a bug in tokenizer config - tokenizer.eos_token_id = model.config.eos_token_id - return model, tokenizer - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("zero_shot") - - -class ChangGPTAdapter(BaseModelAdapter): - """The model adapter for lcw99/polyglot-ko-12.8b-chang-instruct-chat""" - - def match(self, model_path: str): - return "polyglot" in model_path and "chang" in model_path - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("polyglot_changgpt") - - -class CamelAdapter(BaseModelAdapter): - """The model adapter for camel-ai/CAMEL-13B-Combined-Data""" - - use_fast_tokenizer = False - - def match(self, model_path: str): - return "camel" in model_path - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("vicuna_v1.1") - - -class TuluAdapter(BaseModelAdapter): - """The model adapter for allenai/tulu-30b""" - - use_fast_tokenizer = False - - def match(self, model_path: str): - return "tulu" in model_path - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("tulu") - - -class FalconAdapter(BaseModelAdapter): - """The model adapter for tiiuae/falcon-40b.""" - - def match(self, model_path: str): - return "falcon" in model_path.lower() - - def load_model(self, model_path: str, from_pretrained_kwargs: dict): - revision = from_pretrained_kwargs.get("revision", "main") - # Strongly suggest using bf16, which is recommended by the author of Falcon - tokenizer = AutoTokenizer.from_pretrained(model_path, revision=revision) - model = AutoModelForCausalLM.from_pretrained( - model_path, - low_cpu_mem_usage=True, - trust_remote_code=True, - **from_pretrained_kwargs, - ) - # In Falcon tokenizer config and special config there is not any pad token - # Setting `pad_token_id` to 9, which corresponds to special token '>>SUFFIX<<' - tokenizer.pad_token_id = 9 - return model, tokenizer - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("falcon") - - -class TigerBotAdapter(BaseModelAdapter): - """The model adapter for TigerResearch/tigerbot-7b-sft""" - - def match(self, model_path: str): - return "tigerbot" in model_path.lower() - - def load_model(self, model_path: str, from_pretrained_kwargs: dict): - revision = from_pretrained_kwargs.get("revision", "main") - tokenizer = AutoTokenizer.from_pretrained( - model_path, - trust_remote_code=True, - revision=revision, - ) - model = AutoModelForCausalLM.from_pretrained( - model_path, - trust_remote_code=True, - low_cpu_mem_usage=True, - **from_pretrained_kwargs, - ) - return model, tokenizer - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("tigerbot") - - -class BaichuanAdapter(BaseModelAdapter): - """The model adapter for baichuan-inc/baichuan-7B""" - - def match(self, model_path: str): - return "baichuan" in model_path - - def load_model(self, model_path: str, from_pretrained_kwargs: dict): - revision = 
from_pretrained_kwargs.get("revision", "main") - tokenizer = AutoTokenizer.from_pretrained( - model_path, trust_remote_code=True, revision=revision - ) - model = AutoModelForCausalLM.from_pretrained( - model_path, - trust_remote_code=True, - low_cpu_mem_usage=True, - **from_pretrained_kwargs, - ) - return model, tokenizer - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("one_shot") - - -class XGenAdapter(BaseModelAdapter): - """The model adapter for Salesforce/xgen-7b""" - - def match(self, model_path: str): - return "xgen" in model_path - - def load_model(self, model_path: str, from_pretrained_kwargs: dict): - revision = from_pretrained_kwargs.get("revision", "main") - model = AutoModelForCausalLM.from_pretrained( - model_path, - low_cpu_mem_usage=True, - trust_remote_code=True, - **from_pretrained_kwargs, - ) - tokenizer = AutoTokenizer.from_pretrained( - model_path, trust_remote_code=True, revision=revision - ) - model.config.eos_token_id = 50256 - return model, tokenizer - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("xgen") - - -class NousHermesAdapter(BaseModelAdapter): - """The model adapter for NousResearch/Nous-Hermes-13b""" - - use_fast_tokenizer = False - - def match(self, model_path: str): - return "Nous-Hermes" in model_path - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("alpaca") - - -class InternLMChatAdapter(BaseModelAdapter): - """The model adapter for internlm/internlm-chat-7b""" - - def match(self, model_path: str): - return "internlm-chat" in model_path.lower() - - def load_model(self, model_path: str, from_pretrained_kwargs: dict): - revision = from_pretrained_kwargs.get("revision", "main") - model = AutoModelForCausalLM.from_pretrained( - model_path, - low_cpu_mem_usage=True, - trust_remote_code=True, - **from_pretrained_kwargs, - ) - model = model.eval() - if "8k" in model_path.lower(): - model.config.max_sequence_length = 8192 - tokenizer = AutoTokenizer.from_pretrained( - model_path, trust_remote_code=True, revision=revision - ) - return model, tokenizer - - def get_default_conv_template(self, model_path: str) -> Conversation: - return get_conv_template("internlm-chat") - - -# Note: the registration order matters. -# The one registered earlier has a higher matching priority. 
-register_model_adapter(PeftModelAdapter) -register_model_adapter(VicunaAdapter) -register_model_adapter(AiroborosAdapter) -register_model_adapter(LongChatAdapter) -register_model_adapter(CodeT5pAdapter) -register_model_adapter(T5Adapter) -register_model_adapter(KoalaAdapter) -register_model_adapter(AlpacaAdapter) -register_model_adapter(ChatGLMAdapter) -register_model_adapter(DollyV2Adapter) -register_model_adapter(OasstPythiaAdapter) -register_model_adapter(OasstLLaMAAdapter) -register_model_adapter(StableLMAdapter) -register_model_adapter(BaizeAdapter) -register_model_adapter(RwkvAdapter) -register_model_adapter(OpenBuddyAdapter) -register_model_adapter(PhoenixAdapter) -register_model_adapter(BardAdapter) -register_model_adapter(PaLM2Adapter) -register_model_adapter(ChatGPTAdapter) -register_model_adapter(ClaudeAdapter) -register_model_adapter(MPTAdapter) -register_model_adapter(BiLLaAdapter) -register_model_adapter(RedPajamaINCITEAdapter) -register_model_adapter(H2OGPTAdapter) -register_model_adapter(RobinAdapter) -register_model_adapter(SnoozyAdapter) -register_model_adapter(WizardLMAdapter) -register_model_adapter(ManticoreAdapter) -register_model_adapter(GuanacoAdapter) -register_model_adapter(CamelAdapter) -register_model_adapter(ChangGPTAdapter) -register_model_adapter(TuluAdapter) -register_model_adapter(FalconAdapter) -register_model_adapter(TigerBotAdapter) -register_model_adapter(BaichuanAdapter) -register_model_adapter(XGenAdapter) -register_model_adapter(NousHermesAdapter) -register_model_adapter(PythiaAdapter) -register_model_adapter(InternLMChatAdapter) - -# After all adapters, try the default base adapter. -register_model_adapter(BaseModelAdapter) diff --git a/llm_judge/model/model_chatglm.py b/llm_judge/model/model_chatglm.py deleted file mode 100644 index 5d4db62..0000000 --- a/llm_judge/model/model_chatglm.py +++ /dev/null @@ -1,102 +0,0 @@ -""" -Inference code for ChatGLM. -Adapted from https://huggingface.co/THUDM/chatglm-6b/blob/main/modeling_chatglm.py. 
-""" -import re - -import torch -from transformers.generation.logits_process import LogitsProcessor - - -class InvalidScoreLogitsProcessor(LogitsProcessor): - def __call__( - self, input_ids: torch.LongTensor, scores: torch.FloatTensor - ) -> torch.FloatTensor: - if torch.isnan(scores).any() or torch.isinf(scores).any(): - scores.zero_() - scores[..., 5] = 5e4 - return scores - - -invalid_score_processor = InvalidScoreLogitsProcessor() - - -def process_response(response): - response = response.strip() - response = response.replace("[[训练时间]]", "2023年") - punkts = [ - [",", ","], - ["!", "!"], - [":", ":"], - [";", ";"], - ["\?", "?"], - ] - for item in punkts: - response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], r"\1%s" % item[1], response) - response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], r"%s\1" % item[1], response) - return response - - -@torch.inference_mode() -def generate_stream_chatglm( - model, - tokenizer, - params, - device, - context_len=2048, - stream_interval=2, - judge_sent_end=False, -): - prompt = params["prompt"] - temperature = float(params.get("temperature", 1.0)) - repetition_penalty = float(params.get("repetition_penalty", 1.0)) - top_p = float(params.get("top_p", 1.0)) - max_new_tokens = int(params.get("max_new_tokens", 256)) - echo = params.get("echo", True) - - inputs = tokenizer([prompt], return_tensors="pt").to(model.device) - input_echo_len = len(inputs["input_ids"][0]) - - gen_kwargs = { - "max_length": max_new_tokens + input_echo_len, - "do_sample": True if temperature > 1e-5 else False, - "top_p": top_p, - "repetition_penalty": repetition_penalty, - "logits_processor": [invalid_score_processor], - } - if temperature > 1e-5: - gen_kwargs["temperature"] = temperature - - total_len = 0 - for total_ids in model.stream_generate(**inputs, **gen_kwargs): - total_ids = total_ids.tolist()[0] - total_len = len(total_ids) - if echo: - output_ids = total_ids - else: - output_ids = total_ids[input_echo_len:] - response = tokenizer.decode(output_ids) - response = process_response(response) - - yield { - "text": response, - "usage": { - "prompt_tokens": input_echo_len, - "completion_tokens": total_len - input_echo_len, - "total_tokens": total_len, - }, - "finish_reason": None, - } - - # TODO: ChatGLM stop when it reach max length - # Only last stream result contains finish_reason, we set finish_reason as stop - ret = { - "text": response, - "usage": { - "prompt_tokens": input_echo_len, - "completion_tokens": total_len - input_echo_len, - "total_tokens": total_len, - }, - "finish_reason": "stop", - } - yield ret diff --git a/llm_judge/model/model_codet5p.py b/llm_judge/model/model_codet5p.py deleted file mode 100644 index 4cd9f4a..0000000 --- a/llm_judge/model/model_codet5p.py +++ /dev/null @@ -1,106 +0,0 @@ -import gc -from threading import Thread -import torch -import transformers -from transformers import ( - GenerationConfig, - StoppingCriteria, - StoppingCriteriaList, - TextIteratorStreamer, -) - -transformers.logging.set_verbosity_error() - - -@torch.inference_mode() -def generate_stream_codet5p( - model, - tokenizer, - params, - device, - context_len=2048, - stream_interval=2, - judge_sent_end=False, -): - prompt = params["prompt"] - temperature = float(params.get("temperature", 1.0)) - repetition_penalty = float(params.get("repetition_penalty", 1.0)) - top_p = float(params.get("top_p", 1.0)) - top_k = int(params.get("top_k", 50)) # -1 means disable - max_new_tokens = int(params.get("max_new_tokens", 1024)) - stop_token_ids = params.get("stop_token_ids", None) or 
[] - stop_token_ids.append(tokenizer.eos_token_id) - - decode_config = dict(skip_special_tokens=True, clean_up_tokenization_spaces=True) - streamer = TextIteratorStreamer(tokenizer, **decode_config) - encoding = tokenizer(prompt, return_tensors="pt").to(device) - input_ids = encoding.input_ids - encoding["decoder_input_ids"] = encoding["input_ids"].clone() - input_echo_len = len(input_ids) - - generation_config = GenerationConfig( - max_new_tokens=max_new_tokens, - do_sample=temperature >= 1e-5, - temperature=temperature, - repetition_penalty=repetition_penalty, - no_repeat_ngram_size=10, - top_p=top_p, - top_k=top_k, - eos_token_id=stop_token_ids, - ) - - class CodeBlockStopper(StoppingCriteria): - def __call__( - self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs - ) -> bool: - # Code-completion is open-end generation. - # We check \n\n to stop at end of a code block. - if list(input_ids[0][-2:]) == [628, 198]: - return True - return False - - gen_kwargs = dict( - **encoding, - streamer=streamer, - generation_config=generation_config, - stopping_criteria=StoppingCriteriaList([CodeBlockStopper()]), - ) - thread = Thread(target=model.generate, kwargs=gen_kwargs) - thread.start() - i = 0 - output = "" - for new_text in streamer: - i += 1 - output += new_text - if i % stream_interval == 0 or i == max_new_tokens - 1: - yield { - "text": output, - "usage": { - "prompt_tokens": input_echo_len, - "completion_tokens": i, - "total_tokens": input_echo_len + i, - }, - "finish_reason": None, - } - if i >= max_new_tokens: - break - - if i >= max_new_tokens: - finish_reason = "length" - else: - finish_reason = "stop" - - yield { - "text": output, - "usage": { - "prompt_tokens": input_echo_len, - "completion_tokens": i, - "total_tokens": input_echo_len + i, - }, - "finish_reason": finish_reason, - } - thread.join() - - # clean - gc.collect() - torch.cuda.empty_cache() diff --git a/llm_judge/model/model_falcon.py b/llm_judge/model/model_falcon.py deleted file mode 100644 index b15ef12..0000000 --- a/llm_judge/model/model_falcon.py +++ /dev/null @@ -1,138 +0,0 @@ -import gc -from threading import Thread -from typing import Iterable - -import torch -import transformers -from transformers import TextIteratorStreamer, GenerationConfig - -from utils import is_partial_stop - -transformers.logging.set_verbosity_error() - - -@torch.inference_mode() -def generate_stream_falcon( - model, - tokenizer, - params, - device, - context_len=2048, - stream_interval=2, - judge_sent_end=False, -): - prompt = params["prompt"] - len_prompt = len(prompt) - temperature = float(params.get("temperature", 1.0)) - repetition_penalty = float(params.get("repetition_penalty", 1.0)) - top_p = float(params.get("top_p", 1.0)) - top_k = int(params.get("top_k", 50)) # -1 means disable - max_new_tokens = int(params.get("max_new_tokens", 256)) - stop_str = params.get("stop", None) - echo = bool(params.get("echo", True)) - stop_token_ids = params.get("stop_token_ids", None) or [] - stop_token_ids.append(tokenizer.eos_token_id) - - inputs = tokenizer(prompt, return_tensors="pt").to(model.device) - input_ids = inputs["input_ids"] - attention_mask = inputs["attention_mask"] - - max_src_len = context_len - max_new_tokens - 8 - - input_ids = input_ids[-max_src_len:] # truncate from the left - attention_mask = attention_mask[-max_src_len:] # truncate from the left - input_echo_len = len(input_ids) - - decode_config = dict(skip_special_tokens=True, clean_up_tokenization_spaces=True) - streamer = TextIteratorStreamer(tokenizer, 
skip_prompt=True, **decode_config) - - generation_config = GenerationConfig( - max_new_tokens=max_new_tokens, - do_sample=temperature >= 1e-5, - temperature=temperature, - repetition_penalty=repetition_penalty, - no_repeat_ngram_size=10, - top_p=top_p, - top_k=top_k, - eos_token_id=stop_token_ids, - ) - - generation_kwargs = dict( - inputs=input_ids, - attention_mask=attention_mask, - streamer=streamer, - generation_config=generation_config, - ) - - thread = Thread(target=model.generate, kwargs=generation_kwargs) - thread.start() - - if echo: - # means keep the prompt - output = prompt - else: - output = "" - - for i, new_text in enumerate(streamer): - output += new_text - if i % stream_interval == 0: - if echo: - rfind_start = len_prompt - else: - rfind_start = 0 - - partially_stopped = False - if stop_str: - if isinstance(stop_str, str): - pos = output.rfind(stop_str, rfind_start) - if pos != -1: - output = output[:pos] - else: - partially_stopped = is_partial_stop(output, stop_str) - elif isinstance(stop_str, Iterable): - for each_stop in stop_str: - pos = output.rfind(each_stop, rfind_start) - if pos != -1: - output = output[:pos] - break - else: - partially_stopped = is_partial_stop(output, each_stop) - if partially_stopped: - break - else: - raise ValueError("Invalid stop field type.") - - # prevent yielding partial stop sequence - if not partially_stopped: - yield { - "text": output, - "usage": { - "prompt_tokens": input_echo_len, - "completion_tokens": i, - "total_tokens": input_echo_len + i, - }, - "finish_reason": None, - } - output = output.strip() - - # finish stream event, which contains finish reason - if i == max_new_tokens - 1: - finish_reason = "length" - elif partially_stopped: - finish_reason = None - else: - finish_reason = "stop" - - yield { - "text": output, - "usage": { - "prompt_tokens": input_echo_len, - "completion_tokens": i, - "total_tokens": input_echo_len + i, - }, - "finish_reason": finish_reason, - } - - # clean - gc.collect() - torch.cuda.empty_cache() diff --git a/llm_judge/model/model_registry.py b/llm_judge/model/model_registry.py deleted file mode 100644 index 178938a..0000000 --- a/llm_judge/model/model_registry.py +++ /dev/null @@ -1,231 +0,0 @@ -"""Additional information of the models.""" -from collections import namedtuple -from typing import List - - -ModelInfo = namedtuple("ModelInfo", ["simple_name", "link", "description"]) - - -model_info = {} - - -def register_model_info( - full_names: List[str], simple_name: str, link: str, description: str -): - info = ModelInfo(simple_name, link, description) - - for full_name in full_names: - model_info[full_name] = info - - -def get_model_info(name: str) -> ModelInfo: - return model_info[name] - - -register_model_info( - ["gpt-4"], "ChatGPT-4", "https://openai.com/research/gpt-4", "ChatGPT-4 by OpenAI" -) -register_model_info( - ["gpt-3.5-turbo"], - "ChatGPT-3.5", - "https://openai.com/blog/chatgpt", - "ChatGPT-3.5 by OpenAI", -) -register_model_info( - ["claude-v1"], - "Claude", - "https://www.anthropic.com/index/introducing-claude", - "Claude by Anthropic", -) -register_model_info( - ["claude-instant-v1"], - "Claude Instant", - "https://www.anthropic.com/index/introducing-claude", - "Claude Instant by Anthropic", -) -register_model_info( - ["palm-2"], - "PaLM 2 Chat", - "https://cloud.google.com/vertex-ai/docs/release-notes#May_10_2023", - "PaLM 2 for Chat (chat-bison@001) by Google", -) -register_model_info( - [ - "vicuna-13b", - "vicuna-13b-v1.3", - "vicuna-7b", - "vicuna-7b-v1.3", - "vicuna-33b", - 
"vicuna-33b-v1.3", - ], - "Vicuna", - "https://lmsys.org/blog/2023-03-30-vicuna/", - "a chat assistant fine-tuned from LLaMA on user-shared conversations by LMSYS", -) -register_model_info( - ["wizardlm-13b"], - "WizardLM", - "https://github.com/nlpxucan/WizardLM", - "an instruction-following LLM using evol-instruct by Microsoft", -) -register_model_info( - ["guanaco-33b", "guanaco-65b"], - "Guanaco", - "https://github.com/artidoro/qlora", - "a model fine-tuned with QLoRA by UW", -) -register_model_info( - ["mpt-7b-chat"], - "MPT-Chat", - "https://www.mosaicml.com/blog/mpt-7b", - "a chatbot fine-tuned from MPT-7B by MosaicML", -) -register_model_info( - ["mpt-30b-chat"], - "MPT-Chat", - "https://www.mosaicml.com/blog/mpt-30b", - "a chatbot fine-tuned from MPT-30B by MosaicML", -) -register_model_info( - ["gpt4all-13b-snoozy"], - "GPT4All-Snoozy", - "https://github.com/nomic-ai/gpt4all", - "A finetuned LLaMA model on assistant style data by Nomic AI", -) -register_model_info( - ["koala-13b"], - "Koala", - "https://bair.berkeley.edu/blog/2023/04/03/koala", - "a dialogue model for academic research by BAIR", -) -register_model_info( - ["RWKV-4-Raven-14B"], - "RWKV-4-Raven", - "https://huggingface.co/BlinkDL/rwkv-4-raven", - "an RNN with transformer-level LLM performance", -) -register_model_info( - ["alpaca-13b"], - "Alpaca", - "https://crfm.stanford.edu/2023/03/13/alpaca.html", - "a model fine-tuned from LLaMA on instruction-following demonstrations by Stanford", -) -register_model_info( - ["chatglm-6b", "chatglm2-6b"], - "ChatGLM", - "https://chatglm.cn/blog", - "an open bilingual dialogue language model by Tsinghua University", -) -register_model_info( - ["oasst-pythia-12b"], - "OpenAssistant (oasst)", - "https://open-assistant.io", - "an Open Assistant for everyone by LAION", -) -register_model_info( - ["oasst-sft-7-llama-30b"], - "OpenAssistant (oasst)", - "https://open-assistant.io", - "an Open Assistant for everyone by LAION", -) -register_model_info( - ["llama-13b"], - "LLaMA", - "https://arxiv.org/abs/2302.13971", - "open and efficient foundation language models by Meta", -) -register_model_info( - ["dolly-v2-12b"], - "Dolly", - "https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm", - "an instruction-tuned open large language model by Databricks", -) -register_model_info( - ["stablelm-tuned-alpha-7b"], - "StableLM", - "https://github.com/stability-AI/stableLM", - "Stability AI language models", -) -register_model_info( - ["codet5p-6b"], - "CodeT5p-6b", - "https://huggingface.co/Salesforce/codet5p-6b", - "Code completion model released by Salesforce", -) -register_model_info( - ["fastchat-t5-3b", "fastchat-t5-3b-v1.0"], - "FastChat-T5", - "https://huggingface.co/lmsys/fastchat-t5-3b-v1.0", - "a chat assistant fine-tuned from FLAN-T5 by LMSYS", -) -register_model_info( - ["phoenix-inst-chat-7b"], - "Phoenix-7B", - "https://huggingface.co/FreedomIntelligence/phoenix-inst-chat-7b", - "a multilingual chat assistant fine-tuned from Bloomz to democratize ChatGPT across languages by CUHK(SZ)", -) -register_model_info( - ["billa-7b-sft"], - "BiLLa-7B-SFT", - "https://huggingface.co/Neutralzz/BiLLa-7B-SFT", - "an instruction-tuned bilingual LLaMA with enhanced reasoning ability by an independent researcher", -) -register_model_info( - ["h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2"], - "h2oGPT-GM-7b", - "https://huggingface.co/h2oai/h2ogpt-gm-oasst1-en-2048-open-llama-7b-preview-300bt-v2", - "an instruction-tuned OpenLLaMA with 
enhanced conversational ability by H2O.ai", -) -register_model_info( - ["baize-v2-7b", "baize-v2-13b"], - "Baize v2", - "https://github.com/project-baize/baize-chatbot#v2", - "A chatbot fine-tuned from LLaMA with ChatGPT self-chat data and Self-Disillation with Feedback (SDF) by UCSD and SYSU.", -) -register_model_info( - [ - "airoboros-7b-gpt4-1.4", - "airoboros-13b-gpt4-1.4", - "airoboros-33b-gpt4-1.4", - "airoboros-65b-gpt4-1.4", - ], - "airoboros", - "https://huggingface.co/jondurbin/airoboros-33b-gpt4-1.4", - "an instruction-tuned LlaMa model tuned with 100% synthetic instruction-response pairs from GPT4", -) -register_model_info( - ["Robin-7b-v2", "Robin-13b-v2", "Robin-33b-v2"], - "Robin-v2", - "https://huggingface.co/OptimalScale/robin-7b-v2-delta", - "A chatbot fine-tuned from LLaMA-7b, achieving competitive performance on chitchat, commonsense reasoning and instruction-following tasks, by OptimalScale, HKUST.", -) -register_model_info( - ["manticore-13b-chat"], - "Manticore 13B Chat", - "https://huggingface.co/openaccess-ai-collective/manticore-13b-chat-pyg", - "A chatbot fine-tuned from LlaMa across several CoT and chat datasets.", -) -register_model_info( - ["redpajama-incite-7b-chat"], - "RedPajama-INCITE-7B-Chat", - "https://huggingface.co/togethercomputer/RedPajama-INCITE-7B-Chat", - "A chatbot fine-tuned from RedPajama-INCITE-7B-Base by Together", -) -register_model_info( - ["falcon-7b", "falcon-7b-instruct", "falcon-40b", "falcon-40b-instruct"], - "Falcon", - "https://huggingface.co/tiiuae/falcon-40b", - "TII's flagship series of large language models", -) -register_model_info( - ["tigerbot-7b-sft"], - "Tigerbot", - "https://huggingface.co/TigerResearch/tigerbot-7b-sft", - "TigerBot is a large-scale language model (LLM) with multiple languages and tasks.", -) -register_model_info( - ["internlm-chat-7b", "internlm-chat-7b-8k"], - "InternLM", - "https://huggingface.co/internlm/internlm-chat-7b", - "InternLM is a multi-language large-scale language model (LLM), developed by SHLAB.", -) diff --git a/llm_judge/model/monkey_patch_non_inplace.py b/llm_judge/model/monkey_patch_non_inplace.py deleted file mode 100644 index 9661d70..0000000 --- a/llm_judge/model/monkey_patch_non_inplace.py +++ /dev/null @@ -1,118 +0,0 @@ -""" -Monkey patch the llama implementation in the huggingface/transformers library. -Avoid bugs in mps backend by not using in-place operations. 
-""" -import math -from typing import List, Optional, Tuple - -import torch -from torch import nn -import transformers - - -def rotate_half(x): - """Rotates half the hidden dims of the input.""" - x1 = x[..., : x.shape[-1] // 2].clone() - x2 = x[..., x.shape[-1] // 2 :].clone() - return torch.cat((-x2, x1), dim=-1) - - -def apply_rotary_pos_emb(q, k, cos, sin, position_ids): - gather_indices = position_ids[:, None, :, None] # [bs, 1, seq_len, 1] - gather_indices = gather_indices.repeat(1, cos.shape[1], 1, cos.shape[3]) - cos = torch.gather(cos.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices) - sin = torch.gather(sin.repeat(gather_indices.shape[0], 1, 1, 1), 2, gather_indices) - q_embed = (q * cos) + (rotate_half(q) * sin) - k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed - - -def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - position_ids: Optional[torch.LongTensor] = None, - past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: bool = False, - use_cache: bool = False, -) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - bsz, q_len, _ = hidden_states.size() - - query_states = ( - self.q_proj(hidden_states) - .view(bsz, q_len, self.num_heads, self.head_dim) - .transpose(1, 2) - ) - key_states = ( - self.k_proj(hidden_states) - .view(bsz, q_len, self.num_heads, self.head_dim) - .transpose(1, 2) - ) - value_states = ( - self.v_proj(hidden_states) - .view(bsz, q_len, self.num_heads, self.head_dim) - .transpose(1, 2) - ) - - kv_seq_len = key_states.shape[-2] - if past_key_value is not None: - kv_seq_len += past_key_value[0].shape[-2] - cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) - query_states, key_states = apply_rotary_pos_emb( - query_states, key_states, cos, sin, position_ids - ) - # [bsz, nh, t, hd] - - if past_key_value is not None: - # reuse k, v, self_attention - key_states = torch.cat([past_key_value[0], key_states], dim=2) - value_states = torch.cat([past_key_value[1], value_states], dim=2) - - past_key_value = (key_states, value_states) if use_cache else None - - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt( - self.head_dim - ) - - if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): - raise ValueError( - f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): - raise ValueError( - f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask - attn_weights = torch.max( - attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min) - ) - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to( - query_states.dtype - ) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2) - attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) - - attn_output = self.o_proj(attn_output) - - if not output_attentions: - attn_weights = None - - return attn_output, attn_weights, past_key_value - - -def 
replace_llama_attn_with_non_inplace_operations(): - """Avoid bugs in mps backend by not using in-place operations.""" - transformers.models.llama.modeling_llama.LlamaAttention.forward = forward diff --git a/llm_judge/model/rwkv_model.py b/llm_judge/model/rwkv_model.py deleted file mode 100644 index bdbc145..0000000 --- a/llm_judge/model/rwkv_model.py +++ /dev/null @@ -1,76 +0,0 @@ -import os -from types import SimpleNamespace -import warnings - -import torch - -os.environ["RWKV_JIT_ON"] = "1" -os.environ["RWKV_CUDA_ON"] = "1" - -from rwkv.model import RWKV -from rwkv.utils import PIPELINE, PIPELINE_ARGS - - -class RwkvModel: - def __init__(self, model_path): - warnings.warn( - "Experimental support. Please use ChatRWKV if you want to chat with RWKV" - ) - self.config = SimpleNamespace(is_encoder_decoder=False) - self.model = RWKV(model=model_path, strategy="cuda fp16") - # two GPUs - # self.model = RWKV(model=model_path, strategy="cuda:0 fp16 *20 -> cuda:1 fp16") - - self.tokenizer = None - self.model_path = model_path - - def to(self, target): - assert target == "cuda" - - def __call__(self, input_ids, use_cache, past_key_values=None): - assert use_cache == True - input_ids = input_ids[0].detach().cpu().numpy() - # print(input_ids) - logits, state = self.model.forward(input_ids, past_key_values) - # print(logits) - logits = logits.unsqueeze(0).unsqueeze(0) - out = SimpleNamespace(logits=logits, past_key_values=state) - return out - - def generate( - self, input_ids, do_sample, temperature, max_new_tokens, repetition_penalty=1.0 - ): - # This function is used by fastchat.llm_judge. - # Because RWKV does not support huggingface generation API, - # we reuse fastchat.serve.inference.generate_stream as a workaround. - from transformers import AutoTokenizer - - from fastchat.serve.inference import generate_stream - from fastchat.conversation import get_conv_template - - if self.tokenizer is None: - self.tokenizer = AutoTokenizer.from_pretrained( - "EleutherAI/pythia-160m", use_fast=True - ) - prompt = self.tokenizer.decode(input_ids[0].tolist()) - conv = get_conv_template("rwkv") - - gen_params = { - "model": self.model_path, - "prompt": prompt, - "temperature": temperature, - "repetition_penalty": repetition_penalty, - "max_new_tokens": max_new_tokens, - "stop": conv.stop_str, - "stop_token_ids": conv.stop_token_ids, - "echo": False, - } - res_iter = generate_stream(self, self.tokenizer, gen_params, "cuda") - - for res in res_iter: - pass - - output = res["text"] - output_ids = self.tokenizer.encode(output) - - return [input_ids[0].tolist() + output_ids] diff --git a/llm_judge/model/upload_hub.py b/llm_judge/model/upload_hub.py deleted file mode 100644 index a3e5ae4..0000000 --- a/llm_judge/model/upload_hub.py +++ /dev/null @@ -1,44 +0,0 @@ -""" -Upload weights to huggingface. 
- -Usage: -python3 -m fastchat.model.upload_hub --model-path ~/model_weights/vicuna-13b --hub-repo-id lmsys/vicuna-13b-v1.3 -""" -import argparse -import tempfile - -import torch -from transformers import AutoTokenizer, AutoModelForCausalLM - - -def upload_hub(model_path, hub_repo_id, component): - if component == "all": - components = ["model", "tokenizer"] - else: - components = [component] - - kwargs = {"push_to_hub": True, "repo_id": hub_repo_id} - - if "model" in components: - model = AutoModelForCausalLM.from_pretrained( - model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True - ) - with tempfile.TemporaryDirectory() as tmp_path: - model.save_pretrained(tmp_path, **kwargs) - - if "tokenizer" in components: - tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False) - with tempfile.TemporaryDirectory() as tmp_path: - tokenizer.save_pretrained(tmp_path, **kwargs) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--model-path", type=str, required=True) - parser.add_argument("--hub-repo-id", type=str, required=True) - parser.add_argument( - "--component", type=str, choices=["all", "model", "tokenizer"], default="all" - ) - args = parser.parse_args() - - upload_hub(args.model_path, args.hub_repo_id, args.component) diff --git a/llm_judge/model_adapter.py b/llm_judge/model_adapter.py new file mode 100644 index 0000000..7faff83 --- /dev/null +++ b/llm_judge/model_adapter.py @@ -0,0 +1,96 @@ +"""Model adapter registration.""" + +import sys +from typing import List + +if sys.version_info >= (3, 9): + from functools import cache +else: + from functools import lru_cache as cache + +from transformers import ( + AutoModelForCausalLM, + AutoTokenizer, +) + +from conversation import Conversation, get_conv_template + + +class BaseModelAdapter: + """The base and the default model adapter.""" + + use_fast_tokenizer = True + + def match(self, model_path: str): + return True + + def load_model(self, model_path: str, from_pretrained_kwargs: dict): + revision = from_pretrained_kwargs.get("revision", "main") + try: + tokenizer = AutoTokenizer.from_pretrained( + model_path, + use_fast=self.use_fast_tokenizer, + revision=revision, + ) + except TypeError: + tokenizer = AutoTokenizer.from_pretrained( + model_path, + use_fast=False, + revision=revision, + ) + + model = AutoModelForCausalLM.from_pretrained( + model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs + ) + return model, tokenizer + + def get_default_conv_template(self, model_path: str) -> Conversation: + return get_conv_template("one_shot") + + +# A global registry for all model adapters +# TODO (lmzheng): make it a priority queue. 
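+# Usage sketch (illustrative only; MyAdapter is a hypothetical name, not part of +# this repository): get_model_adapter() below returns the first registered adapter +# whose match() accepts the model path, so an adapter registered before the +# defaults takes precedence, e.g. +# +#     class MyAdapter(BaseModelAdapter): +#         def match(self, model_path: str): +#             return "my-model" in model_path +# +#     register_model_adapter(MyAdapter)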
+model_adapters: List[BaseModelAdapter] = [] + + +def register_model_adapter(cls): + """Register a model adapter.""" + model_adapters.append(cls()) + + +@cache +def get_model_adapter(model_path: str) -> BaseModelAdapter: + """Get a model adapter for a model_path.""" + for adapter in model_adapters: + if adapter.match(model_path): + return adapter + raise ValueError(f"No valid model adapter for {model_path}") + + +def get_conversation_template(model_path: str) -> Conversation: + """Get the default conversation template.""" + adapter = get_model_adapter(model_path) + return adapter.get_default_conv_template(model_path) + + +class ChatGPTAdapter(BaseModelAdapter): + """The model adapter for ChatGPT""" + + def match(self, model_path: str): + return model_path in ("gpt-3.5-turbo", "gpt-4") + + def load_model(self, model_path: str, from_pretrained_kwargs: dict): + raise NotImplementedError() + + def get_default_conv_template(self, model_path: str) -> Conversation: + return get_conv_template("chatgpt") + + +# Note: the registration order matters. +# The one registered earlier has a higher matching priority. + + +register_model_adapter(ChatGPTAdapter) + +# After all adapters, try the default base adapter. +register_model_adapter(BaseModelAdapter) diff --git a/llm_judge/modules/__init__.py b/llm_judge/modules/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/llm_judge/modules/gptq.py b/llm_judge/modules/gptq.py deleted file mode 100644 index fe0a220..0000000 --- a/llm_judge/modules/gptq.py +++ /dev/null @@ -1,75 +0,0 @@ -from dataclasses import dataclass, field -import os -from os.path import isdir, isfile -from pathlib import Path -import sys - -from transformers import AutoTokenizer - - -@dataclass -class GptqConfig: - ckpt: str = field( - default=None, - metadata={ - "help": "Load quantized model. The path to the local GPTQ checkpoint." - }, - ) - wbits: int = field(default=16, metadata={"help": "#bits to use for quantization"}) - groupsize: int = field( - default=-1, - metadata={"help": "Groupsize to use for quantization; default uses full row."}, - ) - act_order: bool = field( - default=True, - metadata={"help": "Whether to apply the activation order GPTQ heuristic"}, - ) - - -def load_gptq_quantized(model_name, gptq_config: GptqConfig): - print("Loading GPTQ quantized model...") - - try: - script_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__))) - module_path = os.path.join(script_path, "../repositories/GPTQ-for-LLaMa") - - sys.path.insert(0, module_path) - from llama import load_quant - except ImportError as e: - print(f"Error: Failed to load GPTQ-for-LLaMa. 
{e}") - print("See https://github.com/lm-sys/FastChat/blob/main/docs/gptq.md") - sys.exit(-1) - - tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) - # only `fastest-inference-4bit` branch cares about `act_order` - if gptq_config.act_order: - model = load_quant( - model_name, - find_gptq_ckpt(gptq_config), - gptq_config.wbits, - gptq_config.groupsize, - act_order=gptq_config.act_order, - ) - else: - # other branches - model = load_quant( - model_name, - find_gptq_ckpt(gptq_config), - gptq_config.wbits, - gptq_config.groupsize, - ) - - return model, tokenizer - - -def find_gptq_ckpt(gptq_config: GptqConfig): - if Path(gptq_config.ckpt).is_file(): - return gptq_config.ckpt - - for ext in ["*.pt", "*.safetensors"]: - matched_result = sorted(Path(gptq_config.ckpt).glob(ext)) - if len(matched_result) > 0: - return str(matched_result[-1]) - - print("Error: gptq checkpoint not found") - sys.exit(1) diff --git a/pyproject.toml b/pyproject.toml index a109f1e..dc590dc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -16,7 +16,7 @@ dependencies = [ "accelerate", "fastapi", "gradio==3.35.2", "httpx", "markdown2[all]", "nh3", "numpy", "peft==0.5", "prompt_toolkit>=3.0.0", "pydantic<=2.0", "requests", "rich>=10.0.0", "sentencepiece", "shortuuid", "shortuuid", "tiktoken", "tokenizers>=0.12.1", "torch", - "transformers>=4.28.0,<4.29.0", "uvicorn", "wandb", "openai", "anthropic", "ray" + "transformers>=4.28.0,<4.29.0", "uvicorn", "wandb", "openai==0.28.1", "ray", "python-dotenv" ] [project.optional-dependencies] diff --git a/scripts/judge.sh b/scripts/judge.sh index 4d12eff..ccd7063 100644 --- a/scripts/judge.sh +++ b/scripts/judge.sh @@ -1,3 +1,4 @@ + python -B llm_judge/gen_judgment.py \ --bench-name "jp_bench" \ --mode pairwise-baseline \