diff --git a/llm_judge/gen_gpt3.5_answer.py b/llm_judge/gen_gpt3.5_answer.py
index b2e3def..38d6e58 100644
--- a/llm_judge/gen_gpt3.5_answer.py
+++ b/llm_judge/gen_gpt3.5_answer.py
@@ -7,6 +7,13 @@ from typing import List
 
+from dotenv import load_dotenv
+
+load_dotenv()
+
+openai.api_key = os.getenv("OPENAI_API_KEY")
+openai.organization = os.getenv("OPENAI_ORGANIZATION")
+
 
 class GPT3_Demo(object):
     def __init__(
@@ -49,122 +56,39 @@ def run_gpt3(prompt_list):
     demo = GPT3_Demo(
         engine="text-davinci-003",  # text-davinci-003: best, text-ada-001: lowest price
         temperature=0,  # control randomness: lowering results in less random completion (0 ~ 1.0)
-        max_tokens=8,  # max number of tokens to generate (1 ~ 4,000)
+        max_tokens=2048,  # max number of tokens to generate (1 ~ 4,000)
         top_p=1,  # control diversity (0 ~ 1.0)
         frequency_penalty=0,  # how to penalize new tokens based on their existing frequency (0 ~ 2.0)
         presence_penalty=0,  # penalizes tokens that have already appeared; the docs say raising this increases the likelihood of discussing new topics (0 ~ 2.0)
-        best_of=3,  # how many completions to generate and pick the best from; if this is 10, it generates 10 and picks the best, but that is more expensive (1 ~ 20)
+        best_of=1,  # how many completions to generate and pick the best from; if this is 10, it generates 10 and picks the best, but that is more expensive (1 ~ 20)
         logprobs=1,
     )
     results = demo.get_multiple_sample(prompt_list)
     return results
 
 
-# use the following code to activate Azure:
-
-
-# use the OpenAI API with group PRISM
-
-
-class Chat_Demo(object):
-    def __init__(
-        self,
-        model,
-        user_system,
-        temperature,
-        max_tokens,
-        top_p,
-        frequency_penalty,
-        presence_penalty,
-        n,
-        stream,
-        stop,
-        logit_bias,
-    ):
-        self.model = model
-        self.temperature = temperature
-        self.max_tokens = max_tokens
-        self.top_p = top_p
-        self.frequency_penalty = frequency_penalty
-        self.presence_penalty = presence_penalty
-        self.n = n
-        self.stream = stream
-        self.stop = stop
-        self.logit_bias = logit_bias
-        self.chat_list = []
-        self.chat_list = [{"role": "system", "content": user_system}]
-
-    def get_chat_output(self, user_prompt):
-        self.chat_list.append({"role": "user", "content": user_prompt})
-        response = openai.ChatCompletion.create(
-            model=self.model,
-            messages=self.chat_list,
-            temperature=self.temperature,
-            max_tokens=self.max_tokens,
-            top_p=self.top_p,
-            frequency_penalty=self.frequency_penalty,
-            presence_penalty=self.presence_penalty,
-            n=self.n,
-            stream=self.stream,
-            stop=self.stop,
-        )
-        reply = response["choices"][0]["message"]["content"]
-        self.chat_list.append(response["choices"][0]["message"])
-
-        return reply
-
-    def delete_last_chat(self):
-        self.chat_list.pop()
-        self.chat_list.pop()
-
-
-# usage template for ChatGPT
-def run_chatgpt(user_prompt_list):
-    demo = Chat_Demo(
-        model="gpt-3.5-turbo-16k-0613",  # gpt-3.5-turbo: ChatGPT with the lowest price, gpt-4: latest version, higher price
-        # model="gpt-4",  # gpt-3.5-turbo: ChatGPT with the lowest price, gpt-4: latest version, higher price
-        user_system="You are a helpful assistant",  # add more description after this to fit your task, e.g., "you are a helpful assistant that translates English to Chinese." will be a good system for MT.
-        temperature=1.0,  # default 1.0, control randomness: lowering results in less random completion (0 ~ 2.0)
-        max_tokens=256,  # max number of tokens to generate (1 ~ 4,000)
-        top_p=1,  # default 1, control diversity (0 ~ 1.0), OpenAI suggests not altering this together with temperature
-        frequency_penalty=0,  # default 0, how to penalize new tokens based on their existing frequency (-2.0 ~ 2.0)
-        presence_penalty=0,  # default 0, penalizes tokens that have already appeared; the docs say raising this increases the likelihood of discussing new topics (-2.0 ~ 2.0)
-        stop="",  # manually control where to stop
-        stream=False,  # temporarily keep false
-        n=1,  # n choices of reply for each input
-        logit_bias=None,  # use the bias to control which tokens you want to appear or disappear; this function is NOT implemented in Chat_Demo yet
-    )
-    reply = demo.get_chat_output(user_prompt_list)
-    return reply
-
-
 if __name__ == "__main__":
-    question = []
-    data_file = "./data/question_Japanese_ver.jsonl"
-    with open(data_file, "r") as f:
-        instruction_list = []
-        for line in tqdm(f.read().splitlines()):
-            tmp_dict = json.loads(line)
-            question.append(tmp_dict)
-            instruction_list.append(tmp_dict["text"][0:])
-    # examples = [l.strip() for l in instruction_list]
-    examples = instruction_list
+    data_file = "./data/jp_bench/question.jsonl"
+    with open(data_file) as f:
+        questions = [json.loads(line) for line in tqdm(f)]
+
     results = []
-    for index, example in tqdm(enumerate(examples)):
-        response = run_gpt3(example)
+    for question in tqdm(questions):
+        instruction = question["turns"][0]
+        response = run_gpt3(instruction)
         results.append(
             {
-                "question_id": question[index]["question_id"],
+                "question_id": question["question_id"],
                 "answer_id": shortuuid.uuid(),
                 "model_id": "gpt-3.5-davinci",
-                "choices": [{"index": 0, "turns": [response]}],
+                "choices": [{"index": 0, "turns": response}],
                 "tstamp": time.time(),
             }
         )
-    predictions_file = "./data/jp_bench/model_answer/gpt-3.5-davinci.jsonl"
+
+    predictions_file = "./data/jp_bench/model_answer/openai--gpt-3.5-davinci.json"
     dirname = os.path.dirname(predictions_file)
     os.makedirs(dirname, exist_ok=True)
-    with open(predictions_file, "w") as f:
+    with open(predictions_file, "w", encoding="utf-8") as f:
        for result in results:
-            json_line = json.dumps(result, ensure_ascii=False)
-            f.write(json_line + "\n")
+            f.write(json.dumps(result, ensure_ascii=False) + "\n")
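Note on the new configuration block: load_dotenv() from python-dotenv picks up a .env file found relative to the working directory, so the script now reads credentials from there instead of having them set in code. A minimal sketch of such a file (the variable names come from the diff; the values are placeholders, not real credentials):

    # .env (keep this file out of version control)
    OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxx
    OPENAI_ORGANIZATION=org-xxxxxxxxxxxx

If OPENAI_ORGANIZATION is unset, os.getenv returns None and the OpenAI client falls back to the key's default organization, so only the API key is strictly required.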
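For reference, running the script directly (python llm_judge/gen_gpt3.5_answer.py) now writes one JSON record per line to ./data/jp_bench/model_answer/openai--gpt-3.5-davinci.json (JSON Lines content despite the .json extension). A hypothetical record, assuming run_gpt3 returns a list of completion strings, which is what the change from [response] to response in the "turns" field suggests:

    {"question_id": 1, "answer_id": "UJxoJ6A3kTceuW9oY8KLPg", "model_id": "gpt-3.5-davinci", "choices": [{"index": 0, "turns": ["<completion text>"]}], "tstamp": 1690000000.0}

All values above are illustrative only; answer_id is whatever shortuuid.uuid() generates and tstamp is the time.time() float at write time.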