fix the format

ku-nlp · Nov 21, 2023 · 04b8336 · 04b8336
1 parent 8b7bf51
commit 04b8336
Show file tree

Hide file tree

Showing 9 changed files with 39 additions and 47 deletions.
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
@@ -9,4 +9,3 @@
 ## Related issue number (if applicable)
 
 <!-- For example: "Closes #1234" -->
-
diff --git a/.gitignore b/.gitignore
@@ -210,4 +210,3 @@ tests/state_of_the_union.txt
 
 # Build
 build
-
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -11,4 +11,4 @@ repos:
     hooks:
       - id: ruff
         args: [--fix, --exit-non-zero-on-fix]
-      - id: ruff-format
+      - id: ruff-format
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # Japanese Vicuna QA Benchmark
 
- We released Japanese Vicuna QA Benchmark for measuring comprehensive capabilities of Japanese LLMs, which consists of 80 diverse questions in 10 categories (generic, coding, roleplay, writing, etc.) 
+ We released Japanese Vicuna QA Benchmark for measuring comprehensive capabilities of Japanese LLMs, which consists of 80 diverse questions in 10 categories (generic, coding, roleplay, writing, etc.)
 You can leverage this package to evaluate the answers of your Japanese LLM models in a reference-free manner with LLM-as-a-judge.
 To automate the evaluation process, we prompt strong LLMs like GPT-4 to act as judges and assess the quality of the models' responses.
 
@@ -138,13 +138,13 @@ python show_result.py \
 
 ## Sample Outputs
 
-#### Question: 
+#### Question:
 > 植物性タンパク源と動物性タンパク源の違いは何ですか？
-#### Answer of rinna-3.6b: 
+#### Answer of rinna-3.6b:
 > 栄養素の含有量に大きな違いがあります。
-#### Answer of rinna-3.6b-sft-v2: 
+#### Answer of rinna-3.6b-sft-v2:
 > 植物性タンパク源は肉や魚など、いくつかの種類の食品を指します。一方、動物性タンパク源には卵、乳製品、大豆、ナッツ、種などが含まれます。
-#### Answer of rinna-3.6b-ppo: 
+#### Answer of rinna-3.6b-ppo:
 > 植物性タンパク源とは、主に大豆や豆腐などの大豆製品を生産する植物によって作られるタンパク質で、卵、牛乳、肉などの動物性タンパク源よりも栄養価が高く、低カロリーです。一方、動物性タンパク源には、牛肉、豚肉、鶏肉などが含まれます。
 #### Answer of japanese-alpaca-lora-7b:
 > 動物タンポク源（ATP）は、動物によって生産されるエネルギー源である。ATPは、細胞におけるATP認識システムの活性化により生成されています。動物のATP源は、遺伝的に組み込まれたATP生成機構であり、これは、ATPを生成するために使用され、経験的にATPの量を増加させることができる。
@@ -160,12 +160,12 @@ python show_result.py \
 | rinna-3.6b-sft-v2       |  70 |   35 |  58 |  0.429448 |  0.214724 |          0.607362 |
 | japanese-alpaca-lora-7b |  15 |  112 |  38 |  0.090909 |  0.678788 |          0.206061 |
 
-The GPT-4 judgments are placed in `data/jp_bench/model_judgment/gpt-4_pair.jsonl`. 
+The GPT-4 judgments are placed in `data/jp_bench/model_judgment/gpt-4_pair.jsonl`.
 
 Note that `pairwise-all` can become very inefficient as more LLMs are evaluated, because it compares every possible pair of models. In such cases, we recommend using the `pairwise-baseline` mode, which compares all models against a fixed baseline such as ChatGPT.
 
 ## Supported baseline Models
-To make it more convenient for users to utilize pairwise comparisons with existing Japanese LLMs, we offer the prediction of the following four baselines in `fastchat/llm_judge/data/jp_bench/model_answer`. 
+To make it more convenient for users to utilize pairwise comparisons with existing Japanese LLMs, we offer the prediction of the following four baselines in `fastchat/llm_judge/data/jp_bench/model_answer`.
 
 - [Rinna-3.6B](https://huggingface.co/rinna/japanese-gpt-neox-3.6b)
 - [Rinna-3.6B-sft-v2](https://huggingface.co/rinna/japanese-gpt-neox-3.6b-instruction-sft-v2)

diff --git a/llm_judge/common.py b/llm_judge/common.py
@@ -13,12 +13,12 @@
 import openai
 from dotenv import load_dotenv
 
+from model_adapter import get_conversation_template
+
 load_dotenv()  # Load environment variables from .env file
 
 openai.api_key = os.getenv("OPENAI_API_KEY")
 
-from model_adapter import get_conversation_template
-
 # API setting constants
 API_MAX_RETRY = 16
 API_RETRY_SLEEP = 10
@@ -86,7 +86,6 @@ class MatchPair:
     multi_turn: bool = False
 
 
-
 def load_questions(question_file: str, begin: Optional[int], end: Optional[int]):
     """Load questions from a file."""
     questions = []
@@ -280,7 +279,6 @@ def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=F
     conv.append_message(conv.roles[0], user_prompt)
     conv.append_message(conv.roles[1], None)
 
-
     if model in ["gpt-3.5-turbo", "gpt-4"]:
         conv.system = system_prompt
         judgment = chat_compeletion_openai(model, conv, temperature=0, max_tokens=2048)
@@ -436,7 +434,6 @@ def chat_compeletion_openai(model, conv, temperature, max_tokens):
     return output
 
 
-
 def normalize_game_key_single(gamekey, result):
     """Make the model names sorted in a game key."""
     qid, model_1, model_2 = gamekey

diff --git a/llm_judge/gen_gpt3.5_answer.py b/llm_judge/gen_gpt3.5_answer.py
@@ -5,9 +5,21 @@
 import time
 import shortuuid
 
+from typing import List
+
 
 class GPT3_Demo(object):
-    def __init__(self, engine, temperature, max_tokens, top_p, frequency_penalty, presence_penalty, best_of, logprobs):
+    def __init__(
+        self,
+        engine,
+        temperature,
+        max_tokens,
+        top_p,
+        frequency_penalty,
+        presence_penalty,
+        best_of,
+        logprobs,
+    ):
         self.engine = engine
         self.temperature = temperature
         self.max_tokens = max_tokens
@@ -27,11 +39,12 @@ def get_multiple_sample(self, prompt_list: List[str]):
             frequency_penalty=self.frequency_penalty,
             presence_penalty=self.presence_penalty,
             best_of=self.best_of,
-            logprobs=self.logprobs
+            logprobs=self.logprobs,
         )
         results = [choice.text for choice in response.choices]
         return results
 
+
 def run_gpt3(prompt_list):
     demo = GPT3_Demo(
         engine="text-davinci-003",  # text-davinci-003: best, text-ada-001: lowest price
@@ -41,7 +54,7 @@ def run_gpt3(prompt_list):
         frequency_penalty=0,  # how to penalize new tokens based on their existing frequency (0 ~ 2.0)
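         presence_penalty=0,  # penalty based on whether a token has already appeared; per the docs, raising this value increases the likelihood of talking about new topics (0 ~ 2.0)
         best_of=3,  # how many candidates to generate and pick the best from; e.g. with 10, ten completions are generated and the best is chosen, but this costs more (1 ~ 20)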
-        logprobs=1
+        logprobs=1,
     )
     results = demo.get_multiple_sample(prompt_list)
     return results
@@ -52,6 +65,7 @@ def run_gpt3(prompt_list):
 
 # use openai api with group PRISM
 
+
 class Chat_Demo(object):
     def __init__(
         self,
@@ -137,8 +151,16 @@ def run_chatgpt(user_prompt_list):
         examples = instruction_list
     results = []
     for index, example in tqdm(enumerate(examples)):
-        response=run_gpt3(example)
-        results.append({"question_id":question[index]["question_id"],"answer_id":shortuuid.uuid(),"model_id":"gpt-3.5-davinci" ,"choices":[{"index": 0, "turns": [response]}],"tstamp": time.time(),})    
+        response = run_gpt3(example)
+        results.append(
+            {
+                "question_id": question[index]["question_id"],
+                "answer_id": shortuuid.uuid(),
+                "model_id": "gpt-3.5-davinci",
+                "choices": [{"index": 0, "turns": [response]}],
+                "tstamp": time.time(),
+            }
+        )
     predictions_file = "./data/jp_bench/model_answer/gpt-3.5-davinci.jsonl"
     dirname = os.path.dirname(predictions_file)
     os.makedirs(dirname, exist_ok=True)

diff --git a/llm_judge/gen_judgment.py b/llm_judge/gen_judgment.py
@@ -5,12 +5,7 @@
 import argparse
 from concurrent.futures import ThreadPoolExecutor
 import json
-<<<<<<< HEAD
-import sys
 import os
-
-=======
->>>>>>> origin/dev
 import numpy as np
 from tqdm import tqdm
 from dotenv import load_dotenv

diff --git a/llm_judge/gen_model_answer.py b/llm_judge/gen_model_answer.py
@@ -4,7 +4,6 @@
 import shortuuid
 import time
 from tqdm import tqdm
-import sys
 
 import torch
 from transformers import (

diff --git a/llm_judge/model_adapter.py b/llm_judge/model_adapter.py
@@ -1,31 +1,21 @@
 """Model adapter registration."""
 
-import math
 import sys
-from typing import List, Optional
-import warnings
+from typing import List
 
 if sys.version_info >= (3, 9):
     from functools import cache
 else:
     from functools import lru_cache as cache
 
-import accelerate
-import psutil
-import torch
 from transformers import (
-    AutoConfig,
-    AutoModel,
     AutoModelForCausalLM,
-    AutoModelForSeq2SeqLM,
     AutoTokenizer,
-    LlamaTokenizer,
-    LlamaForCausalLM,
-    T5Tokenizer,
 )
 
 from conversation import Conversation, get_conv_template
 
+
 class BaseModelAdapter:
     """The base and the default model adapter."""
 
@@ -54,7 +44,6 @@ def load_model(self, model_path: str, from_pretrained_kwargs: dict):
         )
         return model, tokenizer
 
-
     def get_default_conv_template(self, model_path: str) -> Conversation:
         return get_conv_template("one_shot")
 
@@ -78,17 +67,12 @@ def get_model_adapter(model_path: str) -> BaseModelAdapter:
     raise ValueError(f"No valid model adapter for {model_path}")
 
 
-
 def get_conversation_template(model_path: str) -> Conversation:
     """Get the default conversation template."""
     adapter = get_model_adapter(model_path)
     return adapter.get_default_conv_template(model_path)
 
 
-
-
-
-
 class ChatGPTAdapter(BaseModelAdapter):
     """The model adapter for ChatGPT"""
 
@@ -102,9 +86,6 @@ def get_default_conv_template(self, model_path: str) -> Conversation:
         return get_conv_template("chatgpt")
 
 
-
-
-
 # Note: the registration order matters.
 # The one registered earlier has a higher matching priority.
Original file line number	Diff line number	Diff line change
Expand Up		@@ -9,4 +9,3 @@
		## Related issue number (if applicable)

		<!-- For example: "Closes #1234" -->
Original file line number	Diff line number	Diff line change
Expand Up		@@ -210,4 +210,3 @@ tests/state_of_the_union.txt

		# Build
		build