
Merge pull request #28 from hitoshizuku7/fix/useless_file
Fix/useless file
hkiyomaru authored Nov 21, 2023
2 parents de33589 + c9c244d commit d48478c
Showing 32 changed files with 185 additions and 2,975 deletions.
1 change: 0 additions & 1 deletion .github/PULL_REQUEST_TEMPLATE.md
@@ -9,4 +9,3 @@
## Related issue number (if applicable)

<!-- For example: "Closes #1234" -->

11 changes: 1 addition & 10 deletions .gitignore
@@ -176,20 +176,12 @@ pyrightconfig.json
# End of https://www.toptal.com/developers/gitignore/api/python

# folder
fastchat/llm_judge/data/vicuna_jp
llm_judge/data
data
llm_judge/judge.sh
llm_judge/LLMs
llm_judge/gen_model_answer copy.py
llm_judge/gen_model_answer_ori.py

# files
fastchat/llm_judge/data/judge_prompts_jp.jsonl
fastchat/llm_judge/data/question_Japanese_ver.jsonl
fastchat/llm_judge/data/jp_bench/question_Japanese_ver.jsonl
fastchat/llm_judge/data/jp_bench/question_Japanese_ver2.jsonl
fastchat/llm_judge/run_eval.sh

# Log
*.log
*.log.*
@@ -214,4 +206,3 @@ tests/state_of_the_union.txt

# Build
build

2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -11,4 +11,4 @@ repos:
  hooks:
    - id: ruff
      args: [--fix, --exit-non-zero-on-fix]
-    - id: ruff-format
\ No newline at end of file
+    - id: ruff-format
14 changes: 7 additions & 7 deletions README.md
@@ -1,6 +1,6 @@
# Japanese Vicuna QA Benchmark

-We released Japanese Vicuna QA Benchmark for measuring comprehensive capabilities of Japanese LLMs, which consists of 80 diverse questions in 10 categories (generic, coding, roleplay, writing, etc.) 
+We released Japanese Vicuna QA Benchmark for measuring comprehensive capabilities of Japanese LLMs, which consists of 80 diverse questions in 10 categories (generic, coding, roleplay, writing, etc.)
You can leverage this package to evaluate the answers of your Japanese LLM models in a reference-free manner with LLM-as-a-judge.
To automate the evaluation process, we prompt strong LLMs like GPT-4 to act as judges and assess the quality of the models' responses.
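A minimal sketch of what one such judgment call could look like, assuming the pre-1.0 `openai` SDK used in `llm_judge/common.py` and a hypothetical judge prompt (the repository ships its own prompt templates and conversation handling):

```python
import os

import openai  # pre-1.0 SDK, matching chat_compeletion_openai in llm_judge/common.py

openai.api_key = os.getenv("OPENAI_API_KEY")

question = "植物性タンパク源と動物性タンパク源の違いは何ですか?"
answer = "栄養素の含有量に大きな違いがあります。"

# Hypothetical judge instruction; the real templates live in the repository's data files.
prompt = (
    f"[Question]\n{question}\n\n[Answer]\n{answer}\n\n"
    "Rate the answer from 1 to 10 and briefly justify the score."
)

response = openai.ChatCompletion.create(
    model="gpt-4",
    messages=[{"role": "user", "content": prompt}],
    temperature=0,
    max_tokens=2048,
)
print(response["choices"][0]["message"]["content"])
```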

@@ -138,13 +138,13 @@ python show_result.py \

## Sample Outputs

-#### Question: 
+#### Question:
> 植物性タンパク源と動物性タンパク源の違いは何ですか?
-#### Answer of rinna-3.6b: 
+#### Answer of rinna-3.6b:
> 栄養素の含有量に大きな違いがあります。
-#### Answer of rinna-3.6b-sft-v2: 
+#### Answer of rinna-3.6b-sft-v2:
> 植物性タンパク源は肉や魚など、いくつかの種類の食品を指します。一方、動物性タンパク源には卵、乳製品、大豆、ナッツ、種などが含まれます。
-#### Answer of rinna-3.6b-ppo: 
+#### Answer of rinna-3.6b-ppo:
> 植物性タンパク源とは、主に大豆や豆腐などの大豆製品を生産する植物によって作られるタンパク質で、卵、牛乳、肉などの動物性タンパク源よりも栄養価が高く、低カロリーです。一方、動物性タンパク源には、牛肉、豚肉、鶏肉などが含まれます。
#### Answer of japanese-alpaca-lora-7b:
> 動物タンポク源(ATP)は、動物によって生産されるエネルギー源である。ATPは、細胞におけるATP認識システムの活性化により生成されています。動物のATP源は、遺伝的に組み込まれたATP生成機構であり、これは、ATPを生成するために使用され、経験的にATPの量を増加させることができる。
@@ -160,12 +160,12 @@ python show_result.py \
| rinna-3.6b-sft-v2 | 70 | 35 | 58 | 0.429448 | 0.214724 | 0.607362 |
| japanese-alpaca-lora-7b | 15 | 112 | 38 | 0.090909 | 0.678788 | 0.206061 |
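The column headers fall outside this hunk, but the rates are consistent with win rate, loss rate, and an adjusted win rate that counts each tie as half a win; a quick sanity check for the rinna-3.6b-sft-v2 row:

```python
wins, losses, ties = 70, 35, 58
total = wins + losses + ties       # 163 pairwise games

print(wins / total)                # 0.429448  (win rate)
print(losses / total)              # 0.214724  (loss rate)
print((wins + ties / 2) / total)   # 0.607362  (adjusted win rate; a tie counts as half a win)
```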

-The GPT4 judgments is placed in `data/jp_bench/model_judgment/gpt-4_pair.jsonl`. 
+The GPT4 judgments is placed in `data/jp_bench/model_judgment/gpt-4_pair.jsonl`.

To be noticed, `pairwise-all` might become very inefficient when evaluating more LLMs, as it evaluates combinations of each two of them. In such cases, we recommend using the `pairwise-baseline` mode, allowing all models to be compared against a fixed baseline such as ChatGPT.
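For a sense of the cost difference, with a hypothetical pool of ten models:

```python
from math import comb

n_models = 10  # hypothetical pool size

# pairwise-all judges every unordered pair of models.
print(comb(n_models, 2))  # 45 head-to-head matchups per question

# pairwise-baseline compares each model against one fixed baseline only.
print(n_models - 1)       # 9 matchups per question
```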

## Supported baseline Models
-To make it more convenient for users to utilize pairwise comparisons with existing Japanese LLMs, we offer the prediction of the following four baselines in `fastchat/llm_judge/data/jp_bench/model_answer`. 
+To make it more convenient for users to utilize pairwise comparisons with existing Japanese LLMs, we offer the prediction of the following four baselines in `fastchat/llm_judge/data/jp_bench/model_answer`.

- [Rinna-3.6B](https://huggingface.co/rinna/japanese-gpt-neox-3.6b)
- [Rinna-3.6B-sft-v2](https://huggingface.co/rinna/japanese-gpt-neox-3.6b-instruction-sft-v2)
93 changes: 0 additions & 93 deletions llm_judge/clean_judgment.py

This file was deleted.

69 changes: 9 additions & 60 deletions llm_judge/common.py
@@ -11,9 +11,13 @@
import time
from typing import Optional
import openai
-import anthropic
+from dotenv import load_dotenv

-from model.model_adapter import get_conversation_template
+from model_adapter import get_conversation_template
+
+load_dotenv()  # Load environment variables from .env file
+
+openai.api_key = os.getenv("OPENAI_API_KEY")

# API setting constants
API_MAX_RETRY = 16
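For reference, the `load_dotenv` pattern added above reads key-value pairs from a local `.env` file into the process environment; a minimal sketch with a hypothetical placeholder key:

```python
# .env (kept out of version control):
#   OPENAI_API_KEY=<your-key-here>

import os

from dotenv import load_dotenv

load_dotenv()  # looks for a .env file in the working directory and loads it
assert os.getenv("OPENAI_API_KEY"), "OPENAI_API_KEY not set"
```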
@@ -174,10 +178,6 @@ def run_judge_single(question, answer, judge, ref_answer, multi_turn=False):

    if model in ["gpt-3.5-turbo", "gpt-4"]:
        judgment = chat_compeletion_openai(model, conv, temperature=0, max_tokens=2048)
-    elif model in ["claude-v1", "claude-instant-v1"]:
-        judgment = chat_compeletion_anthropic(
-            model, conv, temperature=0, max_tokens=1024
-        )
    else:
        raise ValueError(f"Invalid judge model name: {model}")

@@ -282,13 +282,8 @@ def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=F
    if model in ["gpt-3.5-turbo", "gpt-4"]:
        conv.system = system_prompt
        judgment = chat_compeletion_openai(model, conv, temperature=0, max_tokens=2048)
-    elif model in ["claude-v1", "claude-instant-v1"]:
-        if system_prompt != "You are a helpful assistant.":
-            user_prompt = "[Instruction]\n" + system_prompt + "\n\n" + user_prompt
-            conv.messages[0][1] = user_prompt
-        judgment = chat_compeletion_anthropic(
-            model, conv, temperature=0, max_tokens=1024
-        )
-        print(judgment)
-        assert False
    else:
        raise ValueError(f"Invalid judge model name: {model}")

@@ -422,6 +417,7 @@ def chat_compeletion_openai(model, conv, temperature, max_tokens):
    for _ in range(API_MAX_RETRY):
        try:
            messages = conv.to_openai_api_messages()
+            print(messages)
            response = openai.ChatCompletion.create(
                model=model,
                messages=messages,
@@ -438,53 +434,6 @@ def chat_compeletion_openai(model, conv, temperature, max_tokens):
    return output
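The surviving OpenAI path wraps the request in a bounded retry loop. A self-contained sketch of that pattern under the pre-1.0 `openai` SDK; the `API_RETRY_SLEEP` and `API_ERROR_OUTPUT` values are assumptions, as their definitions fall outside this hunk:

```python
import time

import openai

API_MAX_RETRY = 16
API_RETRY_SLEEP = 10          # assumed; defined earlier in common.py
API_ERROR_OUTPUT = "$ERROR$"  # assumed; defined earlier in common.py


def chat_completion_with_retry(model, messages, temperature=0.0, max_tokens=2048):
    """Call the chat API, retrying transient errors a bounded number of times."""
    output = API_ERROR_OUTPUT
    for _ in range(API_MAX_RETRY):
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
            )
            output = response["choices"][0]["message"]["content"]
            break  # success; stop retrying
        except openai.error.OpenAIError as e:
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)
    return output
```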


-def chat_compeletion_anthropic(model, conv, temperature, max_tokens):
-    output = API_ERROR_OUTPUT
-    for _ in range(API_MAX_RETRY):
-        try:
-            c = anthropic.Client(os.environ["ANTHROPIC_API_KEY"])
-            prompt = conv.get_prompt()
-            response = c.completion(
-                model=model,
-                prompt=prompt,
-                stop_sequences=[anthropic.HUMAN_PROMPT],
-                max_tokens_to_sample=max_tokens,
-                temperature=temperature,
-            )
-            output = response["completion"]
-            break
-        except anthropic.ApiException as e:
-            print(type(e), e)
-            time.sleep(API_RETRY_SLEEP)
-    return output.strip()
-
-
-def chat_compeletion_palm(chat_state, model, conv, temperature, max_tokens):
-    from fastchat.serve.api_provider import init_palm_chat
-
-    assert model == "palm-2-chat-bison-001"
-
-    if chat_state is None:
-        chat_state = init_palm_chat("chat-bison@001")
-
-    parameters = {
-        "temperature": temperature,
-        "top_p": 0.8,
-        "top_k": 40,
-        "max_output_tokens": max_tokens,
-    }
-    output = API_ERROR_OUTPUT
-    for _ in range(API_MAX_RETRY):
-        try:
-            response = chat_state.send_message(conv.messages[-2][1], **parameters)
-            output = response.text
-            break
-        except Exception as e:
-            print(type(e), e)
-            time.sleep(API_RETRY_SLEEP)
-    return chat_state, output


def normalize_game_key_single(gamekey, result):
    """Make the model names sorted in a game key."""
    qid, model_1, model_2 = gamekey
