
Merge pull request #28 from hitoshizuku7/fix/useless_file
Fix/useless file
hkiyomaru authored Nov 21, 2023
2 parents de33589 + c9c244d commit d48478c
Showing 32 changed files with 185 additions and 2,975 deletions.
1 change: 0 additions & 1 deletion .github/PULL_REQUEST_TEMPLATE.md
@@ -9,4 +9,3 @@
## Related issue number (if applicable)

<!-- For example: "Closes #1234" -->

11 changes: 1 addition & 10 deletions .gitignore
@@ -176,20 +176,12 @@ pyrightconfig.json
# End of https://www.toptal.com/developers/gitignore/api/python

# folder
fastchat/llm_judge/data/vicuna_jp
llm_judge/data
data
llm_judge/judge.sh
llm_judge/LLMs
llm_judge/gen_model_answer copy.py
llm_judge/gen_model_answer_ori.py

# files
fastchat/llm_judge/data/judge_prompts_jp.jsonl
fastchat/llm_judge/data/question_Japanese_ver.jsonl
fastchat/llm_judge/data/jp_bench/question_Japanese_ver.jsonl
fastchat/llm_judge/data/jp_bench/question_Japanese_ver2.jsonl
fastchat/llm_judge/run_eval.sh

# Log
*.log
*.log.*
@@ -214,4 +206,3 @@ tests/state_of_the_union.txt

# Build
build

2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -11,4 +11,4 @@ repos:
  hooks:
    - id: ruff
      args: [--fix, --exit-non-zero-on-fix]
-    - id: ruff-format
\ No newline at end of file
+    - id: ruff-format
14 changes: 7 additions & 7 deletions README.md
@@ -1,6 +1,6 @@
# Japanese Vicuna QA Benchmark

-We released Japanese Vicuna QA Benchmark for measuring comprehensive capabilities of Japanese LLMs, which consists of 80 diverse questions in 10 categories (generic, coding, roleplay, writing, etc.) 
+We released Japanese Vicuna QA Benchmark for measuring comprehensive capabilities of Japanese LLMs, which consists of 80 diverse questions in 10 categories (generic, coding, roleplay, writing, etc.)
You can leverage this package to evaluate the answers of your Japanese LLM models in a reference-free manner with LLM-as-a-judge.
To automate the evaluation process, we prompt strong LLMs like GPT-4 to act as judges and assess the quality of the models' responses.
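A minimal sketch of what one such judgment call could look like, assuming the pre-1.0 `openai` SDK used in `llm_judge/common.py` and a hypothetical judge prompt (the repository ships its own prompt templates and conversation handling):

```python
import os

import openai  # pre-1.0 SDK, matching chat_compeletion_openai in llm_judge/common.py

openai.api_key = os.getenv("OPENAI_API_KEY")

question = "植物性タンパク源と動物性タンパク源の違いは何ですか?"
answer = "栄養素の含有量に大きな違いがあります。"

# Hypothetical judge instruction; the real templates live in the repository's data files.
prompt = (
    f"[Question]\n{question}\n\n[Answer]\n{answer}\n\n"
    "Rate the answer from 1 to 10 and briefly justify the score."
)

response = openai.ChatCompletion.create(
    model="gpt-4",
    messages=[{"role": "user", "content": prompt}],
    temperature=0,
    max_tokens=2048,
)
print(response["choices"][0]["message"]["content"])
```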

@@ -138,13 +138,13 @@ python show_result.py \

## Sample Outputs

-#### Question: 
+#### Question:
> 植物性タンパク源と動物性タンパク源の違いは何ですか?
-#### Answer of rinna-3.6b: 
+#### Answer of rinna-3.6b:
> 栄養素の含有量に大きな違いがあります。
-#### Answer of rinna-3.6b-sft-v2: 
+#### Answer of rinna-3.6b-sft-v2:
> 植物性タンパク源は肉や魚など、いくつかの種類の食品を指します。一方、動物性タンパク源には卵、乳製品、大豆、ナッツ、種などが含まれます。
-#### Answer of rinna-3.6b-ppo: 
+#### Answer of rinna-3.6b-ppo:
> 植物性タンパク源とは、主に大豆や豆腐などの大豆製品を生産する植物によって作られるタンパク質で、卵、牛乳、肉などの動物性タンパク源よりも栄養価が高く、低カロリーです。一方、動物性タンパク源には、牛肉、豚肉、鶏肉などが含まれます。
#### Answer of japanese-alpaca-lora-7b:
> 動物タンポク源(ATP)は、動物によって生産されるエネルギー源である。ATPは、細胞におけるATP認識システムの活性化により生成されています。動物のATP源は、遺伝的に組み込まれたATP生成機構であり、これは、ATPを生成するために使用され、経験的にATPの量を増加させることができる。
@@ -160,12 +160,12 @@ python show_result.py \
| rinna-3.6b-sft-v2 | 70 | 35 | 58 | 0.429448 | 0.214724 | 0.607362 |
| japanese-alpaca-lora-7b | 15 | 112 | 38 | 0.090909 | 0.678788 | 0.206061 |
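The column headers fall outside this hunk, but the rates are consistent with win rate, loss rate, and an adjusted win rate that counts each tie as half a win; a quick sanity check for the rinna-3.6b-sft-v2 row:

```python
wins, losses, ties = 70, 35, 58
total = wins + losses + ties       # 163 pairwise games

print(wins / total)                # 0.429448  (win rate)
print(losses / total)              # 0.214724  (loss rate)
print((wins + ties / 2) / total)   # 0.607362  (adjusted win rate; a tie counts as half a win)
```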

-The GPT4 judgments is placed in `data/jp_bench/model_judgment/gpt-4_pair.jsonl`. 
+The GPT4 judgments is placed in `data/jp_bench/model_judgment/gpt-4_pair.jsonl`.

To be noticed, `pairwise-all` might become very inefficient when evaluating more LLMs, as it evaluates combinations of each two of them. In such cases, we recommend using the `pairwise-baseline` mode, allowing all models to be compared against a fixed baseline such as ChatGPT.
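For a sense of the cost difference, with a hypothetical pool of ten models:

```python
from math import comb

n_models = 10  # hypothetical pool size

# pairwise-all judges every unordered pair of models.
print(comb(n_models, 2))  # 45 head-to-head matchups per question

# pairwise-baseline compares each model against one fixed baseline only.
print(n_models - 1)       # 9 matchups per question
```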

## Supported baseline Models
-To make it more convenient for users to utilize pairwise comparisons with existing Japanese LLMs, we offer the prediction of the following four baselines in `fastchat/llm_judge/data/jp_bench/model_answer`. 
+To make it more convenient for users to utilize pairwise comparisons with existing Japanese LLMs, we offer the prediction of the following four baselines in `fastchat/llm_judge/data/jp_bench/model_answer`.

- [Rinna-3.6B](https://huggingface.co/rinna/japanese-gpt-neox-3.6b)
- [Rinna-3.6B-sft-v2](https://huggingface.co/rinna/japanese-gpt-neox-3.6b-instruction-sft-v2)
93 changes: 0 additions & 93 deletions llm_judge/clean_judgment.py

This file was deleted.

69 changes: 9 additions & 60 deletions llm_judge/common.py
@@ -11,9 +11,13 @@
import time
from typing import Optional
import openai
-import anthropic
+from dotenv import load_dotenv

-from model.model_adapter import get_conversation_template
+from model_adapter import get_conversation_template
+
+load_dotenv()  # Load environment variables from .env file
+
+openai.api_key = os.getenv("OPENAI_API_KEY")

# API setting constants
API_MAX_RETRY = 16
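For reference, the `load_dotenv` pattern added above reads key-value pairs from a local `.env` file into the process environment; a minimal sketch with a hypothetical placeholder key:

```python
# .env (kept out of version control):
#   OPENAI_API_KEY=<your-key-here>

import os

from dotenv import load_dotenv

load_dotenv()  # looks for a .env file in the working directory and loads it
assert os.getenv("OPENAI_API_KEY"), "OPENAI_API_KEY not set"
```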
@@ -174,10 +178,6 @@ def run_judge_single(question, answer, judge, ref_answer, multi_turn=False):

    if model in ["gpt-3.5-turbo", "gpt-4"]:
        judgment = chat_compeletion_openai(model, conv, temperature=0, max_tokens=2048)
-    elif model in ["claude-v1", "claude-instant-v1"]:
-        judgment = chat_compeletion_anthropic(
-            model, conv, temperature=0, max_tokens=1024
-        )
    else:
        raise ValueError(f"Invalid judge model name: {model}")

@@ -282,13 +282,8 @@ def run_judge_pair(question, answer_a, answer_b, judge, ref_answer, multi_turn=F
    if model in ["gpt-3.5-turbo", "gpt-4"]:
        conv.system = system_prompt
        judgment = chat_compeletion_openai(model, conv, temperature=0, max_tokens=2048)
-    elif model in ["claude-v1", "claude-instant-v1"]:
-        if system_prompt != "You are a helpful assistant.":
-            user_prompt = "[Instruction]\n" + system_prompt + "\n\n" + user_prompt
-            conv.messages[0][1] = user_prompt
-        judgment = chat_compeletion_anthropic(
-            model, conv, temperature=0, max_tokens=1024
-        )
-        print(judgment)
-        assert False
    else:
        raise ValueError(f"Invalid judge model name: {model}")

@@ -422,6 +417,7 @@ def chat_compeletion_openai(model, conv, temperature, max_tokens):
    for _ in range(API_MAX_RETRY):
        try:
            messages = conv.to_openai_api_messages()
+            print(messages)
            response = openai.ChatCompletion.create(
                model=model,
                messages=messages,
@@ -438,53 +434,6 @@ def chat_compeletion_openai(model, conv, temperature, max_tokens):
    return output
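The surviving OpenAI path wraps the request in a bounded retry loop. A self-contained sketch of that pattern under the pre-1.0 `openai` SDK; the `API_RETRY_SLEEP` and `API_ERROR_OUTPUT` values are assumptions, as their definitions fall outside this hunk:

```python
import time

import openai

API_MAX_RETRY = 16
API_RETRY_SLEEP = 10          # assumed; defined earlier in common.py
API_ERROR_OUTPUT = "$ERROR$"  # assumed; defined earlier in common.py


def chat_completion_with_retry(model, messages, temperature=0.0, max_tokens=2048):
    """Call the chat API, retrying transient errors a bounded number of times."""
    output = API_ERROR_OUTPUT
    for _ in range(API_MAX_RETRY):
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=messages,
                temperature=temperature,
                max_tokens=max_tokens,
            )
            output = response["choices"][0]["message"]["content"]
            break  # success; stop retrying
        except openai.error.OpenAIError as e:
            print(type(e), e)
            time.sleep(API_RETRY_SLEEP)
    return output
```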


-def chat_compeletion_anthropic(model, conv, temperature, max_tokens):
-    output = API_ERROR_OUTPUT
-    for _ in range(API_MAX_RETRY):
-        try:
-            c = anthropic.Client(os.environ["ANTHROPIC_API_KEY"])
-            prompt = conv.get_prompt()
-            response = c.completion(
-                model=model,
-                prompt=prompt,
-                stop_sequences=[anthropic.HUMAN_PROMPT],
-                max_tokens_to_sample=max_tokens,
-                temperature=temperature,
-            )
-            output = response["completion"]
-            break
-        except anthropic.ApiException as e:
-            print(type(e), e)
-            time.sleep(API_RETRY_SLEEP)
-    return output.strip()
-
-
-def chat_compeletion_palm(chat_state, model, conv, temperature, max_tokens):
-    from fastchat.serve.api_provider import init_palm_chat
-
-    assert model == "palm-2-chat-bison-001"
-
-    if chat_state is None:
-        chat_state = init_palm_chat("chat-bison@001")
-
-    parameters = {
-        "temperature": temperature,
-        "top_p": 0.8,
-        "top_k": 40,
-        "max_output_tokens": max_tokens,
-    }
-    output = API_ERROR_OUTPUT
-    for _ in range(API_MAX_RETRY):
-        try:
-            response = chat_state.send_message(conv.messages[-2][1], **parameters)
-            output = response.text
-            break
-        except Exception as e:
-            print(type(e), e)
-            time.sleep(API_RETRY_SLEEP)
-    return chat_state, output


def normalize_game_key_single(gamekey, result):
    """Make the model names sorted in a game key."""
    qid, model_1, model_2 = gamekey
