revised scoring results without truncations

allenai · Jun 26, 2024 · 88704b7 · 88704b7
1 parent 4ffcbde
commit 88704b7
Show file tree

Hide file tree

Showing 54 changed files with 1,023,115 additions and 8 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 !evaluation/results_v2.0522/pairwise.v2/eval=gpt-4-turbo-2024-04-09/ref=gpt-4-turbo-2024-04-09/*.json
+!eval_results/v2.0625/score.v2
 local_scripts/
 # Byte-compiled / optimized / DLL files
 __pycache__/
@@ -203,5 +204,5 @@ evaluation/eval_template.no_checklist.md
 *.768-1024.json
 
 result_dirs/
-*.instant.json
- 
+*.instant.json 
+eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/_merged_0625_truncation.json
diff --git a/README.md b/README.md
@@ -166,7 +166,7 @@ To analyze the correlation between WildBench (v2) and human evaluation, we consi
 
 - [ ] LLM360/K2-Chat
 - [x] DeepSeek-V2-Code
-- [ ] Yi-large-preview
+- [x] Yi-large-preview
 - [x] THUDM/glm-4-9b-chat
 - [x] chujiezheng/neo_7b_instruct_v0.1-ExPO
 - [x] ZhangShenao/SELM-Llama-3-8B-Instruct-iter-3 

diff --git a/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Hermes-2-Theta-Llama-3-8B.json b/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Hermes-2-Theta-Llama-3-8B.json
diff --git a/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Llama-2-70b-chat-hf.json b/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Llama-2-70b-chat-hf.json
diff --git a/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Llama-2-7b-chat-hf.json b/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Llama-2-7b-chat-hf.json
diff --git a/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Llama-3-Instruct-8B-SimPO-ExPO.json b/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Llama-3-Instruct-8B-SimPO-ExPO.json
diff --git a/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Llama-3-Instruct-8B-SimPO.json b/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Llama-3-Instruct-8B-SimPO.json
diff --git a/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Meta-Llama-3-70B-Instruct.json b/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Meta-Llama-3-70B-Instruct.json
diff --git a/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Meta-Llama-3-8B-Instruct.json b/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Meta-Llama-3-8B-Instruct.json
diff --git a/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Mistral-7B-Instruct-v0.2.json b/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Mistral-7B-Instruct-v0.2.json
diff --git a/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Mixtral-8x7B-Instruct-v0.1.json b/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Mixtral-8x7B-Instruct-v0.1.json
diff --git a/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Nous-Hermes-2-Mixtral-8x7B-DPO.json b/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Nous-Hermes-2-Mixtral-8x7B-DPO.json
diff --git a/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Phi-3-medium-128k-instruct.json b/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Phi-3-medium-128k-instruct.json
diff --git a/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Phi-3-mini-128k-instruct.json b/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Phi-3-mini-128k-instruct.json
diff --git a/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Qwen1.5-72B-Chat-greedy.json b/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Qwen1.5-72B-Chat-greedy.json
diff --git a/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Qwen1.5-7B-Chat@together.json b/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Qwen1.5-7B-Chat@together.json
diff --git a/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Qwen2-72B-Instruct.json b/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Qwen2-72B-Instruct.json
diff --git a/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/SELM-Llama-3-8B-Instruct-iter-3.json b/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/SELM-Llama-3-8B-Instruct-iter-3.json
diff --git a/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/SELM-Zephyr-7B-iter-3.json b/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/SELM-Zephyr-7B-iter-3.json
diff --git a/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Starling-LM-7B-beta-ExPO.json b/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Starling-LM-7B-beta-ExPO.json
diff --git a/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Starling-LM-7B-beta.json b/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Starling-LM-7B-beta.json
diff --git a/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Yi-1.5-34B-Chat.json b/eval_results/v2.0625/score.v2/eval=gpt-4o-2024-05-13/Yi-1.5-34B-Chat.json