Commit

add neo 7b
yuchenlin committed Jun 14, 2024
1 parent b2324ae commit e705a4f
Showing 7 changed files with 106,432 additions and 7 deletions.
9 changes: 5 additions & 4 deletions README.md
@@ -157,7 +157,7 @@ To analyze the correlation between WildBench (v2) and human evaluation, we consi

### Models pending testing

- [ ] m-a-p/neo_7b_instruct_v0.1
- [x] m-a-p/neo_7b_instruct_v0.1
- [ ] GLM-4
- [x] Reka Flash
- [x] DeepSeekV2-Chat
@@ -214,6 +214,7 @@ python src/upload_results.py reka-flash-20240226
python src/upload_results.py deepseekv2-chat
python src/upload_results.py reka-edge
python src/upload_results.py reka-core-20240501
python src/upload_results.py neo_7b_instruct_v0.1
### Submit Batch Jobs
@@ -238,6 +239,7 @@ bash evaluation/run_all_eval_batch.sh deepseekv2-chat
bash evaluation/run_all_eval_batch.sh reka-edge
bash evaluation/run_all_eval_batch.sh reka-core-20240501
bash evaluation/run_all_eval_batch.sh neo_7b_instruct_v0.1
### Check Batch Status
python src/openai_batch_eval/check_batch_status_with_model_name.py command-r-plus
@@ -249,18 +251,17 @@ python src/openai_batch_eval/check_batch_status_with_model_name.py SELM-Zephyr-7
python src/openai_batch_eval/check_batch_status_with_model_name.py Qwen2-72B-Instruct
python src/openai_batch_eval/check_batch_status_with_model_name.py gemini-1.5-flash
python src/openai_batch_eval/check_batch_status_with_model_name.py gemini-1.5-pro
python src/openai_batch_eval/check_batch_status_with_model_name.py Llama-3-Instruct-8B-SimPO-ExPO
python src/openai_batch_eval/check_batch_status_with_model_name.py Starling-LM-7B-beta-ExPO
python src/openai_batch_eval/check_batch_status_with_model_name.py Qwen1.5-72B-Chat-greedy
python src/openai_batch_eval/check_batch_status_with_model_name.py yi-large
python src/openai_batch_eval/check_batch_status_with_model_name.py reka-flash-20240226
python src/openai_batch_eval/check_batch_status_with_model_name.py deepseekv2-chat
python src/openai_batch_eval/check_batch_status_with_model_name.py reka-edge
python src/openai_batch_eval/check_batch_status_with_model_name.py reka-core-20240501
python src/openai_batch_eval/check_batch_status_with_model_name.py neo_7b_instruct_v0.1
python src/view_wb_eval.py score
python src/view_wb_eval.py pairwise-gpt4t -1
python src/view_wb_eval.py pairwise-haiku -1
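
Taken together, the commands above form the full pipeline for the model added in this commit. Below is a minimal sketch of that end-to-end flow, assembled only from the commands listed in this README; it assumes the model's generation results for WildBench v2 are already in the expected results directory:

```bash
# 1. Upload the model's generation results
python src/upload_results.py neo_7b_instruct_v0.1

# 2. Submit the OpenAI batch jobs (individual scoring + pairwise evals)
bash evaluation/run_all_eval_batch.sh neo_7b_instruct_v0.1

# 3. Check whether the submitted batch jobs have finished
python src/openai_batch_eval/check_batch_status_with_model_name.py neo_7b_instruct_v0.1

# 4. View the updated leaderboard tables
python src/view_wb_eval.py score
python src/view_wb_eval.py pairwise-gpt4t -1
python src/view_wb_eval.py pairwise-haiku -1
```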

20,482 changes: 20,482 additions & 0 deletions eval_results/v2.0522/score.v2/eval=gpt-4o-2024-05-13/neo_7b_instruct_v0.1.json

Large diffs are not rendered by default.

6 changes: 4 additions & 2 deletions evaluation/run_all_eval_batch.sh
@@ -1,5 +1,5 @@
MODEL=$1 # your model name
bash evaluation/run_eval_v2_batch.score.sh $MODEL # individual scoring
bash evaluation/run_eval_v2_batch.score.sh $MODEL # individual scoring with GPT-4O (since June 13, 2024)
bash evaluation/run_eval_v2_batch.sh $MODEL gpt-4-turbo-2024-04-09 # pairwise eval with gpt-4-turbo
bash evaluation/run_eval_v2_batch.sh $MODEL claude-3-haiku-20240307 # pairwise eval with Claude-3-Haiku
bash evaluation/run_eval_v2_batch.sh $MODEL Llama-2-70b-chat-hf # pairwise eval with Llama-2-70b-chat
@@ -8,6 +8,8 @@ bash evaluation/run_eval_v2_batch.sh $MODEL Llama-2-70b-chat-hf # pairwise eval
python src/openai_batch_eval/submit_batch.py eval_results/v2.0522/pairwise.v2/eval=gpt-4-turbo-2024-04-09/ref=gpt-4-turbo-2024-04-09/$MODEL.batch-submit.jsonl
python src/openai_batch_eval/submit_batch.py eval_results/v2.0522/pairwise.v2/eval=gpt-4-turbo-2024-04-09/ref=claude-3-haiku-20240307/$MODEL.batch-submit.jsonl
python src/openai_batch_eval/submit_batch.py eval_results/v2.0522/pairwise.v2/eval=gpt-4-turbo-2024-04-09/ref=Llama-2-70b-chat-hf/$MODEL.batch-submit.jsonl
python src/openai_batch_eval/submit_batch.py eval_results/v2.0522/score.v2/eval=gpt-4-turbo-2024-04-09/$MODEL.batch-submit.jsonl
# python src/openai_batch_eval/submit_batch.py eval_results/v2.0522/score.v2/eval=gpt-4-turbo-2024-04-09/$MODEL.batch-submit.jsonl
python src/openai_batch_eval/submit_batch.py eval_results/v2.0522/score.v2/eval=gpt-4o-2024-05-13/$MODEL.batch-submit.jsonl


# python src/openai_batch_eval/check_batch_status_with_model_name.py $MODEL
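
For reference, this wrapper takes a single positional argument (the model name); after this commit, the score batch is submitted from the eval=gpt-4o-2024-05-13 directory rather than the gpt-4-turbo one. A typical invocation for the model added here (a sketch, assuming its generation results have already been uploaded) is:

```bash
# Runs GPT-4o individual scoring plus the three pairwise evals, then submits the batch files
bash evaluation/run_all_eval_batch.sh neo_7b_instruct_v0.1
```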
2 changes: 1 addition & 1 deletion evaluation/run_eval_v2_instant.score.sh
@@ -1,6 +1,6 @@
model_name=$1 # model to test
# by default use "gpt-4o-2024-05-13" as gpt_eval_name
gpt_eval_name=${2:-"gpt-4-turbo-2024-04-09"} # evaluator name # gpt-4-0125-preview
gpt_eval_name=${2:-"gpt-4o-2024-05-13"} # evaluator name # gpt-4-0125-preview
num_shards=${3:-8} # shards

total_ex=1024
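
Since the evaluator and shard count use `${2:-...}` / `${3:-...}` defaults, the second and third arguments are optional. A small sketch of both invocation styles (assuming the script is run from the repository root):

```bash
# Use the new default evaluator (gpt-4o-2024-05-13) and the default 8 shards
bash evaluation/run_eval_v2_instant.score.sh neo_7b_instruct_v0.1

# Or name the evaluator and shard count explicitly
bash evaluation/run_eval_v2_instant.score.sh neo_7b_instruct_v0.1 gpt-4o-2024-05-13 8
```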
