From ab0a6a89d9870e6fb216daf45a38060378a5b6c0 Mon Sep 17 00:00:00 2001 From: yuchenlin Date: Wed, 3 Jul 2024 06:53:20 +0000 Subject: [PATCH] update the readme with a quick start --- README.md | 16 + leaderboard/data_dir/wb_elo_results.json | 528 +++++++++++------------ leaderboard/show_eval.sh | 4 +- 3 files changed, 282 insertions(+), 266 deletions(-) diff --git a/README.md b/README.md index 74ee107..2adb098 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,22 @@ +## Quick Start + +```bash +HF_MODEL_ID="Magpie-Align/Llama-3-8B-Magpie-Align-v0.1" # example model id +MODEL_PRETTY_NAME="Llama-3-8B-Magpie-Align-v0.1" # example model name +NUM_GPUS=4 # depending on your hardwares; +# do inference on WildBench +bash scripts/_common_vllm.sh $HF_MODEL_ID $MODEL_PRETTY_NAME $NUM_GPUS +# submit to OpenAI for eval (WB-Score) +bash evaluation/run_score_eval_batch.sh ${MODEL_PRETTY_NAME} +# check the batch job status +python src/openai_batch_eval/check_batch_status_with_model_name.py ${MODEL_PRETTY_NAME} +# show the table +bash leaderboard/show_eval.sh score_only +``` + ## How to add a new model to 🦁 WildBench benchmark diff --git a/leaderboard/data_dir/wb_elo_results.json b/leaderboard/data_dir/wb_elo_results.json index 76db4c5..10dbdc1 100644 --- a/leaderboard/data_dir/wb_elo_results.json +++ b/leaderboard/data_dir/wb_elo_results.json @@ -5,439 +5,439 @@ "margin": 3, "tie_margin": 2, "dynamic": true, - "time": "2024-06-28 16:26:50", + "time": "2024-06-28 16:50:43", "interval": 16, "use_regressed_as_init": false, "length_margin": -1 }, "elo_stat": { "gpt-4o-2024-05-13": { - "avg": 1273.482903363584, - "std": 2.5393297753174995, - "median": 1273.704408488013, + "avg": 1273.6944696103308, + "std": 2.2695536167569696, + "median": 1273.4259372709205, "ci": [ - 1269.2099403804073, - 1278.407384886632 + 1269.7921056165535, + 1278.2840004139964 ], "init_elo": 1282.0 }, "claude-3-5-sonnet-20240620": { - "avg": 1258.7245937129444, - "std": 2.1407924261921467, - "median": 1258.5513897679846, + "avg": 1258.9357280627125, + "std": 1.9311229253283435, + "median": 1258.7017355957307, "ci": [ - 1255.3487617417827, - 1263.1194932664123 + 1255.8517783142427, + 1262.612081774449 ], "init_elo": 1266.0 }, "gemini-1.5-pro": { - "avg": 1240.607940993521, - "std": 3.0104177856475025, - "median": 1240.5214071925761, + "avg": 1240.3697430885268, + "std": 2.97765811827672, + "median": 1240.0974056170412, "ci": [ - 1235.3724552423923, - 1245.8986733072418 + 1234.7145506566799, + 1245.720838317004 ], "init_elo": 1251.0 }, "gpt-4-turbo-2024-04-09": { - "avg": 1239.5937016718508, - "std": 2.722290653714309, - "median": 1239.5224998415406, + "avg": 1240.116586076255, + "std": 2.8984954887483134, + "median": 1240.1709340525667, "ci": [ - 1234.872419809773, - 1244.9326458652438 + 1234.7160022449902, + 1245.3123858588656 ], "init_elo": 1248.0 }, "gpt-4-0125-preview": { - "avg": 1228.8633011905017, - "std": 2.7570920567232307, - "median": 1228.8041053360926, + "avg": 1228.5195336603504, + "std": 2.7631688935310343, + "median": 1228.2200662573875, "ci": [ - 1224.6177327454893, - 1234.2948688316644 + 1223.224687470263, + 1233.7110872068065 ], "init_elo": 1237.0 }, "yi-large-preview": { - "avg": 1224.2623808284445, - "std": 3.120497926531944, - "median": 1224.0261465859585, + "avg": 1224.199307151649, + "std": 2.967299293610201, + "median": 1223.9526384970068, "ci": [ - 1219.0174052987059, - 1230.2698410321261 + 1219.5815318177051, + 1231.1115492458753 ], "init_elo": 1231.0 }, "claude-3-opus-20240229": { - "avg": 1223.1736651649583, - "std": 2.97611049341583, - "median": 1222.6690143061157, + "avg": 1223.6666718823901, + "std": 2.7943050290918383, + "median": 1223.558687858616, "ci": [ - 1218.223240888589, - 1229.379999341911 + 1218.1785353209073, + 1229.1414458810118 ], "init_elo": 1230.0 }, "Meta-Llama-3-70B-Instruct": { - "avg": 1209.214844699167, - "std": 1.388121305460629, - "median": 1209.0349443337982, + "avg": 1209.281392569349, + "std": 1.8902283963893667, + "median": 1208.9568376437542, "ci": [ - 1206.6394532053466, - 1211.9709987845886 + 1206.4946693720456, + 1213.89334319003 ], "init_elo": 1213.0 }, "gemini-1.5-flash": { - "avg": 1207.9602542950745, - "std": 2.8400065446404867, - "median": 1207.6629137058717, + "avg": 1207.3310216021384, + "std": 2.7141860036019763, + "median": 1207.1940656798788, "ci": [ - 1202.5968953425268, - 1213.5825641254123 + 1202.1038037646433, + 1212.666325427604 ], "init_elo": 1213.0 }, "deepseek-coder-v2": { - "avg": 1200.0850690957973, - "std": 2.0826030927120014, - "median": 1199.820897856044, + "avg": 1200.1667696924267, + "std": 2.131430442739319, + "median": 1200.1747438123589, "ci": [ - 1196.0502219929695, - 1204.9631548203683 + 1196.1656672759632, + 1204.6571783019751 ], "init_elo": 1203.0 }, "yi-large": { - "avg": 1198.1629838433032, - "std": 2.808501150772543, - "median": 1198.1938190681462, + "avg": 1198.1260036175222, + "std": 2.5280570627873455, + "median": 1198.0470557273243, "ci": [ - 1193.2724181432297, - 1203.1387947474514 + 1194.0287120309672, + 1203.7213195530837 ], "init_elo": 1202.0 }, "claude-3-sonnet-20240229": { - "avg": 1184.6424613114752, - "std": 1.909049883539937, - "median": 1184.5392599055776, + "avg": 1184.5929417407706, + "std": 1.5690377363949868, + "median": 1184.605835917536, "ci": [ - 1181.394352038418, - 1188.557261729875 + 1181.7335015541596, + 1187.5215879089883 ], "init_elo": 1187.0 }, "Qwen2-72B-Instruct": { - "avg": 1182.3610337523694, - "std": 2.0940050252094533, - "median": 1181.9253279147047, + "avg": 1182.1973500505992, + "std": 2.138903502962538, + "median": 1182.0638036672824, "ci": [ - 1178.733131847299, - 1186.7409729361723 + 1178.662979845661, + 1186.7257395696633 ], "init_elo": 1184.0 }, "deepseekv2-chat": { - "avg": 1178.6626392091475, - "std": 7.03117104548419, - "median": 1178.2572955398173, + "avg": 1179.012452558526, + "std": 5.614531714911026, + "median": 1178.4279673119595, "ci": [ - 1164.0226987137644, - 1194.0225964354854 + 1169.2156392546076, + 1190.2308537916963 ], "init_elo": "-" }, "nemotron-4-340b-instruct": { - "avg": 1178.3496598486897, - "std": 2.2815967743291043, - "median": 1178.383422286196, + "avg": 1178.4614633609426, + "std": 2.7461771076108192, + "median": 1178.2859085672842, "ci": [ - 1174.460231760366, - 1183.2263963630949 + 1173.9192171370382, + 1184.6259498311554 ], "init_elo": 1179.0 }, "reka-core-20240501": { - "avg": 1175.4652597185925, - "std": 2.379315537890935, - "median": 1175.4925152113528, + "avg": 1175.7422873390105, + "std": 2.2891786013426296, + "median": 1175.5788951841325, "ci": [ - 1171.2384833118347, - 1179.7736254756508 + 1172.1513803269045, + 1180.6863628950496 ], "init_elo": 1176.0 }, "claude-3-haiku-20240307": { - "avg": 1168.4893453399293, - "std": 1.65343048868204, - "median": 1168.479303587149, + "avg": 1168.9657645568943, + "std": 1.6443232417371922, + "median": 1168.8087228033332, "ci": [ - 1165.4925241402386, - 1172.0471048148788 + 1166.1872748010583, + 1172.3898783675495 ], "init_elo": 1171.0 }, "Qwen1.5-72B-Chat": { - "avg": 1165.884665990428, - "std": 5.831299030226142, - "median": 1166.0868967652177, + "avg": 1164.7312148361896, + "std": 6.487887975171908, + "median": 1165.079651187285, "ci": [ - 1155.1055958915176, - 1175.4586631896345 + 1153.9121776310608, + 1178.113593842754 ], "init_elo": "-" }, "Yi-1.5-34B-Chat": { - "avg": 1160.634261817117, - "std": 2.1205404166077386, - "median": 1160.3782947787465, + "avg": 1160.678161630808, + "std": 2.268093896599675, + "median": 1160.4367539254831, "ci": [ - 1156.7593243286717, - 1164.630153322979 + 1157.0638180706683, + 1164.9788206766023 ], "init_elo": 1160.0 }, "mistral-large-2402": { - "avg": 1157.6057872921183, - "std": 2.3076354293800097, - "median": 1157.6313527685893, + "avg": 1158.0445170170678, + "std": 2.568028787147557, + "median": 1157.9137769329313, "ci": [ - 1153.1245121765257, - 1161.9277229611082 + 1153.5599370802452, + 1162.9451018832365 ], "init_elo": 1158.0 }, "command-r-plus": { - "avg": 1153.5998363389945, - "std": 2.4114240901869533, - "median": 1153.771561051433, + "avg": 1153.9576595330482, + "std": 2.7123580363524473, + "median": 1153.74148290193, "ci": [ - 1149.1054982030628, - 1157.6686788231284 + 1149.2193133509227, + 1159.2398437858374 ], "init_elo": 1155.0 }, "glm-4-9b-chat": { - "avg": 1153.3771423726205, - "std": 6.430173119328217, - "median": 1152.532570557055, + "avg": 1153.8617209270687, + "std": 6.093674487894008, + "median": 1153.6461509951043, "ci": [ - 1141.5951214689383, - 1164.5200970293301 + 1143.4110914750322, + 1164.9855009032285 ], "init_elo": "-" }, "Yi-1.5-9B-Chat": { - "avg": 1153.2501830985038, - "std": 6.18189889018411, - "median": 1152.4390407117164, + "avg": 1153.0154482705366, + "std": 6.37132155935714, + "median": 1152.3626428951625, "ci": [ - 1143.9548207489709, - 1165.7249525571726 + 1141.7629233349273, + 1166.1331042142435 ], "init_elo": "-" }, "Llama-3-Instruct-8B-SimPO": { - "avg": 1149.478530675232, - "std": 6.926241577241614, - "median": 1149.8708110364478, + "avg": 1150.6507162227235, + "std": 6.761684560480008, + "median": 1151.2139347790755, "ci": [ - 1137.103074489702, - 1161.3000929158745 + 1137.7937827751418, + 1162.571536433003 ], "init_elo": "-" }, "Llama-3-Instruct-8B-SimPO-ExPO": { - "avg": 1146.1478698004746, - "std": 6.305252987006804, - "median": 1146.1929621232948, + "avg": 1145.9338663246317, + "std": 5.755036189156874, + "median": 1145.9932350977451, "ci": [ - 1135.8220514439383, - 1159.115001539058 + 1136.6182983112976, + 1156.784351630181 ], "init_elo": "-" }, "SELM-Llama-3-8B-Instruct-iter-3": { - "avg": 1145.8969229965667, - "std": 6.71179559323931, - "median": 1145.8240882987677, + "avg": 1145.4969057369353, + "std": 7.379786305404441, + "median": 1145.032382475327, "ci": [ - 1132.988213045544, - 1160.641559677708 + 1132.2081111897853, + 1159.5072908728978 ], "init_elo": "-" }, - "Meta-Llama-3-8B-Instruct": { - "avg": 1144.154588476019, - "std": 2.573676673217022, - "median": 1144.358968367211, + "Qwen1.5-72B-Chat-greedy": { + "avg": 1144.4358478383701, + "std": 1.806738708907512, + "median": 1144.5051889415336, "ci": [ - 1138.5935859657443, - 1148.5327837848922 + 1140.7091626239403, + 1147.6060857690386 ], - "init_elo": 1146.0 + "init_elo": 1142.0 }, - "Qwen1.5-72B-Chat-greedy": { - "avg": 1144.0199097616917, - "std": 1.8129383988075978, - "median": 1144.040115969045, + "Meta-Llama-3-8B-Instruct": { + "avg": 1144.3040003324938, + "std": 2.5230972187723104, + "median": 1144.3654643422703, "ci": [ - 1140.9039597280373, - 1147.9410521804418 + 1138.8951515345798, + 1148.7207062875932 ], - "init_elo": 1142.0 + "init_elo": 1146.0 }, "Starling-LM-7B-beta-ExPO": { - "avg": 1135.860162771182, - "std": 6.088595756911488, - "median": 1135.466190808198, + "avg": 1137.2417575308302, + "std": 6.10294319660401, + "median": 1137.4261149860954, "ci": [ - 1124.1166678573982, - 1147.4495192263194 + 1126.3038793940989, + 1147.255082692601 ], "init_elo": "-" }, "Hermes-2-Theta-Llama-3-8B": { - "avg": 1135.37360363209, - "std": 6.336633304023114, - "median": 1135.87248651025, + "avg": 1136.175409309469, + "std": 6.611804261279242, + "median": 1136.0210134950307, "ci": [ - 1123.649769448853, - 1148.2412241194627 + 1121.4366061548174, + 1148.9700290061444 ], "init_elo": "-" }, "Phi-3-medium-128k-instruct": { - "avg": 1132.2814683649437, - "std": 6.079605809710065, - "median": 1132.4923887715136, + "avg": 1133.2233869321744, + "std": 6.609617281487586, + "median": 1133.0193584813924, "ci": [ - 1120.3202383546793, - 1144.72250634332 + 1120.448028512938, + 1144.3513066601556 ], "init_elo": "-" }, "reka-flash-20240226": { - "avg": 1129.4217078094814, - "std": 1.696123747592209, - "median": 1129.3813283995837, + "avg": 1129.6684213426745, + "std": 1.5396752974424324, + "median": 1129.5707177504432, "ci": [ - 1126.6227480414166, - 1133.104083799901 + 1126.5048681674114, + 1132.8763930620278 ], "init_elo": 1128.0 }, "SELM-Zephyr-7B-iter-3": { - "avg": 1124.9060496076154, - "std": 6.863363353707567, - "median": 1124.9852449753625, + "avg": 1124.2521153933094, + "std": 6.818965562116768, + "median": 1124.0581973452754, "ci": [ - 1112.8297592241654, - 1138.0218847056444 + 1111.5769966061418, + 1136.9690067730064 ], "init_elo": "-" }, "neo_7b_instruct_v0.1": { - "avg": 1122.3460401813793, - "std": 7.032337244463502, - "median": 1123.0461729845747, + "avg": 1123.1412220485615, + "std": 6.336054078746031, + "median": 1123.7939919524056, "ci": [ - 1108.1087597176984, - 1135.1301102606421 + 1109.0347120466274, + 1133.129062524871 ], "init_elo": "-" }, "neo_7b_instruct_v0.1-ExPO": { - "avg": 1120.8885787318018, - "std": 6.297518415385689, - "median": 1120.8122579265655, + "avg": 1119.8172208273575, + "std": 6.918274669772453, + "median": 1119.8565959551668, "ci": [ - 1109.1413337331348, - 1132.3766136999604 + 1106.9600189186174, + 1132.6662839742232 ], "init_elo": "-" }, "Mixtral-8x7B-Instruct-v0.1": { - "avg": 1118.0679751768923, - "std": 3.0912553084151724, - "median": 1117.9771751116587, + "avg": 1118.2071067154934, + "std": 2.746370042527508, + "median": 1118.3386405904266, "ci": [ - 1112.6864964220147, - 1123.8450543420481 + 1112.7987866576625, + 1122.7575338218364 ], "init_elo": 1114.0 }, "Starling-LM-7B-beta": { - "avg": 1116.4826821485628, - "std": 1.9813491784506625, - "median": 1116.7233933970351, + "avg": 1117.3641666167325, + "std": 2.106482668498819, + "median": 1117.3585189854375, "ci": [ - 1111.6461711029917, - 1120.3044669688707 + 1113.9623678392857, + 1122.058802124706 ], "init_elo": 1114.0 }, - "Yi-1.5-6B-Chat": { - "avg": 1115.9875441565575, - "std": 7.25921013958499, - "median": 1116.1971010839125, + "dbrx-instruct": { + "avg": 1114.494174587617, + "std": 2.2807097189400567, + "median": 1114.503324252286, "ci": [ - 1102.6432127567036, - 1133.6723004766554 + 1110.4842550865337, + 1118.1863122483928 ], - "init_elo": "-" + "init_elo": 1111.0 }, - "dbrx-instruct": { - "avg": 1114.4694626749388, - "std": 1.9293531347992723, - "median": 1114.58455964016, + "Yi-1.5-6B-Chat": { + "avg": 1114.103557914774, + "std": 5.683723898182768, + "median": 1113.8918083181966, "ci": [ - 1110.5547602857896, - 1118.11684033064 + 1102.1096693464528, + 1125.1279488567523 ], - "init_elo": 1111.0 + "init_elo": "-" }, "reka-edge": { - "avg": 1112.0541289012276, - "std": 6.867068360504895, - "median": 1111.9428919736852, + "avg": 1113.1686164740258, + "std": 7.877176078612755, + "median": 1113.4537175223022, "ci": [ - 1100.5849443946593, - 1127.0916484540676 + 1096.8057329908306, + 1125.7527668797097 ], "init_elo": "-" }, "gpt-3.5-turbo-0125": { - "avg": 1111.4614847723099, - "std": 2.5890837678251355, - "median": 1111.224895198749, + "avg": 1111.493588339155, + "std": 2.9886424766513513, + "median": 1111.4076842576035, "ci": [ - 1106.4755919400397, - 1116.2201801360404 + 1106.4399738297734, + 1117.1331052290163 ], "init_elo": 1107.0 }, "command-r": { - "avg": 1109.9013556542359, - "std": 2.444725818822553, - "median": 1109.6891597674942, + "avg": 1109.0202670669578, + "std": 2.4935912977523187, + "median": 1109.1630756220532, "ci": [ - 1104.7867183397695, - 1115.4991577527962 + 1104.418184289586, + 1114.136738120882 ], "init_elo": 1106.0 }, "tulu-2-dpo-70b": { - "avg": 1105.2411680291743, - "std": 2.177682395912544, - "median": 1105.1754515344999, + "avg": 1105.1617024172413, + "std": 2.36405488064591, + "median": 1105.2907760688968, "ci": [ - 1100.1181553385102, - 1109.6197564620782 + 1100.2525690489517, + 1109.4910095618568 ], "init_elo": 1100.0 }, @@ -452,42 +452,42 @@ "init_elo": 1099.0 }, "Mistral-7B-Instruct-v0.2": { - "avg": 1081.8192113845516, - "std": 2.423709141860324, - "median": 1081.8091999801295, + "avg": 1081.946032537576, + "std": 2.2827852496508623, + "median": 1082.175018424738, "ci": [ - 1077.3602798042134, - 1086.2648829291495 + 1076.9400141709914, + 1085.7942494178064 ], "init_elo": 1073.0 }, "Llama-2-70b-chat-hf": { - "avg": 1075.733583109572, - "std": 1.288518501038266, - "median": 1075.7115750284029, + "avg": 1075.7898082520633, + "std": 1.3121860228332556, + "median": 1075.760000046731, "ci": [ - 1073.3964869169413, - 1078.080934859178 + 1073.0242278252626, + 1078.19953318515 ], "init_elo": 1072.0 }, "Qwen1.5-7B-Chat": { - "avg": 1067.330832115312, - "std": 2.893999912282277, - "median": 1067.5987051214995, + "avg": 1067.31406349972, + "std": 2.7090611645334133, + "median": 1067.6005025282532, "ci": [ - 1062.11438179343, - 1072.4177566975627 + 1062.6316002904828, + 1071.8962524867297 ], "init_elo": 1058.0 }, "Nous-Hermes-2-Mixtral-8x7B-DPO": { - "avg": 1060.9568573614201, - "std": 2.7359983306703928, - "median": 1060.9685341967452, + "avg": 1061.2672318890118, + "std": 2.5594329084181515, + "median": 1061.105041505194, "ci": [ - 1055.3343331494573, - 1065.7047630008017 + 1056.0427773886274, + 1065.6164524599988 ], "init_elo": 1047.0 }, @@ -502,22 +502,22 @@ "init_elo": 1050.0 }, "gemma-7b-it": { - "avg": 1049.4886438187582, - "std": 2.8186060267624033, - "median": 1049.5823971565837, + "avg": 1049.6868037599752, + "std": 2.7137750639329026, + "median": 1049.5134832309593, "ci": [ - 1043.7734930342676, - 1054.7476848074934 + 1044.6649545742973, + 1055.0003135957954 ], "init_elo": 1047.0 }, "Phi-3-mini-128k-instruct": { - "avg": 1047.483930512397, - "std": 2.4916253985950343, - "median": 1047.87617146899, + "avg": 1047.1260782807199, + "std": 1.970337493932913, + "median": 1047.243295916965, "ci": [ - 1042.8617655406827, - 1051.6269411373407 + 1043.178128027113, + 1050.3877397395927 ], "init_elo": 1038.0 }, @@ -532,12 +532,12 @@ "init_elo": 1029.0 }, "Llama-2-7b-chat-hf": { - "avg": 1019.342798467166, - "std": 2.1133623776804984, - "median": 1019.4401658721845, + "avg": 1019.0033395743619, + "std": 2.2752323229449396, + "median": 1019.1606405434836, "ci": [ - 1014.9956757087675, - 1023.2017820615134 + 1013.386915265656, + 1022.5466615493968 ], "init_elo": 1012.0 }, @@ -552,12 +552,12 @@ "init_elo": 1005.0 }, "gemma-2b-it": { - "avg": 979.8542879266165, - "std": 2.096207036753793, - "median": 979.881712897273, + "avg": 979.5335868573617, + "std": 2.6487725615318842, + "median": 979.4492623267394, "ci": [ - 975.7247111561979, - 983.8561624493643 + 975.2600266786984, + 984.9505704242133 ], "init_elo": 978.0 } diff --git a/leaderboard/show_eval.sh b/leaderboard/show_eval.sh index b6b43c1..656079f 100644 --- a/leaderboard/show_eval.sh +++ b/leaderboard/show_eval.sh @@ -2,8 +2,8 @@ MODE=$1 -margin=3;tie_margin=2;K=4;dynamic=True -python -m leaderboard.wb_elo --K $K --margin $margin --tie_margin $tie_margin --num_rounds 100 --dynamic $dynamic +margin=3;tie_margin=2;K=4;dynamic=True;interval=16 +python -m leaderboard.wb_elo --K $K --margin $margin --tie_margin $tie_margin --num_rounds 100 --dynamic $dynamic --interval $interval --num_processes 4 # if MODE is not score if [ "$MODE" != "score_only" ];