import logging
import math
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime

import pandas as pd

from dataset_preprocessing import load_from_jsonl, write_to_jsonl
# Qwen, Gemini and GLM are drop-in alternatives for model_class below.
from GeneralLLM import LargeLanguageModel, Qwen, ChatGPT, Gemini, GLM
from KPPerturbation import MultipleChoiceQuestion, get_mcq_llm_answer, QuestionRewriter

logging.basicConfig(level=logging.INFO)

def test_dataset(data: list, model: LargeLanguageModel) -> list:
    '''
    Query the model with every question in `data` and collect the output logs.

    Args:
        data: List[dict]. Each element contains one question, with
            keys = ['subject', 'id', 'question', 'options', 'option_ids',
                    'correct', 'question_first'].
        model: LargeLanguageModel. The LLM API wrapper to benchmark.

    Returns:
        List[dict]. The list containing all output logs, with
            keys = ['subject', 'id', 'prompt', 'question_text', 'options_text',
                    'true_answer', 'model', 'model_output', 'model_original_output'].
    '''
    results = []
    n_complete = 0
    for elem in data:
        mcq = MultipleChoiceQuestion()
        mcq.load_dict(elem)
        prompt = mcq.get_prompt()
        response, response_text = get_mcq_llm_answer(mcq, model)
        results.append({
            'subject': elem['subject'],
            'id': elem['id'],
            'prompt': prompt,
            'question_text': mcq.question,
            'options_text': mcq.options,
            'true_answer': mcq.correct,
            'model': model.model,
            'model_output': response,
            'model_original_output': response_text
        })
        n_complete += 1
        # Briefly pause between requests to respect API rate limits.
        time.sleep(0.2)
        if n_complete % 10 == 0:
            logging.info(f'Thread progress: {n_complete} / {len(data)} samples completed.')
    return results
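
# A smoke-test sketch (illustrative, not part of the benchmark run): pass a
# handful of questions through one model before launching a full parallel run.
# The slice size and output path here are assumptions, not fixed by this script.
#
#     sample = load_from_jsonl('./eval_data/test.jsonl')[:5]
#     logs = test_dataset(sample, ChatGPT(model='gpt-3.5-turbo', temperature=0.2))
#     write_to_jsonl(logs, './log/smoke_test.jsonl', 'a')
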
def parallel_test_dataset(file_path: str, log_path_prefix: str, model_class: type,
                          model_selection: str,
                          temperature: float,
                          subjects: list = None,
                          thread_func=test_dataset,
                          n_thread=4,
                          start_id=None, end_id=None,
                          simple_question_path=None) -> str:
    '''
    Benchmark the model on `file_path` using `n_thread` worker threads and
    return the path of the JSONL log that the threads append to.
    '''
    # 1. Read the data; keep a question if it belongs to a requested subject
    #    (or to the simple-question list when one is given). subjects = None
    #    keeps every subject.
    raw_data = load_from_jsonl(file_path)
    data = []
    if simple_question_path is None:
        for item in raw_data:
            if subjects is None or item['subject'] in subjects:
                data.append(item)
    else:
        logging.info('Question selection strategy: simple_questions')
        df_sq = pd.read_csv(simple_question_path)
        for item in raw_data:
            if df_sq[(df_sq['subject'] == item['subject']) & (df_sq['id'] == item['id'])].shape[0] > 0:
                data.append(item)
    if start_id is None:
        start_id = 0
    if end_id is None:
        end_id = len(data)
    data = data[start_id:end_id]
    print(f"# of samples = {len(data)}")
    # 2. Split the data into disjoint chunks, one per thread.
    chunk_size = math.ceil(len(data) / n_thread)
    data_chunks = []
    for i in range(n_thread):
        data_chunks.append(data[i * chunk_size:(i + 1) * chunk_size])
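    # Worked example (hypothetical sizes): 1002 questions with n_thread = 8
    # give chunk_size = math.ceil(1002 / 8) = 126, i.e. seven chunks of 126
    # questions and a final chunk of 120; slicing clamps at the end of the list.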
    # 3. Concurrently collect the model's responses on each chunk. The
    #    timestamp is formatted without spaces or colons so the log file name
    #    is valid on all platforms.
    log_path = f"{log_path_prefix}_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.jsonl"
    logging.info(f"Log path: {log_path}")
    with ThreadPoolExecutor(max_workers=n_thread) as executor:
        futures = [executor.submit(
            thread_func, data=chunk,
            model=model_class(model=model_selection, temperature=temperature))
            for chunk in data_chunks]
        # Append each chunk's results to the log as its thread finishes.
        n_complete = 1
        for future in as_completed(futures):
            try:
                result = future.result()
                write_to_jsonl(result, log_path, 'a')
                logging.info(f'{n_complete} / {n_thread} thread(s) completed.')
                n_complete += 1
            except Exception as exc:
                logging.error(f'Task generated an exception: {exc}')
    return log_path
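
# A resume sketch (hypothetical indices): because results are appended to the
# log chunk by chunk, an interrupted run can be picked up by re-invoking the
# function on the untested slice of the dataset, e.g.:
#
#     parallel_test_dataset(file_path='./eval_data/test.jsonl',
#                           log_path_prefix='./log/gpt-3.5-turbo-test',
#                           model_class=ChatGPT, model_selection='gpt-3.5-turbo',
#                           temperature=0.2, start_id=800, end_id=None)
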
if __name__ == "__main__":
    file_paths = [
        './eval_data/test.jsonl'
    ]
    log_path_prefixes = [
        './log/gpt-3.5-turbo-test'
    ]
    for file_path, log_path_prefix in zip(file_paths, log_path_prefixes):
        print(f'file_path = {file_path}')
        print(f'log_path_prefix = {log_path_prefix}')
        start_time = time.time()
        parallel_test_dataset(
            file_path=file_path,
            log_path_prefix=log_path_prefix,
            simple_question_path=None,
            model_class=ChatGPT,
            model_selection='gpt-3.5-turbo',
            temperature=0.2,
            thread_func=test_dataset,
            n_thread=8,
            start_id=None,
            end_id=None
        )
        run_time = time.time() - start_time
        print(f"Running time = {run_time:.2f} s")