-
Notifications
You must be signed in to change notification settings - Fork 0
/
caa_refusal_testing.py
29 lines (21 loc) · 1.07 KB
/
caa_refusal_testing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import json
from tqdm import tqdm
from transformers import AutoTokenizer
from steering.refusal_test_open_ended import eval_prompt, EVAL_MODEL
if __name__ == "__main__":
tokenizer = AutoTokenizer.from_pretrained(EVAL_MODEL)
with open("/home/adam/src/CAA/datasets/generate/refusal/generate_openresponse_dataset.json") as f:
data = json.load(f)
for d in (data):
print(d["question"])
match_eval = eval_prompt(d["question"], d["answer_matching_behavior"], tokenizer)
# if match_eval in [None, False]:
# print(d["answer_matching_behavior"])
# print("Match eval failed: ", match_eval)
print("REFUSE" if match_eval else "NOTREFUSE", d["answer_matching_behavior"])
unmatch_eval = eval_prompt(d["question"], d["answer_not_matching_behavior"], tokenizer)
# if unmatch_eval in [None, True]:
# print(d["answer_not_matching_behavior"])
# print("Unmatch eval failed: ", unmatch_eval)
print("REFUSE" if unmatch_eval else "NOTREFUSE", d["answer_not_matching_behavior"])
print()