Refusal Leaderboard Revamp #3636

Open · wants to merge 23 commits into base `main`

Changes from 19 commits
10 changes: 9 additions & 1 deletion fastchat/serve/monitor/classify/README.md
@@ -24,7 +24,15 @@ Your label_bench directory should follow the structure:

 ## How to evaluate your category classifier?

-To test your new classifier for a new category, you would have to make sure you created the category child class in `category.py`. Then, to generate classification labels, make the necessary edits in `config.yaml` and run
+To test your new classifier for a new category, first make sure you have created the category child class in `category.py`.
+
+We currently support classifiers that are accessed through the OpenAI API, as well as Hugging Face classifiers.
+
+If you are using an OpenAI API-based classifier, create a class that inherits from `CategoryAPI`. You will need to add a `name_tag` attribute for your classifier and implement the `pre_process` and `post_process` functions; both are documented in `category.py` under the `CategoryAPI` class.
+
+If you are using a Hugging Face-based classifier, create a class that inherits from `CategoryHF`. You will need to add a `name_tag` attribute for your classifier and implement the `pre_process` and `post_process` functions; both are documented in `category.py` under the `CategoryHF` class.
+
+Then, to generate classification labels, make the necessary edits in `config.yaml` and run
 ```console
 python label.py --config config.yaml --testing
 ```
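As a rough illustration of the pattern the README describes (the class name, `name_tag`, system prompt, and output key below are hypothetical, not part of this PR; only the `CategoryAPI` interface is taken from `category.py`), a minimal API-based category might look like:

```python
# Hypothetical sketch of a new OpenAI-API-based category classifier.
from category import CategoryAPI


class CategoryMyTopic(CategoryAPI):
    def __init__(self):
        super().__init__()
        self.name_tag = "my_topic_v0.1"  # illustrative name
        self.sys_prompt = "Reply 'yes' if the prompt is about my topic, otherwise 'no'."

    def pre_process(self, row):
        # Build an OpenAI-style conversation for the single battle in `row`.
        conv = [
            {"role": "system", "content": self.sys_prompt},
            {"role": "user", "content": row["prompt"].iloc[0]},
        ]
        return conv, row["uid"].iloc[0]

    def post_process(self, judgment, uid):
        # Map the raw LLM judgment to a per-battle label dict, keyed by UID.
        raw_output = {uid: judgment}
        output = {uid: {"my_topic": "yes" in judgment.lower()}}
        return output, raw_output
```

You would presumably also add a branch for the new `name_tag` in `create_category` in `category.py`, so that `label.py` can construct the classifier from the names listed in `config.yaml`.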
239 changes: 204 additions & 35 deletions fastchat/serve/monitor/classify/category.py
@@ -8,32 +8,133 @@
 # - if_v0.1
 #     - if
 #     - score
 # - creative_writing_v0.1
 #     - creative_writing
 #     - score
+# - refusal_v0.2
+#     - refusal

 import ast
 import re
+import numpy as np
+from collections import defaultdict

 from utils import HuggingFaceClassifier, chat_completion_openai


+def create_category(name):
+    if name == "criteria_v0.1":
+        return CategoryHardPrompt()
+    elif name == "if_v0.1":
+        return CategoryIF()
+    elif name == "math_v0.1":
+        return CategoryMath()
+    elif name == "creative_writing_v0.1":
+        return CategoryCreativeWriting()
+    elif name == "refusal_v0.2":
+        return CategoryRefusalHF()
+
+    raise Exception(f"Category name is incorrect: {name}")

-class Category:
-    @staticmethod
-    def create_category(name):
-        if name == "criteria_v0.1":
-            return CategoryHardPrompt()
-        elif name == "if_v0.1":
-            return CategoryIF()
-        elif name == "math_v0.1":
-            return CategoryMath()
-        elif name == "creative_writing_v0.1":
-            return CategoryCreativeWriting()
-
-        raise Exception(f"Category name is incorrect: {name}")
-
-    def post_process(self):
-        pass

+class CategoryAPI:
+    def __init__(self):
+        self.batch_size = 1
+        self.is_parallel = True
+
+    def get_answer(self, batch, model_name, max_tokens, temperature, api_dict):
+        assert len(batch) == 1, "API-based categories must have batch size of 1"
+
+        conv, uid = self.pre_process(batch)
+        output = chat_completion_openai(
+            model=model_name,
+            messages=conv,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            api_dict=api_dict,
+        )
+        return self.post_process(output, uid)
+
+    def pre_process(self, row):
+        """
+        Prepares a text to be labeled by an LLM through the OpenAI API.
+
+        Inheriting category classifier classes should implement this method.
+
+        Args:
+            row (pd.DataFrame): row representing a single battle to be labeled
+
+        Returns:
+            conv (List[dict]): processed text with system prompt in OpenAI API format:
+                [
+                    {"role": "system", "content": <system prompt>},
+                    {"role": "user", "content": <user input>},
+                    ...
+                ]
+            uid (str): UID of the battle to be labeled
+        """
+        pass
+
+    def post_process(self, judgment, uid):
+        """
+        Processes judgments/outputs of the LLM labeler to retrieve final labels.
+
+        Inheriting category classifier classes should implement this method.
+
+        Args:
+            judgment (str): text output of the LLM labeler
+            uid (str): UID of the battle to be labeled
+
+        Returns:
+            output (Dict[str, Dict[str, str]]): key is the battle UID, value is the output associated with that battle (usually a dictionary)
+            raw_output (Dict[str, str]): key is the battle UID, value is the unprocessed LLM output
+        """
+        pass


-class CategoryHardPrompt(Category):
+class CategoryHF:
+    def __init__(self):
+        self.batch_size = 1
+        self.is_parallel = False
+
+    def get_answer(self, batch, model_name, max_tokens, temperature, api_dict):
+        to_label, to_label_uids = self.pre_process(batch)
+        labels = self.classifier.classify_batch(to_label)
+
+        return self.post_process(labels, to_label_uids)
+
+    def pre_process(self, batch):
+        """
+        Prepares a batch of texts to be labeled by a Hugging Face classifier.
+
+        Inheriting category classifier classes should implement this method.
+
+        Args:
+            batch (pd.DataFrame): each row of the DataFrame represents one battle
+
+        Returns:
+            outputs (List[str]): texts to be labeled by the HF classifier
+            to_label_uids (List[str]): battle UIDs corresponding to each text to be labeled
+        """
+        pass
+
+    def post_process(self, labels, to_label_uids):
+        """
+        Processes raw HF labels.
+
+        Inheriting category classifier classes should implement this method.
+
+        Args:
+            labels (List[bool]): labels directly from the HF classifier
+            to_label_uids (List[str]): battle UIDs corresponding to each string that was labeled
+
+        Returns:
+            output (Dict[str, Dict[str, str]]): keys are battle UIDs, values are the outputs associated with that battle (usually a dictionary)
+            raw_output (Dict[str, str]): keys are battle UIDs, values are the unprocessed HF model output or None
+        """
+        pass
+
+
+class CategoryHardPrompt(CategoryAPI):
     def __init__(self):
         super().__init__()
         self.name_tag = "criteria_v0.1"
@@ -63,17 +164,21 @@ def get_score(self, judgment):
         else:
             return []

-    def pre_process(self, prompt):
+    def pre_process(self, row):
+        prompt = row["prompt"].iloc[0]
         conv = [{"role": "system", "content": self.sys_prompt}]
         conv.append({"role": "user", "content": prompt})
-        return conv
+        return conv, row["uid"].iloc[0]

-    def post_process(self, judgment):
+    def post_process(self, judgment, uid):
+        raw_output = {uid: judgment}
+
         criteria = self.get_score(judgment=judgment)
-        return {name: bool(i in criteria) for i, name in self.tags.items()}
+        output = {uid: {name: bool(i in criteria) for i, name in self.tags.items()}}
+        return output, raw_output


-class CategoryIF(Category):
+class CategoryIF(CategoryAPI):
     def __init__(self):
         super().__init__()
         self.name_tag = "if_v0.1"
@@ -91,23 +196,29 @@ def get_score(self, judgment):
         else:
             return None

-    def pre_process(self, prompt):
+    def pre_process(self, row):
+        prompt = row["prompt"].iloc[0]
         args = {"PROMPT": prompt}
         conv = [
             {"role": "system", "content": self.system_prompt},
             {"role": "user", "content": self.prompt_template.format(**args)},
         ]
-        return conv
+        return conv, row["uid"].iloc[0]

-    def post_process(self, judgment):
+    def post_process(self, judgment, uid):
+        raw_output = {uid: judgment}
+
         score = self.get_score(judgment=judgment)
-        return {
-            "if": bool(score >= 4) if score else False,
-            "score": score,
-        }
+        output = {
+            uid: {
+                "if": bool(score >= 4) if score else False,
+                "score": score,
+            }
+        }
+        return output, raw_output


-class CategoryMath(Category):
+class CategoryMath(CategoryAPI):
     def __init__(self):
         super().__init__()
         self.name_tag = "math_v0.1"
@@ -125,20 +236,25 @@ def get_score(self, judgment):
         else:
             return None

-    def pre_process(self, prompt):
+    def pre_process(self, row):
+        prompt = row["prompt"].iloc[0]
         args = {"PROMPT": prompt}
         conv = [
             {"role": "system", "content": self.system_prompt},
             {"role": "user", "content": self.prompt_template.format(**args)},
         ]
-        return conv
+        return conv, row["uid"].iloc[0]

-    def post_process(self, judgment):
+    def post_process(self, judgment, uid):
+        raw_output = {uid: judgment}
+
         score = self.get_score(judgment=judgment)
-        return {"math": bool(score == "yes") if score else False}
+        output = {uid: {"math": bool(score == "yes") if score else False}}
+
+        return output, raw_output


-class CategoryCreativeWriting(Category):
+class CategoryCreativeWriting(CategoryAPI):
     def __init__(self):
         super().__init__()
         self.name_tag = "creative_writing_v0.1"
@@ -162,15 +278,68 @@ def get_score(self, judgment):
         else:
             return None

-    def pre_process(self, prompt):
+    def pre_process(self, row):
+        prompt = row["prompt"].iloc[0]
         args = {"PROMPT": prompt}
         conv = [
             {"role": "system", "content": self.system_prompt},
             {"role": "user", "content": self.prompt_template.format(**args)},
         ]
-        return conv
+        return conv, row["uid"].iloc[0]

-    def post_process(self, judgment):
+    def post_process(self, judgment, uid):
+        raw_output = {uid: judgment}
+
         score = self.get_score(judgment=judgment)
         bool_score = bool(score == "yes") if score else False
-        return {"creative_writing": bool_score, "score": score}
+        output = {uid: {"creative_writing": bool_score, "score": score}}
+
+        return output, raw_output
+
+
+class CategoryRefusalHF(CategoryHF):
+    def __init__(self):
+        super().__init__()
+        self.name_tag = "refusal_v0.2"
+        self.prompt_template = "Here is the user query:\n<user_query>\n{QUERY}\n</user_query>\n\nHere is the LLM response to the user:\n<llm_response>\n{RESPONSE}\n</llm_response>"
+        self.classifier = HuggingFaceClassifier(
+            model_path="lmarena-ai/RefusalClassifier"
+        )
+
+    def conv_pre_process_helper(self, conversation):
+        conv = []
+        for i in range(0, len(conversation), 2):
+            args = {
+                "QUERY": conversation[i]["content"],
+                "RESPONSE": conversation[i + 1]["content"],
+            }
+            conv.append(self.prompt_template.format(**args))
+        return conv
+
+    def pre_process(self, batch):
+        to_label = []
+        to_label_uids = []
+
+        for _, row in batch.iterrows():
+            if "conversation_a" in row.index:
+                conv_a = self.conv_pre_process_helper(row["conversation_a"])
+                to_label.extend(conv_a)
+                to_label_uids.extend([row["uid"]] * len(conv_a))
+
+            if "conversation_b" in row.index:
+                conv_b = self.conv_pre_process_helper(row["conversation_b"])
+                to_label.extend(conv_b)
+                to_label_uids.extend([row["uid"]] * len(conv_b))
+
+        return to_label, to_label_uids
+
+    def post_process(self, labels, to_label_uids):
+        outputs = defaultdict(lambda: {"refusal": False})
+        query_refusals = np.where(labels)[0]
+
+        for i in query_refusals:
+            outputs[to_label_uids[i]] = {"refusal": True}
+
+        return outputs, defaultdict(
+            lambda: None
+        )  # No raw/testing outputs for HF classifier
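To make the new flow concrete, here is a hedged usage sketch of the refusal classifier above. The battle data is fabricated for illustration, and the code assumes battles carry `uid`, `conversation_a`, and `conversation_b` columns, matching what `CategoryRefusalHF.pre_process` reads; note that constructing the class downloads `lmarena-ai/RefusalClassifier` from the Hugging Face Hub.

```python
# Sketch: label one made-up battle with the refusal classifier.
import pandas as pd

from category import create_category

battle = pd.DataFrame(
    [
        {
            "uid": "battle-001",
            "conversation_a": [
                {"role": "user", "content": "How do I pick a lock?"},
                {"role": "assistant", "content": "Sorry, I can't help with that."},
            ],
            "conversation_b": [
                {"role": "user", "content": "How do I pick a lock?"},
                {"role": "assistant", "content": "Locks are typically picked by..."},
            ],
        }
    ]
)

refusal = create_category("refusal_v0.2")
# The API-related arguments are unused by HF-based categories.
outputs, raw_outputs = refusal.get_answer(
    battle, model_name=None, max_tokens=None, temperature=None, api_dict=None
)
print(outputs["battle-001"])  # e.g. {'refusal': True} if either response is flagged
```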
5 changes: 3 additions & 2 deletions fastchat/serve/monitor/classify/config.yaml
@@ -11,16 +11,17 @@ task_name:
 - if_v0.1
 - math_v0.1
 - creative_writing_v0.1
+- refusal_v0.2

 model_name: null
 name: llama-3-70b-instruct
 endpoints:
   - api_base: null
     api_key: null
-parallel: 50
+parallel: 64
 temperature: 0.0
 max_token: 512

 max_retry: 2
 retry_sleep: 10
-error_output: $ERROR$
\ No newline at end of file
+error_output: $ERROR$
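As a quick sanity check (a hypothetical helper, not part of the PR), the `task_name` list in this config can be cross-checked against `create_category`:

```python
# Confirm every task_name in config.yaml resolves to a classifier class.
import yaml

from category import create_category

with open("config.yaml") as f:
    config = yaml.safe_load(f)

for task in config["task_name"]:
    category = create_category(task)  # raises for unknown names
    print(f"{task} -> {type(category).__name__} (is_parallel={category.is_parallel})")
```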
4 changes: 3 additions & 1 deletion fastchat/serve/monitor/classify/display_score.py
@@ -1,6 +1,7 @@
 import pandas as pd
 import argparse
 import os
+from pathlib import Path
 from glob import glob
 from sklearn.metrics import recall_score, precision_score

@@ -9,6 +10,7 @@
"math_bench": ("math_v0.1", "math"),
"hard_bench": ("criteria_v0.1", "hard"),
"creative_writing_bench": ("creative_writing_v0.1", "creative_writing"),
"refusal_bench": ("refusal_v0.2", "refusal"),
}


@@ -39,7 +41,7 @@
     recall = recall_score(y_pred=test.pred, y_true=test.label)
     precision = precision_score(y_pred=test.pred, y_true=test.label)

-    print(f"Model: {output.model[0]}")
+    print(f"Classifier: {Path(file).stem}")
     print(f"Accuracy: {round(accuracy, 3)}")
     print(f"Precision: {round(precision, 3)}")
     print(f"Recall: {round(recall, 3)}")