Refusal Leaderboard Revamp #3636

Open · wants to merge 23 commits into base `main`

Changes from 19 commits
10 changes: 9 additions & 1 deletion fastchat/serve/monitor/classify/README.md
@@ -24,7 +24,15 @@ Your label_bench directory should follow the structure:

 ## How to evaluate your category classifier?

-To test your new classifier for a new category, you would have to make sure you created the category child class in `category.py`. Then, to generate classification labels, make the necessary edits in `config.yaml` and run
+To test your new classifier for a new category, first make sure you have created the category child class in `category.py`.
+
+We currently support classifiers that are accessed through the OpenAI API, as well as Hugging Face classifiers.
+
+If you are using an OpenAI API-based classifier, create a class that inherits from `CategoryAPI`. You will need to add a `name_tag` attribute for your classifier and implement the `pre_process` and `post_process` functions; both are documented in `category.py` under the `CategoryAPI` class.
+
+If you are using a Hugging Face-based classifier, create a class that inherits from `CategoryHF`. You will need to add a `name_tag` attribute for your classifier and implement the `pre_process` and `post_process` functions; both are documented in `category.py` under the `CategoryHF` class.
+
+Then, to generate classification labels, make the necessary edits in `config.yaml` and run
 ```console
 python label.py --config config.yaml --testing
 ```
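As a rough illustration of the pattern the README describes (the class name, `name_tag`, system prompt, and output key below are hypothetical, not part of this PR; only the `CategoryAPI` interface is taken from `category.py`), a minimal API-based category might look like:

```python
# Hypothetical sketch of a new OpenAI-API-based category classifier.
from category import CategoryAPI


class CategoryMyTopic(CategoryAPI):
    def __init__(self):
        super().__init__()
        self.name_tag = "my_topic_v0.1"  # illustrative name
        self.sys_prompt = "Reply 'yes' if the prompt is about my topic, otherwise 'no'."

    def pre_process(self, row):
        # Build an OpenAI-style conversation for the single battle in `row`.
        conv = [
            {"role": "system", "content": self.sys_prompt},
            {"role": "user", "content": row["prompt"].iloc[0]},
        ]
        return conv, row["uid"].iloc[0]

    def post_process(self, judgment, uid):
        # Map the raw LLM judgment to a per-battle label dict, keyed by UID.
        raw_output = {uid: judgment}
        output = {uid: {"my_topic": "yes" in judgment.lower()}}
        return output, raw_output
```

You would presumably also add a branch for the new `name_tag` in `create_category` in `category.py`, so that `label.py` can construct the classifier from the names listed in `config.yaml`.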
239 changes: 204 additions & 35 deletions fastchat/serve/monitor/classify/category.py
@@ -8,32 +8,133 @@
 # - if_v0.1
 #     - if
 #     - score
 # - creative_writing_v0.1
 #     - creative_writing
 #     - score
+# - refusal_v0.2
+#     - refusal

 import ast
 import re
+import numpy as np
+from collections import defaultdict

 from utils import HuggingFaceClassifier, chat_completion_openai


+def create_category(name):
+    if name == "criteria_v0.1":
+        return CategoryHardPrompt()
+    elif name == "if_v0.1":
+        return CategoryIF()
+    elif name == "math_v0.1":
+        return CategoryMath()
+    elif name == "creative_writing_v0.1":
+        return CategoryCreativeWriting()
+    elif name == "refusal_v0.2":
+        return CategoryRefusalHF()
+
+    raise Exception(f"Category name is incorrect: {name}")

-class Category:
-    @staticmethod
-    def create_category(name):
-        if name == "criteria_v0.1":
-            return CategoryHardPrompt()
-        elif name == "if_v0.1":
-            return CategoryIF()
-        elif name == "math_v0.1":
-            return CategoryMath()
-        elif name == "creative_writing_v0.1":
-            return CategoryCreativeWriting()
-
-        raise Exception(f"Category name is incorrect: {name}")
-
-    def post_process(self):
-        pass

+class CategoryAPI:
+    def __init__(self):
+        self.batch_size = 1
+        self.is_parallel = True
+
+    def get_answer(self, batch, model_name, max_tokens, temperature, api_dict):
+        assert len(batch) == 1, "API-based categories must have batch size of 1"
+
+        conv, uid = self.pre_process(batch)
+        output = chat_completion_openai(
+            model=model_name,
+            messages=conv,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            api_dict=api_dict,
+        )
+        return self.post_process(output, uid)
+
+    def pre_process(self, row):
+        """
+        Prepares a text to be labeled by an LLM through the OpenAI API.
+
+        Inheriting category classifier classes should implement this method.
+
+        Args:
+            row (pd.DataFrame): row representing a single battle to be labeled
+
+        Returns:
+            conv (List[dict]): processed text with system prompt in OpenAI API format:
+                [
+                    {"role": "system", "content": <system prompt>},
+                    {"role": "user", "content": <user input>},
+                    ...
+                ]
+            uid (str): UID of the battle to be labeled
+        """
+        pass
+
+    def post_process(self, judgment, uid):
+        """
+        Processes judgments/outputs of the LLM labeler to retrieve final labels.
+
+        Inheriting category classifier classes should implement this method.
+
+        Args:
+            judgment (str): text output of the LLM labeler
+            uid (str): UID of the battle to be labeled
+
+        Returns:
+            output (Dict[str, Dict[str, str]]): key is the battle UID, value is the output associated with that battle (usually a dictionary)
+            raw_output (Dict[str, str]): key is the battle UID, value is the unprocessed LLM output
+        """
+        pass


-class CategoryHardPrompt(Category):
+class CategoryHF:
+    def __init__(self):
+        self.batch_size = 1
+        self.is_parallel = False
+
+    def get_answer(self, batch, model_name, max_tokens, temperature, api_dict):
+        to_label, to_label_uids = self.pre_process(batch)
+        labels = self.classifier.classify_batch(to_label)
+
+        return self.post_process(labels, to_label_uids)
+
+    def pre_process(self, batch):
+        """
+        Prepares a batch of texts to be labeled by a Hugging Face classifier.
+
+        Inheriting category classifier classes should implement this method.
+
+        Args:
+            batch (pd.DataFrame): each row of the DataFrame represents one battle
+
+        Returns:
+            outputs (List[str]): texts to be labeled by the HF classifier
+            to_label_uids (List[str]): battle UIDs corresponding to each text to be labeled
+        """
+        pass
+
+    def post_process(self, labels, to_label_uids):
+        """
+        Processes raw HF labels.
+
+        Inheriting category classifier classes should implement this method.
+
+        Args:
+            labels (List[bool]): labels directly from the HF classifier
+            to_label_uids (List[str]): battle UIDs corresponding to each string that was labeled
+
+        Returns:
+            output (Dict[str, Dict[str, str]]): keys are battle UIDs, values are the outputs associated with that battle (usually a dictionary)
+            raw_output (Dict[str, str]): keys are battle UIDs, values are the unprocessed HF model output or None
+        """
+        pass
+
+
+class CategoryHardPrompt(CategoryAPI):
     def __init__(self):
         super().__init__()
         self.name_tag = "criteria_v0.1"
@@ -63,17 +164,21 @@ def get_score(self, judgment):
         else:
             return []

-    def pre_process(self, prompt):
+    def pre_process(self, row):
+        prompt = row["prompt"].iloc[0]
         conv = [{"role": "system", "content": self.sys_prompt}]
         conv.append({"role": "user", "content": prompt})
-        return conv
+        return conv, row["uid"].iloc[0]

-    def post_process(self, judgment):
+    def post_process(self, judgment, uid):
+        raw_output = {uid: judgment}
+
         criteria = self.get_score(judgment=judgment)
-        return {name: bool(i in criteria) for i, name in self.tags.items()}
+        output = {uid: {name: bool(i in criteria) for i, name in self.tags.items()}}
+        return output, raw_output


-class CategoryIF(Category):
+class CategoryIF(CategoryAPI):
     def __init__(self):
         super().__init__()
         self.name_tag = "if_v0.1"
@@ -91,23 +196,29 @@ def get_score(self, judgment):
         else:
             return None

-    def pre_process(self, prompt):
+    def pre_process(self, row):
+        prompt = row["prompt"].iloc[0]
         args = {"PROMPT": prompt}
         conv = [
             {"role": "system", "content": self.system_prompt},
             {"role": "user", "content": self.prompt_template.format(**args)},
         ]
-        return conv
+        return conv, row["uid"].iloc[0]

-    def post_process(self, judgment):
+    def post_process(self, judgment, uid):
+        raw_output = {uid: judgment}
+
         score = self.get_score(judgment=judgment)
-        return {
-            "if": bool(score >= 4) if score else False,
-            "score": score,
-        }
+        output = {
+            uid: {
+                "if": bool(score >= 4) if score else False,
+                "score": score,
+            }
+        }
+        return output, raw_output


-class CategoryMath(Category):
+class CategoryMath(CategoryAPI):
     def __init__(self):
         super().__init__()
         self.name_tag = "math_v0.1"
@@ -125,20 +236,25 @@ def get_score(self, judgment):
         else:
             return None

-    def pre_process(self, prompt):
+    def pre_process(self, row):
+        prompt = row["prompt"].iloc[0]
         args = {"PROMPT": prompt}
         conv = [
             {"role": "system", "content": self.system_prompt},
             {"role": "user", "content": self.prompt_template.format(**args)},
         ]
-        return conv
+        return conv, row["uid"].iloc[0]

-    def post_process(self, judgment):
+    def post_process(self, judgment, uid):
+        raw_output = {uid: judgment}
+
         score = self.get_score(judgment=judgment)
-        return {"math": bool(score == "yes") if score else False}
+        output = {uid: {"math": bool(score == "yes") if score else False}}
+
+        return output, raw_output


-class CategoryCreativeWriting(Category):
+class CategoryCreativeWriting(CategoryAPI):
     def __init__(self):
         super().__init__()
         self.name_tag = "creative_writing_v0.1"
@@ -162,15 +278,68 @@ def get_score(self, judgment):
         else:
             return None

-    def pre_process(self, prompt):
+    def pre_process(self, row):
+        prompt = row["prompt"].iloc[0]
         args = {"PROMPT": prompt}
         conv = [
             {"role": "system", "content": self.system_prompt},
             {"role": "user", "content": self.prompt_template.format(**args)},
         ]
-        return conv
+        return conv, row["uid"].iloc[0]

-    def post_process(self, judgment):
+    def post_process(self, judgment, uid):
+        raw_output = {uid: judgment}
+
         score = self.get_score(judgment=judgment)
         bool_score = bool(score == "yes") if score else False
-        return {"creative_writing": bool_score, "score": score}
+        output = {uid: {"creative_writing": bool_score, "score": score}}
+
+        return output, raw_output
+
+
+class CategoryRefusalHF(CategoryHF):
+    def __init__(self):
+        super().__init__()
+        self.name_tag = "refusal_v0.2"
+        self.prompt_template = "Here is the user query:\n<user_query>\n{QUERY}\n</user_query>\n\nHere is the LLM response to the user:\n<llm_response>\n{RESPONSE}\n</llm_response>"
+        self.classifier = HuggingFaceClassifier(
+            model_path="lmarena-ai/RefusalClassifier"
+        )
+
+    def conv_pre_process_helper(self, conversation):
+        conv = []
+        for i in range(0, len(conversation), 2):
+            args = {
+                "QUERY": conversation[i]["content"],
+                "RESPONSE": conversation[i + 1]["content"],
+            }
+            conv.append(self.prompt_template.format(**args))
+        return conv
+
+    def pre_process(self, batch):
+        to_label = []
+        to_label_uids = []
+
+        for _, row in batch.iterrows():
+            if "conversation_a" in row.index:
+                conv_a = self.conv_pre_process_helper(row["conversation_a"])
+                to_label.extend(conv_a)
+                to_label_uids.extend([row["uid"]] * len(conv_a))
+
+            if "conversation_b" in row.index:
+                conv_b = self.conv_pre_process_helper(row["conversation_b"])
+                to_label.extend(conv_b)
+                to_label_uids.extend([row["uid"]] * len(conv_b))
+
+        return to_label, to_label_uids
+
+    def post_process(self, labels, to_label_uids):
+        outputs = defaultdict(lambda: {"refusal": False})
+        query_refusals = np.where(labels)[0]
+
+        for i in query_refusals:
+            outputs[to_label_uids[i]] = {"refusal": True}
+
+        return outputs, defaultdict(
+            lambda: None
+        )  # No raw/testing outputs for HF classifier
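To make the new flow concrete, here is a hedged usage sketch of the refusal classifier above. The battle data is fabricated for illustration, and the code assumes battles carry `uid`, `conversation_a`, and `conversation_b` columns, matching what `CategoryRefusalHF.pre_process` reads; note that constructing the class downloads `lmarena-ai/RefusalClassifier` from the Hugging Face Hub.

```python
# Sketch: label one made-up battle with the refusal classifier.
import pandas as pd

from category import create_category

battle = pd.DataFrame(
    [
        {
            "uid": "battle-001",
            "conversation_a": [
                {"role": "user", "content": "How do I pick a lock?"},
                {"role": "assistant", "content": "Sorry, I can't help with that."},
            ],
            "conversation_b": [
                {"role": "user", "content": "How do I pick a lock?"},
                {"role": "assistant", "content": "Locks are typically picked by..."},
            ],
        }
    ]
)

refusal = create_category("refusal_v0.2")
# The API-related arguments are unused by HF-based categories.
outputs, raw_outputs = refusal.get_answer(
    battle, model_name=None, max_tokens=None, temperature=None, api_dict=None
)
print(outputs["battle-001"])  # e.g. {'refusal': True} if either response is flagged
```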
5 changes: 3 additions & 2 deletions fastchat/serve/monitor/classify/config.yaml
@@ -11,16 +11,17 @@ task_name:
 - if_v0.1
 - math_v0.1
 - creative_writing_v0.1
+- refusal_v0.2

 model_name: null
 name: llama-3-70b-instruct
 endpoints:
   - api_base: null
     api_key: null
-parallel: 50
+parallel: 64
 temperature: 0.0
 max_token: 512

 max_retry: 2
 retry_sleep: 10
-error_output: $ERROR$
\ No newline at end of file
+error_output: $ERROR$
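As a quick sanity check (a hypothetical helper, not part of the PR), the `task_name` list in this config can be cross-checked against `create_category`:

```python
# Confirm every task_name in config.yaml resolves to a classifier class.
import yaml

from category import create_category

with open("config.yaml") as f:
    config = yaml.safe_load(f)

for task in config["task_name"]:
    category = create_category(task)  # raises for unknown names
    print(f"{task} -> {type(category).__name__} (is_parallel={category.is_parallel})")
```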
4 changes: 3 additions & 1 deletion fastchat/serve/monitor/classify/display_score.py
@@ -1,6 +1,7 @@
 import pandas as pd
 import argparse
 import os
+from pathlib import Path
 from glob import glob
 from sklearn.metrics import recall_score, precision_score

@@ -9,6 +10,7 @@
"math_bench": ("math_v0.1", "math"),
"hard_bench": ("criteria_v0.1", "hard"),
"creative_writing_bench": ("creative_writing_v0.1", "creative_writing"),
"refusal_bench": ("refusal_v0.2", "refusal"),
}


@@ -39,7 +41,7 @@
     recall = recall_score(y_pred=test.pred, y_true=test.label)
     precision = precision_score(y_pred=test.pred, y_true=test.label)

-    print(f"Model: {output.model[0]}")
+    print(f"Classifier: {Path(file).stem}")
     print(f"Accuracy: {round(accuracy, 3)}")
     print(f"Precision: {round(precision, 3)}")
     print(f"Recall: {round(recall, 3)}")