From 5cad563fc879d08a0fce7ecbeb1b9c427dfb42e3 Mon Sep 17 00:00:00 2001
From: Hao LI <8520588+Leo-LiHao@users.noreply.github.com>
Date: Sat, 11 Jan 2025 04:26:35 -0500
Subject: [PATCH] Added system prompts (#2145)

---
 bertopic/representation/_cohere.py   | 17 ++++--
 bertopic/representation/_llamacpp.py | 78 ++++++++++++++++++++--------
 bertopic/representation/_openai.py   | 13 ++++-
 3 files changed, 79 insertions(+), 29 deletions(-)

diff --git a/bertopic/representation/_cohere.py b/bertopic/representation/_cohere.py
index a0c74434..c1c14a58 100644
--- a/bertopic/representation/_cohere.py
+++ b/bertopic/representation/_cohere.py
@@ -35,6 +35,8 @@
 Keywords: [KEYWORDS]
 Topic name:"""
 
+DEFAULT_SYSTEM_PROMPT = "You are an assistant that extracts high-level topics from texts."
+
 
 class Cohere(BaseRepresentation):
     """Use the Cohere API to generate topic labels based on their
@@ -51,6 +53,8 @@ class Cohere(BaseRepresentation):
             NOTE: Use `"[KEYWORDS]"` and `"[DOCUMENTS]"` in the prompt
             to decide where the keywords and documents need to be
             inserted.
+        system_prompt: The system prompt to be used in the model. If no system prompt is given,
+            `self.default_system_prompt_` is used instead.
         delay_in_seconds: The delay in seconds between consecutive prompts
             in order to prevent RateLimitErrors.
         nr_docs: The number of documents to pass to Cohere if a prompt
@@ -107,8 +111,9 @@ class Cohere(BaseRepresentation):
     def __init__(
         self,
         client,
-        model: str = "xlarge",
+        model: str = "command-r",
         prompt: str = None,
+        system_prompt: str = None,
         delay_in_seconds: float = None,
         nr_docs: int = 4,
         diversity: float = None,
@@ -118,7 +123,9 @@ def __init__(
         self.client = client
         self.model = model
         self.prompt = prompt if prompt is not None else DEFAULT_PROMPT
+        self.system_prompt = system_prompt if system_prompt is not None else DEFAULT_SYSTEM_PROMPT
         self.default_prompt_ = DEFAULT_PROMPT
+        self.default_system_prompt_ = DEFAULT_SYSTEM_PROMPT
         self.delay_in_seconds = delay_in_seconds
         self.nr_docs = nr_docs
         self.diversity = diversity
@@ -162,14 +169,14 @@ def extract_topics(
             if self.delay_in_seconds:
                 time.sleep(self.delay_in_seconds)
 
-            request = self.client.generate(
+            request = self.client.chat(
                 model=self.model,
-                prompt=prompt,
+                preamble=self.system_prompt,
+                message=prompt,
                 max_tokens=50,
-                num_generations=1,
                 stop_sequences=["\n"],
             )
-            label = request.generations[0].text.strip()
+            label = request.text.strip()
             updated_topics[topic] = [(label, 1)] + [("", 0) for _ in range(9)]
 
         return updated_topics
diff --git a/bertopic/representation/_llamacpp.py b/bertopic/representation/_llamacpp.py
index 3fd3541b..59fdc844 100644
--- a/bertopic/representation/_llamacpp.py
+++ b/bertopic/representation/_llamacpp.py
@@ -8,13 +8,34 @@
 
 DEFAULT_PROMPT = """
-Q: I have a topic that contains the following documents:
+This is a list of texts where each collection of texts describes a topic. After each collection of texts, the name of the topic they represent is mentioned as a short, highly descriptive title.
+---
+Topic:
+Sample texts from this topic:
+- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
+- Meat, but especially beef, is the worst food in terms of emissions.
+- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.
+
+Keywords: meat beef eat eating emissions steak food health processed chicken
+Topic name: Environmental impacts of eating meat
+---
+Topic:
+Sample texts from this topic:
+- I have ordered the product weeks ago but it still has not arrived!
+- The website mentions that it only takes a couple of days to deliver but I still have not received mine.
+- I got a message stating that I received the monitor but that is not true!
+- It took a month longer to deliver than was advised...
+
+Keywords: deliver weeks product shipping long delivery received arrived arrive week
+Topic name: Shipping and delivery issues
+---
+Topic:
+Sample texts from this topic:
 [DOCUMENTS]
+Keywords: [KEYWORDS]
+Topic name:"""
-The topic is described by the following keywords: '[KEYWORDS]'.
-
-Based on the above information, can you give a short label of the topic?
-A: """
+
+DEFAULT_SYSTEM_PROMPT = "You are an assistant that extracts high-level topics from texts."
 
 
 class LlamaCPP(BaseRepresentation):
     """A llama.cpp implementation to use as a representation model.
@@ -28,6 +49,8 @@ class LlamaCPP(BaseRepresentation):
             NOTE: Use `"[KEYWORDS]"` and `"[DOCUMENTS]"` in the prompt
             to decide where the keywords and documents need to be
             inserted.
+        system_prompt: The system prompt to be used in the model. If no system prompt is given,
+            `self.default_system_prompt_` is used instead.
         pipeline_kwargs: Kwargs that you can pass to the `llama_cpp.Llama`
             when it is called such as `max_tokens` to be generated.
         nr_docs: The number of documents to pass to the LLM if a prompt
@@ -93,6 +116,7 @@ def __init__(
         self,
         model: Union[str, Llama],
         prompt: str = None,
+        system_prompt: str = None,
         pipeline_kwargs: Mapping[str, Any] = {},
         nr_docs: int = 4,
         diversity: float = None,
@@ -100,7 +124,7 @@
         tokenizer: Union[str, Callable] = None,
     ):
         if isinstance(model, str):
-            self.model = Llama(model_path=model, n_gpu_layers=-1, stop="Q:")
+            self.model = Llama(model_path=model, n_gpu_layers=-1, stop="\n", chat_format="chatml")
         elif isinstance(model, Llama):
             self.model = model
         else:
@@ -110,7 +134,9 @@
                 "local LLM or a `llama_cpp.Llama` object."
             )
         self.prompt = prompt if prompt is not None else DEFAULT_PROMPT
+        self.system_prompt = system_prompt if system_prompt is not None else DEFAULT_SYSTEM_PROMPT
         self.default_prompt_ = DEFAULT_PROMPT
+        self.default_system_prompt_ = DEFAULT_SYSTEM_PROMPT
         self.pipeline_kwargs = pipeline_kwargs
         self.nr_docs = nr_docs
         self.diversity = diversity
@@ -151,33 +177,39 @@ def extract_topics(
             self.prompts_.append(prompt)
 
             # Extract result from generator and use that as label
-            topic_description = self.model(prompt, **self.pipeline_kwargs)["choices"]
-            topic_description = [(description["text"].replace(prompt, ""), 1) for description in topic_description]
-
-            if len(topic_description) < 10:
-                topic_description += [("", 0) for _ in range(10 - len(topic_description))]
-
-            updated_topics[topic] = topic_description
+            # topic_description = self.model(prompt, **self.pipeline_kwargs)["choices"]
+            topic_description = self.model.create_chat_completion(
+                messages=[{"role": "system", "content": self.system_prompt}, {"role": "user", "content": prompt}],
+                **self.pipeline_kwargs,
+            )
+            label = topic_description["choices"][0]["message"]["content"].strip()
+            updated_topics[topic] = [(label, 1)] + [("", 0) for _ in range(9)]
 
         return updated_topics
 
     def _create_prompt(self, docs, topic, topics):
-        keywords = ", ".join(list(zip(*topics[topic]))[0])
+        keywords = list(zip(*topics[topic]))[0]
 
-        # Use the default prompt and replace keywords
+        # Use the Default Chat Prompt
         if self.prompt == DEFAULT_PROMPT:
-            prompt = self.prompt.replace("[KEYWORDS]", keywords)
+            prompt = self.prompt.replace("[KEYWORDS]", ", ".join(keywords))
+            prompt = self._replace_documents(prompt, docs)
 
-        # Use a prompt that leverages either keywords or documents in
-        # a custom location
+        # Use a custom prompt that leverages keywords, documents or both using
+        # custom tags, namely [KEYWORDS] and [DOCUMENTS] respectively
         else:
             prompt = self.prompt
             if "[KEYWORDS]" in prompt:
-                prompt = prompt.replace("[KEYWORDS]", keywords)
+                prompt = prompt.replace("[KEYWORDS]", ", ".join(keywords))
             if "[DOCUMENTS]" in prompt:
-                to_replace = ""
-                for doc in docs:
-                    to_replace += f"- {doc}\n"
-                prompt = prompt.replace("[DOCUMENTS]", to_replace)
+                prompt = self._replace_documents(prompt, docs)
+
+        return prompt
+
+    @staticmethod
+    def _replace_documents(prompt, docs):
+        to_replace = ""
+        for doc in docs:
+            to_replace += f"- {doc}\n"
+        prompt = prompt.replace("[DOCUMENTS]", to_replace)
         return prompt
diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py
index e05a9c66..3c567273 100644
--- a/bertopic/representation/_openai.py
+++ b/bertopic/representation/_openai.py
@@ -49,6 +49,8 @@
 topic: """
 
+DEFAULT_SYSTEM_PROMPT = "You are an assistant that extracts high-level topics from texts."
+
 
 class OpenAI(BaseRepresentation):
     r"""Using the OpenAI API to generate topic labels based
@@ -74,6 +76,8 @@ class OpenAI(BaseRepresentation):
             NOTE: Use `"[KEYWORDS]"` and `"[DOCUMENTS]"` in the prompt
             to decide where the keywords and documents need to be
             inserted.
+        system_prompt: The system prompt to be used in the model. If no system prompt is given,
+            `self.default_system_prompt_` is used instead.
         delay_in_seconds: The delay in seconds between consecutive prompts
             in order to prevent RateLimitErrors.
         exponential_backoff: Retry requests with a random exponential backoff.
@@ -145,6 +149,7 @@ def __init__(
         client,
         model: str = "text-embedding-3-small",
         prompt: str = None,
+        system_prompt: str = None,
         generator_kwargs: Mapping[str, Any] = {},
         delay_in_seconds: float = None,
         exponential_backoff: bool = False,
@@ -162,7 +167,13 @@ def __init__(
         else:
             self.prompt = prompt
 
+        if chat and system_prompt is None:
+            self.system_prompt = DEFAULT_SYSTEM_PROMPT
+        else:
+            self.system_prompt = system_prompt
+
         self.default_prompt_ = DEFAULT_CHAT_PROMPT if chat else DEFAULT_PROMPT
+        self.default_system_prompt_ = DEFAULT_SYSTEM_PROMPT
         self.delay_in_seconds = delay_in_seconds
         self.exponential_backoff = exponential_backoff
         self.chat = chat
@@ -219,7 +230,7 @@ def extract_topics(
 
         if self.chat:
             messages = [
-                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "system", "content": self.system_prompt},
                 {"role": "user", "content": prompt},
            ]
             kwargs = {
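
Usage note (not part of the patch): the snippet below is a minimal sketch of how the new
`system_prompt` argument introduced above might be used. The client setup, the API key
placeholder, and the model name are illustrative assumptions, not values taken from this
patch.

    import openai
    from bertopic import BERTopic
    from bertopic.representation import OpenAI

    # Hypothetical client setup; substitute your own key.
    client = openai.OpenAI(api_key="sk-...")

    # Override the default system prompt ("You are an assistant that extracts
    # high-level topics from texts.") with a domain-specific instruction.
    # Per the __init__ logic above, the default is only filled in when
    # chat=True and no system_prompt is given; with chat=False it stays None.
    representation_model = OpenAI(
        client,
        model="gpt-4o-mini",  # assumed chat-capable model, not the patch default
        chat=True,
        system_prompt="You are an assistant that labels customer-support tickets with short topic names.",
    )

    topic_model = BERTopic(representation_model=representation_model)

The same keyword applies to the other two classes touched here: `Cohere(client,
system_prompt=...)` forwards it to `client.chat` as the `preamble`, and
`LlamaCPP(model, system_prompt=...)` sends it as the `"system"` message of
`create_chat_completion`.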