From 2c3267076c9b9e953b8e78118f875278471fd610 Mon Sep 17 00:00:00 2001
From: Hao Li
Date: Fri, 3 Jan 2025 18:08:23 -0500
Subject: [PATCH] fixed system prompt and default prompt
---
bertopic/representation/_cohere.py | 6 ++--
bertopic/representation/_llamacpp.py | 54 ++++++++++++++++++----------
bertopic/representation/_openai.py | 2 +-
bertopic/representation/_utils.py | 4 +--
4 files changed, 41 insertions(+), 25 deletions(-)
diff --git a/bertopic/representation/_cohere.py b/bertopic/representation/_cohere.py
index ebb947be..f038015d 100644
--- a/bertopic/representation/_cohere.py
+++ b/bertopic/representation/_cohere.py
@@ -33,9 +33,9 @@
Sample texts from this topic:
[DOCUMENTS]
Keywords: [KEYWORDS]
-Provide the topic name directly without any explanation."""
+Topic name:"""
-DEFAULT_SYSTEM_PROMPT = "You are designated as an assistant that identify and extract high-level topics from texts."
+DEFAULT_SYSTEM_PROMPT = "You are an assistant that extracts high-level topics from texts."
class Cohere(BaseRepresentation):
@@ -174,7 +174,7 @@ def extract_topics(
max_tokens=50,
stop_sequences=["\n"],
)
- label = request.text.strip().replace("Topic name: ", "")
+ label = request.text.strip()
updated_topics[topic] = [(label, 1)] + [("", 0) for _ in range(9)]
return updated_topics
diff --git a/bertopic/representation/_llamacpp.py b/bertopic/representation/_llamacpp.py
index c161ba30..645d4d07 100644
--- a/bertopic/representation/_llamacpp.py
+++ b/bertopic/representation/_llamacpp.py
@@ -9,8 +9,8 @@
DEFAULT_PROMPT = """
This is a list of texts where each collection of texts describe a topic. After each collection of texts, the name of the topic they represent is mentioned as a short-highly-descriptive title
-
-Example 1:
+---
+Topic:
Sample texts from this topic:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the word food in terms of emissions.
@@ -18,8 +18,8 @@
Keywords: meat beef eat eating emissions steak food health processed chicken
Topic name: Environmental impacts of eating meat
-
-Example 2:
+---
+Topic:
Sample texts from this topic:
- I have ordered the product weeks ago but it still has not arrived!
- The website mentions that it only takes a couple of days to deliver but I still have not received mine.
@@ -29,13 +29,13 @@
Keywords: deliver weeks product shipping long delivery received arrived arrive week
Topic name: Shipping and delivery issues
---
-Extract the topic name from the following documents:
+Topic:
Sample texts from this topic:
[DOCUMENTS]
Keywords: [KEYWORDS]
-Provide the extracted topic name directly without any explanation."""
+Topic name:"""
-DEFAULT_SYSTEM_PROMPT = "You are designated as an assistant that identify and extract high-level topics from texts."
+DEFAULT_SYSTEM_PROMPT = "You are an assistant that extracts high-level topics from texts."
class LlamaCPP(BaseRepresentation):
@@ -49,6 +49,8 @@ class LlamaCPP(BaseRepresentation):
NOTE: Use `"[KEYWORDS]"` and `"[DOCUMENTS]"` in the prompt
to decide where the keywords and documents need to be
inserted.
+ system_prompt: The system prompt to be used in the model. If no system prompt is given,
+ `self.default_system_prompt_` is used instead.
pipeline_kwargs: Kwargs that you can pass to the `llama_cpp.Llama`
when it is called such as `max_tokens` to be generated.
nr_docs: The number of documents to pass to OpenAI if a prompt
@@ -118,11 +120,11 @@ def __init__(
pipeline_kwargs: Mapping[str, Any] = {},
nr_docs: int = 4,
diversity: float = None,
- doc_length: int = 100,
+ doc_length: int = None,
tokenizer: Union[str, Callable] = None,
):
if isinstance(model, str):
- self.model = Llama(model_path=model, n_gpu_layers=-1, stop="\n", chat_format="llama-2")
+ self.model = Llama(model_path=model, n_gpu_layers=-1, stop="\n", chat_format="chatml")
elif isinstance(model, Llama):
self.model = model
else:
@@ -134,6 +136,7 @@ def __init__(
self.prompt = prompt if prompt is not None else DEFAULT_PROMPT
self.system_prompt = system_prompt if system_prompt is not None else DEFAULT_SYSTEM_PROMPT
self.default_prompt_ = DEFAULT_PROMPT
+ self.default_system_prompt_ = DEFAULT_SYSTEM_PROMPT
self.pipeline_kwargs = pipeline_kwargs
self.nr_docs = nr_docs
self.diversity = diversity
@@ -183,21 +186,34 @@ def extract_topics(
}
], ** self.pipeline_kwargs
)
- label = topic_description["choices"][0]["message"]["content"].strip().replace("Topic name: ", "")
+ label = topic_description["choices"][0]["message"]["content"].strip()
updated_topics[topic] = [(label, 1)] + [("", 0) for _ in range(9)]
return updated_topics
def _create_prompt(self, docs, topic, topics):
- keywords = ", ".join(list(zip(*topics[topic]))[0])
+ keywords = list(zip(*topics[topic]))[0]
- prompt = self.prompt
- if "[KEYWORDS]" in prompt:
- prompt = prompt.replace("[KEYWORDS]", keywords)
- if "[DOCUMENTS]" in prompt:
- to_replace = ""
- for doc in docs:
- to_replace += f"- {doc}\n"
- prompt = prompt.replace("[DOCUMENTS]", to_replace)
+ # Use the Default Chat Prompt
+ if self.prompt == DEFAULT_PROMPT:
+ prompt = self.prompt.replace("[KEYWORDS]", ", ".join(keywords))
+ prompt = self._replace_documents(prompt, docs)
+
+ # Use a custom prompt that leverages keywords, documents or both using
+ # custom tags, namely [KEYWORDS] and [DOCUMENTS] respectively
+ else:
+ prompt = self.prompt
+ if "[KEYWORDS]" in prompt:
+ prompt = prompt.replace("[KEYWORDS]", ", ".join(keywords))
+ if "[DOCUMENTS]" in prompt:
+ prompt = self._replace_documents(prompt, docs)
+
+ return prompt
+ @staticmethod
+ def _replace_documents(prompt, docs):
+ to_replace = ""
+ for doc in docs:
+ to_replace += f"- {doc}\n"
+ prompt = prompt.replace("[DOCUMENTS]", to_replace)
return prompt
diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py
index bd16c858..c081f1b7 100644
--- a/bertopic/representation/_openai.py
+++ b/bertopic/representation/_openai.py
@@ -48,7 +48,7 @@
topic:
"""
-DEFAULT_SYSTEM_PROMPT = "You are designated as an assistant that identify and extract high-level topics from texts."
+DEFAULT_SYSTEM_PROMPT = "You are an assistant that extracts high-level topics from texts."
class OpenAI(BaseRepresentation):
diff --git a/bertopic/representation/_utils.py b/bertopic/representation/_utils.py
index bd2cc566..2a99fd1f 100644
--- a/bertopic/representation/_utils.py
+++ b/bertopic/representation/_utils.py
@@ -44,8 +44,8 @@ def decode(self, doc_chunks):
truncated_document: A truncated document
"""
if doc_length is not None:
- if tokenizer == "char" or tokenizer is None:
- truncated_document = document[:doc_length] + " (...)"
+ if tokenizer == "char":
+ truncated_document = document[:doc_length]
elif tokenizer == "whitespace":
truncated_document = " ".join(document.split()[:doc_length])
elif tokenizer == "vectorizer":