Skip to content

Commit

Permalink
Added system prompts (#2145)
Browse files Browse the repository at this point in the history
  • Loading branch information
Leo-LiHao authored Jan 11, 2025
1 parent 641b3d0 commit 5cad563
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 29 deletions.
17 changes: 12 additions & 5 deletions bertopic/representation/_cohere.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@
Keywords: [KEYWORDS]
Topic name:"""

DEFAULT_SYSTEM_PROMPT = "You are an assistant that extracts high-level topics from texts."


class Cohere(BaseRepresentation):
"""Use the Cohere API to generate topic labels based on their
Expand All @@ -51,6 +53,8 @@ class Cohere(BaseRepresentation):
NOTE: Use `"[KEYWORDS]"` and `"[DOCUMENTS]"` in the prompt
to decide where the keywords and documents need to be
inserted.
system_prompt: The system prompt to be used in the model. If no system prompt is given,
`self.default_system_prompt_` is used instead.
delay_in_seconds: The delay in seconds between consecutive prompts
in order to prevent RateLimitErrors.
nr_docs: The number of documents to pass to Cohere if a prompt
Expand Down Expand Up @@ -107,8 +111,9 @@ class Cohere(BaseRepresentation):
def __init__(
self,
client,
model: str = "xlarge",
model: str = "command-r",
prompt: str = None,
system_prompt: str = None,
delay_in_seconds: float = None,
nr_docs: int = 4,
diversity: float = None,
Expand All @@ -118,7 +123,9 @@ def __init__(
self.client = client
self.model = model
self.prompt = prompt if prompt is not None else DEFAULT_PROMPT
self.system_prompt = system_prompt if system_prompt is not None else DEFAULT_SYSTEM_PROMPT
self.default_prompt_ = DEFAULT_PROMPT
self.default_system_prompt_ = DEFAULT_SYSTEM_PROMPT
self.delay_in_seconds = delay_in_seconds
self.nr_docs = nr_docs
self.diversity = diversity
Expand Down Expand Up @@ -162,14 +169,14 @@ def extract_topics(
if self.delay_in_seconds:
time.sleep(self.delay_in_seconds)

request = self.client.generate(
request = self.client.chat(
model=self.model,
prompt=prompt,
preamble=self.system_prompt,
message=prompt,
max_tokens=50,
num_generations=1,
stop_sequences=["\n"],
)
label = request.generations[0].text.strip()
label = request.text.strip()
updated_topics[topic] = [(label, 1)] + [("", 0) for _ in range(9)]

return updated_topics
Expand Down
78 changes: 55 additions & 23 deletions bertopic/representation/_llamacpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,34 @@


DEFAULT_PROMPT = """
Q: I have a topic that contains the following documents:
This is a list of texts where each collection of texts describe a topic. After each collection of texts, the name of the topic they represent is mentioned as a short-highly-descriptive title
---
Topic:
Sample texts from this topic:
- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
- Meat, but especially beef, is the word food in terms of emissions.
- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.
Keywords: meat beef eat eating emissions steak food health processed chicken
Topic name: Environmental impacts of eating meat
---
Topic:
Sample texts from this topic:
- I have ordered the product weeks ago but it still has not arrived!
- The website mentions that it only takes a couple of days to deliver but I still have not received mine.
- I got a message stating that I received the monitor but that is not true!
- It took a month longer to deliver than was advised...
Keywords: deliver weeks product shipping long delivery received arrived arrive week
Topic name: Shipping and delivery issues
---
Topic:
Sample texts from this topic:
[DOCUMENTS]
Keywords: [KEYWORDS]
Topic name:"""

The topic is described by the following keywords: '[KEYWORDS]'.
Based on the above information, can you give a short label of the topic?
A: """
DEFAULT_SYSTEM_PROMPT = "You are an assistant that extracts high-level topics from texts."


class LlamaCPP(BaseRepresentation):
Expand All @@ -28,6 +49,8 @@ class LlamaCPP(BaseRepresentation):
NOTE: Use `"[KEYWORDS]"` and `"[DOCUMENTS]"` in the prompt
to decide where the keywords and documents need to be
inserted.
system_prompt: The system prompt to be used in the model. If no system prompt is given,
`self.default_system_prompt_` is used instead.
pipeline_kwargs: Kwargs that you can pass to the `llama_cpp.Llama`
when it is called such as `max_tokens` to be generated.
nr_docs: The number of documents to pass to the LLM if a prompt
Expand Down Expand Up @@ -93,14 +116,15 @@ def __init__(
self,
model: Union[str, Llama],
prompt: str = None,
system_prompt: str = None,
pipeline_kwargs: Mapping[str, Any] = {},
nr_docs: int = 4,
diversity: float = None,
doc_length: int = None,
tokenizer: Union[str, Callable] = None,
):
if isinstance(model, str):
self.model = Llama(model_path=model, n_gpu_layers=-1, stop="Q:")
self.model = Llama(model_path=model, n_gpu_layers=-1, stop="\n", chat_format="ChatML")
elif isinstance(model, Llama):
self.model = model
else:
Expand All @@ -110,7 +134,9 @@ def __init__(
"local LLM or a ` llama_cpp.Llama` object."
)
self.prompt = prompt if prompt is not None else DEFAULT_PROMPT
self.system_prompt = system_prompt if system_prompt is not None else DEFAULT_SYSTEM_PROMPT
self.default_prompt_ = DEFAULT_PROMPT
self.default_system_prompt_ = DEFAULT_SYSTEM_PROMPT
self.pipeline_kwargs = pipeline_kwargs
self.nr_docs = nr_docs
self.diversity = diversity
Expand Down Expand Up @@ -151,33 +177,39 @@ def extract_topics(
self.prompts_.append(prompt)

# Extract result from generator and use that as label
topic_description = self.model(prompt, **self.pipeline_kwargs)["choices"]
topic_description = [(description["text"].replace(prompt, ""), 1) for description in topic_description]

if len(topic_description) < 10:
topic_description += [("", 0) for _ in range(10 - len(topic_description))]

updated_topics[topic] = topic_description
# topic_description = self.model(prompt, **self.pipeline_kwargs)["choices"]
topic_description = self.model.create_chat_completion(
messages=[{"role": "system", "content": self.system_prompt}, {"role": "user", "content": prompt}],
**self.pipeline_kwargs,
)
label = topic_description["choices"][0]["message"]["content"].strip()
updated_topics[topic] = [(label, 1)] + [("", 0) for _ in range(9)]

return updated_topics

def _create_prompt(self, docs, topic, topics):
keywords = ", ".join(list(zip(*topics[topic]))[0])
keywords = list(zip(*topics[topic]))[0]

# Use the default prompt and replace keywords
# Use the Default Chat Prompt
if self.prompt == DEFAULT_PROMPT:
prompt = self.prompt.replace("[KEYWORDS]", keywords)
prompt = self.prompt.replace("[KEYWORDS]", ", ".join(keywords))
prompt = self._replace_documents(prompt, docs)

# Use a prompt that leverages either keywords or documents in
# a custom location
# Use a custom prompt that leverages keywords, documents or both using
# custom tags, namely [KEYWORDS] and [DOCUMENTS] respectively
else:
prompt = self.prompt
if "[KEYWORDS]" in prompt:
prompt = prompt.replace("[KEYWORDS]", keywords)
prompt = prompt.replace("[KEYWORDS]", ", ".join(keywords))
if "[DOCUMENTS]" in prompt:
to_replace = ""
for doc in docs:
to_replace += f"- {doc}\n"
prompt = prompt.replace("[DOCUMENTS]", to_replace)
prompt = self._replace_documents(prompt, docs)

return prompt

@staticmethod
def _replace_documents(prompt, docs):
to_replace = ""
for doc in docs:
to_replace += f"- {doc}\n"
prompt = prompt.replace("[DOCUMENTS]", to_replace)
return prompt
13 changes: 12 additions & 1 deletion bertopic/representation/_openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@
topic: <topic label>
"""

DEFAULT_SYSTEM_PROMPT = "You are an assistant that extracts high-level topics from texts."


class OpenAI(BaseRepresentation):
r"""Using the OpenAI API to generate topic labels based
Expand All @@ -74,6 +76,8 @@ class OpenAI(BaseRepresentation):
NOTE: Use `"[KEYWORDS]"` and `"[DOCUMENTS]"` in the prompt
to decide where the keywords and documents need to be
inserted.
system_prompt: The system prompt to be used in the model. If no system prompt is given,
`self.default_system_prompt_` is used instead.
delay_in_seconds: The delay in seconds between consecutive prompts
in order to prevent RateLimitErrors.
exponential_backoff: Retry requests with a random exponential backoff.
Expand Down Expand Up @@ -145,6 +149,7 @@ def __init__(
client,
model: str = "text-embedding-3-small",
prompt: str = None,
system_prompt: str = None,
generator_kwargs: Mapping[str, Any] = {},
delay_in_seconds: float = None,
exponential_backoff: bool = False,
Expand All @@ -162,7 +167,13 @@ def __init__(
else:
self.prompt = prompt

if chat and system_prompt is None:
self.system_prompt = DEFAULT_SYSTEM_PROMPT
else:
self.system_prompt = system_prompt

self.default_prompt_ = DEFAULT_CHAT_PROMPT if chat else DEFAULT_PROMPT
self.default_system_prompt_ = DEFAULT_SYSTEM_PROMPT
self.delay_in_seconds = delay_in_seconds
self.exponential_backoff = exponential_backoff
self.chat = chat
Expand Down Expand Up @@ -219,7 +230,7 @@ def extract_topics(

if self.chat:
messages = [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "system", "content": self.system_prompt},
{"role": "user", "content": prompt},
]
kwargs = {
Expand Down

0 comments on commit 5cad563

Please sign in to comment.