From 5cad563fc879d08a0fce7ecbeb1b9c427dfb42e3 Mon Sep 17 00:00:00 2001
From: Hao LI <8520588+Leo-LiHao@users.noreply.github.com>
Date: Sat, 11 Jan 2025 04:26:35 -0500
Subject: [PATCH] Added system prompts (#2145)

---
 bertopic/representation/_cohere.py   | 17 ++++--
 bertopic/representation/_llamacpp.py | 78 ++++++++++++++++++++--------
 bertopic/representation/_openai.py   | 13 ++++-
 3 files changed, 79 insertions(+), 29 deletions(-)

diff --git a/bertopic/representation/_cohere.py b/bertopic/representation/_cohere.py
index a0c74434..c1c14a58 100644
--- a/bertopic/representation/_cohere.py
+++ b/bertopic/representation/_cohere.py
@@ -35,6 +35,8 @@
 Keywords: [KEYWORDS]
 Topic name:"""
 
+DEFAULT_SYSTEM_PROMPT = "You are an assistant that extracts high-level topics from texts."
+
 
 class Cohere(BaseRepresentation):
     """Use the Cohere API to generate topic labels based on their
@@ -51,6 +53,8 @@ class Cohere(BaseRepresentation):
             NOTE: Use `"[KEYWORDS]"` and `"[DOCUMENTS]"` in the prompt
             to decide where the keywords and documents need to be
             inserted.
+        system_prompt: The system prompt to be used in the model. If no system prompt is given,
+            `self.default_system_prompt_` is used instead.
         delay_in_seconds: The delay in seconds between consecutive prompts
             in order to prevent RateLimitErrors.
         nr_docs: The number of documents to pass to Cohere if a prompt
@@ -107,8 +111,9 @@ class Cohere(BaseRepresentation):
     def __init__(
         self,
         client,
-        model: str = "xlarge",
+        model: str = "command-r",
         prompt: str = None,
+        system_prompt: str = None,
         delay_in_seconds: float = None,
         nr_docs: int = 4,
         diversity: float = None,
@@ -118,7 +123,9 @@ def __init__(
         self.client = client
         self.model = model
         self.prompt = prompt if prompt is not None else DEFAULT_PROMPT
+        self.system_prompt = system_prompt if system_prompt is not None else DEFAULT_SYSTEM_PROMPT
         self.default_prompt_ = DEFAULT_PROMPT
+        self.default_system_prompt_ = DEFAULT_SYSTEM_PROMPT
         self.delay_in_seconds = delay_in_seconds
         self.nr_docs = nr_docs
         self.diversity = diversity
@@ -162,14 +169,14 @@ def extract_topics(
             if self.delay_in_seconds:
                 time.sleep(self.delay_in_seconds)
 
-            request = self.client.generate(
+            request = self.client.chat(
                 model=self.model,
-                prompt=prompt,
+                preamble=self.system_prompt,
+                message=prompt,
                 max_tokens=50,
-                num_generations=1,
                 stop_sequences=["\n"],
             )
-            label = request.generations[0].text.strip()
+            label = request.text.strip()
             updated_topics[topic] = [(label, 1)] + [("", 0) for _ in range(9)]
 
         return updated_topics
diff --git a/bertopic/representation/_llamacpp.py b/bertopic/representation/_llamacpp.py
index 3fd3541b..59fdc844 100644
--- a/bertopic/representation/_llamacpp.py
+++ b/bertopic/representation/_llamacpp.py
@@ -8,13 +8,34 @@
 
 DEFAULT_PROMPT = """
-Q: I have a topic that contains the following documents:
+This is a list of texts where each collection of texts describes a topic. After each collection of texts, the name of the topic they represent is mentioned as a short, highly descriptive title.
+---
+Topic:
+Sample texts from this topic:
+- Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food.
+- Meat, but especially beef, is the worst food in terms of emissions.
+- Eating meat doesn't make you a bad person, not eating meat doesn't make you a good one.
+
+Keywords: meat beef eat eating emissions steak food health processed chicken
+Topic name: Environmental impacts of eating meat
+---
+Topic:
+Sample texts from this topic:
+- I have ordered the product weeks ago but it still has not arrived!
+- The website mentions that it only takes a couple of days to deliver but I still have not received mine.
+- I got a message stating that I received the monitor but that is not true!
+- It took a month longer to deliver than was advised...
+
+Keywords: deliver weeks product shipping long delivery received arrived arrive week
+Topic name: Shipping and delivery issues
+---
+Topic:
+Sample texts from this topic:
 [DOCUMENTS]
+Keywords: [KEYWORDS]
+Topic name:"""
-The topic is described by the following keywords: '[KEYWORDS]'.
-
-Based on the above information, can you give a short label of the topic?
-A: """
+
+DEFAULT_SYSTEM_PROMPT = "You are an assistant that extracts high-level topics from texts."
 
 
 class LlamaCPP(BaseRepresentation):
     """A llama.cpp implementation to use as a representation model.
@@ -28,6 +49,8 @@ class LlamaCPP(BaseRepresentation):
             NOTE: Use `"[KEYWORDS]"` and `"[DOCUMENTS]"` in the prompt
             to decide where the keywords and documents need to be
             inserted.
+        system_prompt: The system prompt to be used in the model. If no system prompt is given,
+            `self.default_system_prompt_` is used instead.
         pipeline_kwargs: Kwargs that you can pass to the `llama_cpp.Llama`
             when it is called such as `max_tokens` to be generated.
         nr_docs: The number of documents to pass to the LLM if a prompt
@@ -93,6 +116,7 @@ def __init__(
         self,
         model: Union[str, Llama],
         prompt: str = None,
+        system_prompt: str = None,
         pipeline_kwargs: Mapping[str, Any] = {},
         nr_docs: int = 4,
         diversity: float = None,
@@ -100,7 +124,7 @@
         tokenizer: Union[str, Callable] = None,
     ):
         if isinstance(model, str):
-            self.model = Llama(model_path=model, n_gpu_layers=-1, stop="Q:")
+            self.model = Llama(model_path=model, n_gpu_layers=-1, stop="\n", chat_format="chatml")
         elif isinstance(model, Llama):
             self.model = model
         else:
@@ -110,7 +134,9 @@
                 "local LLM or a `llama_cpp.Llama` object."
             )
         self.prompt = prompt if prompt is not None else DEFAULT_PROMPT
+        self.system_prompt = system_prompt if system_prompt is not None else DEFAULT_SYSTEM_PROMPT
         self.default_prompt_ = DEFAULT_PROMPT
+        self.default_system_prompt_ = DEFAULT_SYSTEM_PROMPT
         self.pipeline_kwargs = pipeline_kwargs
         self.nr_docs = nr_docs
         self.diversity = diversity
@@ -151,33 +177,39 @@ def extract_topics(
             self.prompts_.append(prompt)
 
             # Extract result from generator and use that as label
-            topic_description = self.model(prompt, **self.pipeline_kwargs)["choices"]
-            topic_description = [(description["text"].replace(prompt, ""), 1) for description in topic_description]
-
-            if len(topic_description) < 10:
-                topic_description += [("", 0) for _ in range(10 - len(topic_description))]
-
-            updated_topics[topic] = topic_description
+            # topic_description = self.model(prompt, **self.pipeline_kwargs)["choices"]
+            topic_description = self.model.create_chat_completion(
+                messages=[{"role": "system", "content": self.system_prompt}, {"role": "user", "content": prompt}],
+                **self.pipeline_kwargs,
+            )
+            label = topic_description["choices"][0]["message"]["content"].strip()
+            updated_topics[topic] = [(label, 1)] + [("", 0) for _ in range(9)]
 
         return updated_topics
 
     def _create_prompt(self, docs, topic, topics):
-        keywords = ", ".join(list(zip(*topics[topic]))[0])
+        keywords = list(zip(*topics[topic]))[0]
 
-        # Use the default prompt and replace keywords
+        # Use the Default Chat Prompt
         if self.prompt == DEFAULT_PROMPT:
-            prompt = self.prompt.replace("[KEYWORDS]", keywords)
+            prompt = self.prompt.replace("[KEYWORDS]", ", ".join(keywords))
+            prompt = self._replace_documents(prompt, docs)
 
-        # Use a prompt that leverages either keywords or documents in
-        # a custom location
+        # Use a custom prompt that leverages keywords, documents or both using
+        # custom tags, namely [KEYWORDS] and [DOCUMENTS] respectively
         else:
             prompt = self.prompt
             if "[KEYWORDS]" in prompt:
-                prompt = prompt.replace("[KEYWORDS]", keywords)
+                prompt = prompt.replace("[KEYWORDS]", ", ".join(keywords))
             if "[DOCUMENTS]" in prompt:
-                to_replace = ""
-                for doc in docs:
-                    to_replace += f"- {doc}\n"
-                prompt = prompt.replace("[DOCUMENTS]", to_replace)
+                prompt = self._replace_documents(prompt, docs)
+
+        return prompt
+
+    @staticmethod
+    def _replace_documents(prompt, docs):
+        to_replace = ""
+        for doc in docs:
+            to_replace += f"- {doc}\n"
+        prompt = prompt.replace("[DOCUMENTS]", to_replace)
         return prompt
diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py
index e05a9c66..3c567273 100644
--- a/bertopic/representation/_openai.py
+++ b/bertopic/representation/_openai.py
@@ -49,6 +49,8 @@
 topic: """
 
+DEFAULT_SYSTEM_PROMPT = "You are an assistant that extracts high-level topics from texts."
+
 
 class OpenAI(BaseRepresentation):
     r"""Using the OpenAI API to generate topic labels based
@@ -74,6 +76,8 @@ class OpenAI(BaseRepresentation):
             NOTE: Use `"[KEYWORDS]"` and `"[DOCUMENTS]"` in the prompt
             to decide where the keywords and documents need to be
             inserted.
+        system_prompt: The system prompt to be used in the model. If no system prompt is given,
+            `self.default_system_prompt_` is used instead.
         delay_in_seconds: The delay in seconds between consecutive prompts
             in order to prevent RateLimitErrors.
         exponential_backoff: Retry requests with a random exponential backoff.
@@ -145,6 +149,7 @@ def __init__(
         client,
         model: str = "text-embedding-3-small",
         prompt: str = None,
+        system_prompt: str = None,
         generator_kwargs: Mapping[str, Any] = {},
         delay_in_seconds: float = None,
         exponential_backoff: bool = False,
@@ -162,7 +167,13 @@ def __init__(
         else:
             self.prompt = prompt
 
+        if chat and system_prompt is None:
+            self.system_prompt = DEFAULT_SYSTEM_PROMPT
+        else:
+            self.system_prompt = system_prompt
+
         self.default_prompt_ = DEFAULT_CHAT_PROMPT if chat else DEFAULT_PROMPT
+        self.default_system_prompt_ = DEFAULT_SYSTEM_PROMPT
         self.delay_in_seconds = delay_in_seconds
         self.exponential_backoff = exponential_backoff
         self.chat = chat
@@ -219,7 +230,7 @@ def extract_topics(
 
         if self.chat:
             messages = [
-                {"role": "system", "content": "You are a helpful assistant."},
+                {"role": "system", "content": self.system_prompt},
                 {"role": "user", "content": prompt},
            ]
             kwargs = {
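
Usage note (not part of the patch): the snippet below is a minimal sketch of how the new
`system_prompt` argument introduced above might be used. The client setup, the API key
placeholder, and the model name are illustrative assumptions, not values taken from this
patch.

    import openai
    from bertopic import BERTopic
    from bertopic.representation import OpenAI

    # Hypothetical client setup; substitute your own key.
    client = openai.OpenAI(api_key="sk-...")

    # Override the default system prompt ("You are an assistant that extracts
    # high-level topics from texts.") with a domain-specific instruction.
    # Per the __init__ logic above, the default is only filled in when
    # chat=True and no system_prompt is given; with chat=False it stays None.
    representation_model = OpenAI(
        client,
        model="gpt-4o-mini",  # assumed chat-capable model, not the patch default
        chat=True,
        system_prompt="You are an assistant that labels customer-support tickets with short topic names.",
    )

    topic_model = BERTopic(representation_model=representation_model)

The same keyword applies to the other two classes touched here: `Cohere(client,
system_prompt=...)` forwards it to `client.chat` as the `preamble`, and
`LlamaCPP(model, system_prompt=...)` sends it as the `"system"` message of
`create_chat_completion`.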