From 2c3267076c9b9e953b8e78118f875278471fd610 Mon Sep 17 00:00:00 2001 From: Hao Li Date: Fri, 3 Jan 2025 18:08:23 -0500 Subject: [PATCH] fixed system prompt and default prompt --- bertopic/representation/_cohere.py | 6 ++-- bertopic/representation/_llamacpp.py | 54 ++++++++++++++++++---------- bertopic/representation/_openai.py | 2 +- bertopic/representation/_utils.py | 4 +-- 4 files changed, 41 insertions(+), 25 deletions(-) diff --git a/bertopic/representation/_cohere.py b/bertopic/representation/_cohere.py index ebb947be..f038015d 100644 --- a/bertopic/representation/_cohere.py +++ b/bertopic/representation/_cohere.py @@ -33,9 +33,9 @@ Sample texts from this topic: [DOCUMENTS] Keywords: [KEYWORDS] -Provide the topic name directly without any explanation.""" +Topic name:""" -DEFAULT_SYSTEM_PROMPT = "You are designated as an assistant that identify and extract high-level topics from texts." +DEFAULT_SYSTEM_PROMPT = "You are an assistant that extracts high-level topics from texts." class Cohere(BaseRepresentation): @@ -174,7 +174,7 @@ def extract_topics( max_tokens=50, stop_sequences=["\n"], ) - label = request.text.strip().replace("Topic name: ", "") + label = request.text.strip() updated_topics[topic] = [(label, 1)] + [("", 0) for _ in range(9)] return updated_topics diff --git a/bertopic/representation/_llamacpp.py b/bertopic/representation/_llamacpp.py index c161ba30..645d4d07 100644 --- a/bertopic/representation/_llamacpp.py +++ b/bertopic/representation/_llamacpp.py @@ -9,8 +9,8 @@ DEFAULT_PROMPT = """ This is a list of texts where each collection of texts describe a topic. After each collection of texts, the name of the topic they represent is mentioned as a short-highly-descriptive title - -Example 1: +--- +Topic: Sample texts from this topic: - Traditional diets in most cultures were primarily plant-based with a little meat on top, but with the rise of industrial style meat production and factory farming, meat has become a staple food. - Meat, but especially beef, is the word food in terms of emissions. @@ -18,8 +18,8 @@ Keywords: meat beef eat eating emissions steak food health processed chicken Topic name: Environmental impacts of eating meat - -Example 2: +--- +Topic: Sample texts from this topic: - I have ordered the product weeks ago but it still has not arrived! - The website mentions that it only takes a couple of days to deliver but I still have not received mine. @@ -29,13 +29,13 @@ Keywords: deliver weeks product shipping long delivery received arrived arrive week Topic name: Shipping and delivery issues --- -Extract the topic name from the following documents: +Topic: Sample texts from this topic: [DOCUMENTS] Keywords: [KEYWORDS] -Provide the extracted topic name directly without any explanation.""" +Topic name:""" -DEFAULT_SYSTEM_PROMPT = "You are designated as an assistant that identify and extract high-level topics from texts." +DEFAULT_SYSTEM_PROMPT = "You are an assistant that extracts high-level topics from texts." class LlamaCPP(BaseRepresentation): @@ -49,6 +49,8 @@ class LlamaCPP(BaseRepresentation): NOTE: Use `"[KEYWORDS]"` and `"[DOCUMENTS]"` in the prompt to decide where the keywords and documents need to be inserted. + system_prompt: The system prompt to be used in the model. If no system prompt is given, + `self.default_system_prompt_` is used instead. pipeline_kwargs: Kwargs that you can pass to the `llama_cpp.Llama` when it is called such as `max_tokens` to be generated. nr_docs: The number of documents to pass to OpenAI if a prompt @@ -118,11 +120,11 @@ def __init__( pipeline_kwargs: Mapping[str, Any] = {}, nr_docs: int = 4, diversity: float = None, - doc_length: int = 100, + doc_length: int = None, tokenizer: Union[str, Callable] = None, ): if isinstance(model, str): - self.model = Llama(model_path=model, n_gpu_layers=-1, stop="\n", chat_format="llama-2") + self.model = Llama(model_path=model, n_gpu_layers=-1, stop="\n", chat_format="ChatML") elif isinstance(model, Llama): self.model = model else: @@ -134,6 +136,7 @@ def __init__( self.prompt = prompt if prompt is not None else DEFAULT_PROMPT self.system_prompt = system_prompt if system_prompt is not None else DEFAULT_SYSTEM_PROMPT self.default_prompt_ = DEFAULT_PROMPT + self.default_system_prompt_ = DEFAULT_SYSTEM_PROMPT self.pipeline_kwargs = pipeline_kwargs self.nr_docs = nr_docs self.diversity = diversity @@ -183,21 +186,34 @@ def extract_topics( } ], ** self.pipeline_kwargs ) - label = topic_description["choices"][0]["message"]["content"].strip().replace("Topic name: ", "") + label = topic_description["choices"][0]["message"]["content"].strip() updated_topics[topic] = [(label, 1)] + [("", 0) for _ in range(9)] return updated_topics def _create_prompt(self, docs, topic, topics): - keywords = ", ".join(list(zip(*topics[topic]))[0]) + keywords = list(zip(*topics[topic]))[0] - prompt = self.prompt - if "[KEYWORDS]" in prompt: - prompt = prompt.replace("[KEYWORDS]", keywords) - if "[DOCUMENTS]" in prompt: - to_replace = "" - for doc in docs: - to_replace += f"- {doc}\n" - prompt = prompt.replace("[DOCUMENTS]", to_replace) + # Use the Default Chat Prompt + if self.prompt == DEFAULT_PROMPT: + prompt = self.prompt.replace("[KEYWORDS]", ", ".join(keywords)) + prompt = self._replace_documents(prompt, docs) + + # Use a custom prompt that leverages keywords, documents or both using + # custom tags, namely [KEYWORDS] and [DOCUMENTS] respectively + else: + prompt = self.prompt + if "[KEYWORDS]" in prompt: + prompt = prompt.replace("[KEYWORDS]", ", ".join(keywords)) + if "[DOCUMENTS]" in prompt: + prompt = self._replace_documents(prompt, docs) + + return prompt + @staticmethod + def _replace_documents(prompt, docs): + to_replace = "" + for doc in docs: + to_replace += f"- {doc}\n" + prompt = prompt.replace("[DOCUMENTS]", to_replace) return prompt diff --git a/bertopic/representation/_openai.py b/bertopic/representation/_openai.py index bd16c858..c081f1b7 100644 --- a/bertopic/representation/_openai.py +++ b/bertopic/representation/_openai.py @@ -48,7 +48,7 @@ topic: """ -DEFAULT_SYSTEM_PROMPT = "You are designated as an assistant that identify and extract high-level topics from texts." +DEFAULT_SYSTEM_PROMPT = "You are an assistant that extracts high-level topics from texts." class OpenAI(BaseRepresentation): diff --git a/bertopic/representation/_utils.py b/bertopic/representation/_utils.py index bd2cc566..2a99fd1f 100644 --- a/bertopic/representation/_utils.py +++ b/bertopic/representation/_utils.py @@ -44,8 +44,8 @@ def decode(self, doc_chunks): truncated_document: A truncated document """ if doc_length is not None: - if tokenizer == "char" or tokenizer is None: - truncated_document = document[:doc_length] + " (...)" + if tokenizer == "char": + truncated_document = document[:doc_length] elif tokenizer == "whitespace": truncated_document = " ".join(document.split()[:doc_length]) elif tokenizer == "vectorizer":