Hierarchical topics calculated with topic embeddings #1894

Merged · 9 commits · Jun 8, 2024
64 changes: 41 additions & 23 deletions bertopic/_bertopic.py
@@ -54,7 +54,7 @@
from bertopic.cluster._utils import hdbscan_delegator, is_supported_hdbscan
from bertopic._utils import (
MyLogger, check_documents_type, check_embeddings_shape,
check_is_fitted, validate_distance_matrix
check_is_fitted, validate_distance_matrix, select_topic_representation
)
import bertopic._save_utils as save_utils

@@ -913,12 +913,13 @@ def topics_per_class(self,

def hierarchical_topics(self,
docs: List[str],
use_ctfidf: bool = True,
linkage_function: Callable[[csr_matrix], np.ndarray] = None,
distance_function: Callable[[csr_matrix], csr_matrix] = None) -> pd.DataFrame:
""" Create a hierarchy of topics

To create this hierarchy, BERTopic needs to be already fitted once.
Then, a hierarchy is calculated on the distance matrix of the c-TF-IDF
Then, a hierarchy is calculated on the distance matrix of the c-TF-IDF or topic embeddings
representation using `scipy.cluster.hierarchy.linkage`.

Based on that hierarchy, we calculate the topic representation at each
@@ -928,12 +929,14 @@ def hierarchical_topics(self,

Arguments:
docs: The documents you used when calling either `fit` or `fit_transform`
use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, semantic
embeddings are used.
linkage_function: The linkage function to use. Default is:
`lambda x: sch.linkage(x, 'ward', optimal_ordering=True)`
distance_function: The distance function to use on the c-TF-IDF matrix. Default is:
`lambda x: 1 - cosine_similarity(x)`.
You can pass any function that returns either a square matrix of
shape (n_samples, n_samples) with zeros on the diagonal and
non-negative values or condensed distance matrix of shape
(n_samples * (n_samples - 1) / 2,) containing the upper
triangular of the distance matrix.
@@ -972,7 +975,7 @@ def hierarchical_topics(self,
linkage_function = lambda x: sch.linkage(x, 'ward', optimal_ordering=True)

# Calculate distance
embeddings = self.c_tf_idf_[self._outliers:]
embeddings = select_topic_representation(self.c_tf_idf_, self.topic_embeddings_, use_ctfidf)[0][self._outliers:]
X = distance_function(embeddings)
X = validate_distance_matrix(X, embeddings.shape[0])
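For context, a minimal usage sketch of the changed API. It assumes `topic_model` is an already fitted BERTopic instance and `docs` are the documents it was fitted on; both names are placeholders, not part of this diff.

```python
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist

# Build the topic hierarchy from the semantic topic embeddings instead of c-TF-IDF
hierarchical_topics = topic_model.hierarchical_topics(docs, use_ctfidf=False)

# Custom functions are still supported; a condensed cosine distance matrix is also accepted
hierarchical_topics = topic_model.hierarchical_topics(
    docs,
    distance_function=lambda x: pdist(x, metric="cosine"),          # shape (n*(n-1)/2,)
    linkage_function=lambda x: sch.linkage(x, "average", optimal_ordering=True),
)
```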

@@ -2004,13 +2007,15 @@ def merge_topics(self,
def reduce_topics(self,
docs: List[str],
nr_topics: Union[int, str] = 20,
images: List[str] = None) -> None:
images: List[str] = None,
use_ctfidf: bool = False,
) -> None:
""" Reduce the number of topics to a fixed number of topics
or automatically.

If nr_topics is an integer, then the number of topics is reduced
to nr_topics using `AgglomerativeClustering` on the cosine distance matrix
of the topic embeddings.
of the topic c-TF-IDF or semantic embeddings.

If nr_topics is `"auto"`, then HDBSCAN is used to automatically
reduce the number of topics by running it on the topic embeddings.
@@ -2022,6 +2027,8 @@ def reduce_topics(self,
nr_topics: The number of topics you want reduced to
images: A list of paths to the images used when calling either
`fit` or `fit_transform`
use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, semantic
embeddings are used.

Updates:
topics_ : Assigns topics to their merged representations.
@@ -2050,7 +2057,7 @@ def reduce_topics(self,
documents = pd.DataFrame({"Document": docs, "Topic": self.topics_, "Image": images, "ID": range(len(docs))})

# Reduce number of topics
documents = self._reduce_topics(documents)
documents = self._reduce_topics(documents, use_ctfidf)
self._merged_topics = None
self._save_representative_docs(documents)
self.probabilities_ = self._map_probabilities(self.probabilities_)
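A hedged usage sketch of the new `use_ctfidf` flag on `reduce_topics`, again assuming a previously fitted model; `topic_model` and `docs` are placeholder names.

```python
# Reduce to a fixed number of topics using c-TF-IDF distances between topics
topic_model.reduce_topics(docs, nr_topics=20, use_ctfidf=True)

# Or let HDBSCAN decide automatically, using the semantic topic embeddings
# (use_ctfidf defaults to False for this method)
topic_model.reduce_topics(docs, nr_topics="auto", use_ctfidf=False)
```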
@@ -2821,6 +2828,7 @@ def visualize_hierarchy(self,
orientation: str = "left",
topics: List[int] = None,
top_n_topics: int = None,
use_ctfidf: bool = True,
custom_labels: bool = False,
title: str = "<b>Hierarchical Clustering</b>",
width: int = 1000,
@@ -2833,14 +2841,16 @@ def visualize_hierarchy(self,

A ward linkage function is used to perform the
hierarchical clustering based on the cosine distance
matrix between topic embeddings.
matrix between c-TF-IDF or semantic embeddings of the topics.

Arguments:
topic_model: A fitted BERTopic instance.
orientation: The orientation of the figure.
Either 'left' or 'bottom'
topics: A selection of topics to visualize
top_n_topics: Only select the top n most frequent topics
use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, semantic
embeddings are used.
custom_labels: Whether to use custom topic labels that were defined using
`topic_model.set_topic_labels`.
NOTE: Custom labels are only generated for the original
@@ -2901,6 +2911,7 @@ def visualize_hierarchy(self,
orientation=orientation,
topics=topics,
top_n_topics=top_n_topics,
use_ctfidf=use_ctfidf,
custom_labels=custom_labels,
title=title,
width=width,
@@ -2915,20 +2926,23 @@ def visualize_heatmap(self,
topics: List[int] = None,
top_n_topics: int = None,
n_clusters: int = None,
use_ctfidf: bool = False,
custom_labels: bool = False,
title: str = "<b>Similarity Matrix</b>",
width: int = 800,
height: int = 800) -> go.Figure:
""" Visualize a heatmap of the topic's similarity matrix

Based on the cosine similarity matrix between topic embeddings,
Based on the cosine similarity matrix between c-TF-IDFs or semantic embeddings of the topics,
a heatmap is created showing the similarity between topics.

Arguments:
topics: A selection of topics to visualize.
top_n_topics: Only select the top n most frequent topics.
n_clusters: Create n clusters and order the similarity
matrix by those clusters.
use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, semantic
embeddings are used.
custom_labels: Whether to use custom topic labels that were defined using
`topic_model.set_topic_labels`.
title: Title of the plot.
@@ -2959,6 +2973,7 @@ def visualize_heatmap(self,
topics=topics,
top_n_topics=top_n_topics,
n_clusters=n_clusters,
use_ctfidf=use_ctfidf,
custom_labels=custom_labels,
title=title,
width=width,
@@ -4084,45 +4099,49 @@ def _extract_words_per_topic(self,

return topics

def _reduce_topics(self, documents: pd.DataFrame) -> pd.DataFrame:
def _reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False) -> pd.DataFrame:
""" Reduce topics to self.nr_topics

Arguments:
documents: Dataframe with documents and their corresponding IDs and Topics
use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, semantic
embeddings are used.

Returns:
documents: Updated dataframe with documents and the reduced number of Topics
"""

logger.info("Topic reduction - Reducing number of topics")
initial_nr_topics = len(self.get_topics())

if isinstance(self.nr_topics, int):
if self.nr_topics < initial_nr_topics:
documents = self._reduce_to_n_topics(documents)
documents = self._reduce_to_n_topics(documents, use_ctfidf)
elif isinstance(self.nr_topics, str):
documents = self._auto_reduce_topics(documents)
documents = self._auto_reduce_topics(documents, use_ctfidf)
else:
raise ValueError("nr_topics needs to be an int or 'auto'! ")

logger.info(f"Topic reduction - Reduced number of topics from {initial_nr_topics} to {len(self.get_topic_freq())}")
return documents

def _reduce_to_n_topics(self, documents: pd.DataFrame) -> pd.DataFrame:
def _reduce_to_n_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False) -> pd.DataFrame:
""" Reduce topics to self.nr_topics

Arguments:
documents: Dataframe with documents and their corresponding IDs and Topics
use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, semantic
embeddings are used.

Returns:
documents: Updated dataframe with documents and the reduced number of Topics
"""
topics = documents.Topic.tolist().copy()

# Create topic distance matrix
if self.topic_embeddings_ is not None:
topic_embeddings = self.topic_embeddings_[self._outliers:, ]
else:
topic_embeddings = self.c_tf_idf_[self._outliers:, ].toarray()
topic_embeddings = select_topic_representation(
self.c_tf_idf_, self.topic_embeddings_, use_ctfidf
)[0][self._outliers:]
distance_matrix = 1-cosine_similarity(topic_embeddings)
np.fill_diagonal(distance_matrix, 0)
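The reduction step above boils down to turning the selected topic representation into a cosine distance matrix and clustering it. A minimal sketch of that idea follows; the `AgglomerativeClustering` parameters and shapes are illustrative assumptions, not the library's exact call, and `metric="precomputed"` assumes scikit-learn 1.2 or newer.

```python
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity

topic_embeddings = np.random.rand(8, 384)           # stand-in for the selected representation
distance_matrix = 1 - cosine_similarity(topic_embeddings)
np.fill_diagonal(distance_matrix, 0)                 # guard against floating-point noise

# Cluster topics on the precomputed distances (illustrative parameters)
labels = AgglomerativeClustering(
    n_clusters=4, metric="precomputed", linkage="average"
).fit_predict(distance_matrix)
```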

@@ -4155,11 +4174,13 @@ def _reduce_to_n_topics(self, documents: pd.DataFrame) -> pd.DataFrame:
self._update_topic_size(documents)
return documents

def _auto_reduce_topics(self, documents: pd.DataFrame) -> pd.DataFrame:
def _auto_reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False) -> pd.DataFrame:
""" Reduce the number of topics automatically using HDBSCAN

Arguments:
documents: Dataframe with documents and their corresponding IDs and Topics
use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, semantic
embeddings are used.

Returns:
documents: Updated dataframe with documents and the reduced number of Topics
@@ -4169,10 +4190,7 @@ def _auto_reduce_topics(self, documents: pd.DataFrame) -> pd.DataFrame:
max_topic = unique_topics[-1]

# Find similar topics
if self.topic_embeddings_ is not None:
embeddings = np.array(self.topic_embeddings_)
else:
embeddings = self.c_tf_idf_.toarray()
embeddings = select_topic_representation(self.c_tf_idf_, self.topic_embeddings_, use_ctfidf)[0]
norm_data = normalize(embeddings, norm='l2')
predictions = hdbscan.HDBSCAN(min_cluster_size=2,
metric='euclidean',
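The hunk above is truncated, but the automatic path normalizes the selected embeddings and clusters them with HDBSCAN. A rough, self-contained sketch of that idea: `min_cluster_size=2` and the L2 normalization mirror the diff, while the shapes and remaining parameters are illustrative.

```python
import hdbscan
import numpy as np
from sklearn.preprocessing import normalize

embeddings = np.random.rand(12, 384)                 # stand-in topic representation
norm_data = normalize(embeddings, norm="l2")         # L2-norm so Euclidean distance tracks cosine

# Topics assigned to the same cluster become merge candidates; label -1 means "keep as is"
predictions = hdbscan.HDBSCAN(min_cluster_size=2, metric="euclidean").fit_predict(norm_data)
```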
56 changes: 55 additions & 1 deletion bertopic/_utils.py
@@ -4,7 +4,7 @@
from collections.abc import Iterable
from scipy.sparse import csr_matrix
from scipy.spatial.distance import squareform

from typing import Optional, Tuple, Union

class MyLogger:
def __init__(self, level):
@@ -147,3 +147,57 @@ def validate_distance_matrix(X, n_samples):
raise ValueError("Distance matrix cannot contain negative values.")

return X


def select_topic_representation(
ctfidf_embeddings: Optional[Union[np.ndarray, csr_matrix]] = None,
semantic_embeddings: Optional[np.ndarray] = None,
use_ctfidf: bool = True
) -> Tuple[np.ndarray, bool]:
"""Select the topic representation.

Arguments:
ctfidf_embeddings: The c-TF-IDF embedding matrix
semantic_embeddings: The semantic embedding matrix
use_ctfidf: Whether to use the c-TF-IDF representation. If False, the topic embedding representation is used,
if it exists. Default is True.

Raises:
ValueError:
- If no topic representation was found
- If c-TF-IDF embeddings are not a numpy array or a scipy.sparse.csr_matrix

Returns:
The selected topic representation and a boolean indicating whether it is c-TF-IDF.
"""

def to_ndarray(array: Union[np.ndarray, csr_matrix]) -> np.ndarray:
if isinstance(array, csr_matrix):
return array.toarray()
elif not isinstance(array, np.ndarray):
raise ValueError("The embeddings should be either a type of `numpy.ndarray` or `scipy.sparse.csr_matrix`")
return array

logger = MyLogger("WARNING")

if ctfidf_embeddings is None and semantic_embeddings is None:
raise ValueError("No topic representation was found.")

if ctfidf_embeddings is not None:
ctfidf_embeddings = to_ndarray(ctfidf_embeddings)

if use_ctfidf:
if ctfidf_embeddings is None:
logger.warning(
"No c-TF-IDF matrix was found despite it is supposed to be used (`use_ctfidf` is True). "
"Defaulting to semantic embeddings."
)
return semantic_embeddings, False
return ctfidf_embeddings, True
else:
if semantic_embeddings is None:
logger.warning(
"No topic embeddings were found despite they are supposed to be used (`use_ctfidf` is False). "
"Defaulting to c-TF-IDF representation."
)
return ctfidf_embeddings, True
return semantic_embeddings, False
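A small, hypothetical round trip through the new helper, just to make the return contract concrete; the shapes and random data are made up for illustration.

```python
import numpy as np
from scipy.sparse import csr_matrix
from bertopic._utils import select_topic_representation

ctfidf = csr_matrix(np.random.rand(5, 100))      # sparse c-TF-IDF stand-in
topic_embeddings = np.random.rand(5, 384)        # dense semantic stand-in

# Default: prefer c-TF-IDF; sparse input comes back as a dense ndarray
repr_, is_ctfidf = select_topic_representation(ctfidf, topic_embeddings)
assert is_ctfidf and repr_.shape == (5, 100)

# Ask for the semantic embeddings instead
repr_, is_ctfidf = select_topic_representation(ctfidf, topic_embeddings, use_ctfidf=False)
assert not is_ctfidf and repr_.shape == (5, 384)
```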
14 changes: 8 additions & 6 deletions bertopic/plotting/_heatmap.py
@@ -2,6 +2,7 @@
from typing import List, Union
from scipy.cluster.hierarchy import fcluster, linkage
from sklearn.metrics.pairwise import cosine_similarity
from bertopic._utils import select_topic_representation

import plotly.express as px
import plotly.graph_objects as go
@@ -11,13 +12,14 @@ def visualize_heatmap(topic_model,
topics: List[int] = None,
top_n_topics: int = None,
n_clusters: int = None,
use_ctfidf: bool = False,
custom_labels: Union[bool, str] = False,
title: str = "<b>Similarity Matrix</b>",
width: int = 800,
height: int = 800) -> go.Figure:
""" Visualize a heatmap of the topic's similarity matrix

Based on the cosine similarity matrix between topic embeddings,
Based on the cosine similarity matrix between topic embeddings (either c-TF-IDF or semantic embeddings),
a heatmap is created showing the similarity between topics.

Arguments:
Expand All @@ -26,6 +28,8 @@ def visualize_heatmap(topic_model,
top_n_topics: Only select the top n most frequent topics.
n_clusters: Create n clusters and order the similarity
matrix by those clusters.
use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, semantic
embeddings are used.
custom_labels: If bool, whether to use custom topic labels that were defined using
`topic_model.set_topic_labels`.
If `str`, it uses labels from other aspects, e.g., "Aspect1".
@@ -55,11 +59,9 @@
style="width:1000px; height: 720px; border: 0px;""></iframe>
"""

# Select topic embeddings
if topic_model.topic_embeddings_ is not None:
embeddings = np.array(topic_model.topic_embeddings_)[topic_model._outliers:]
else:
embeddings = topic_model.c_tf_idf_[topic_model._outliers:]
embeddings = select_topic_representation(
topic_model.c_tf_idf_, topic_model.topic_embeddings_, use_ctfidf
)[0][topic_model._outliers:]

# Select topics based on top_n and topics args
freq_df = topic_model.get_topic_freq()
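Finally, a hedged sketch of how the new flag surfaces in the visualizations, assuming a fitted BERTopic instance; `topic_model` and the output file names are placeholders.

```python
# Hierarchy computed from semantic topic embeddings rather than c-TF-IDF
fig = topic_model.visualize_hierarchy(use_ctfidf=False)
fig.write_html("hierarchy.html")

# Heatmap of topic similarities based on c-TF-IDF (this method defaults to False)
fig = topic_model.visualize_heatmap(use_ctfidf=True)
fig.write_html("heatmap.html")
```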