Hierarchical topics calculated with topic embeddings #1894

Merged · 9 commits · Jun 8, 2024
64 changes: 41 additions & 23 deletions bertopic/_bertopic.py
@@ -54,7 +54,7 @@
from bertopic.cluster._utils import hdbscan_delegator, is_supported_hdbscan
from bertopic._utils import (
MyLogger, check_documents_type, check_embeddings_shape,
check_is_fitted, validate_distance_matrix
check_is_fitted, validate_distance_matrix, select_topic_representation
)
import bertopic._save_utils as save_utils

@@ -913,12 +913,13 @@ def topics_per_class(self,

def hierarchical_topics(self,
docs: List[str],
use_ctfidf: bool = True,
linkage_function: Callable[[csr_matrix], np.ndarray] = None,
distance_function: Callable[[csr_matrix], csr_matrix] = None) -> pd.DataFrame:
""" Create a hierarchy of topics

To create this hierarchy, BERTopic needs to be already fitted once.
Then, a hierarchy is calculated on the distance matrix of the c-TF-IDF
Then, a hierarchy is calculated on the distance matrix of the c-TF-IDF or topic embeddings
representation using `scipy.cluster.hierarchy.linkage`.

Based on that hierarchy, we calculate the topic representation at each
@@ -928,12 +929,14 @@ def hierarchical_topics(self,

Arguments:
docs: The documents you used when calling either `fit` or `fit_transform`
use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, semantic
embeddings are used.
linkage_function: The linkage function to use. Default is:
`lambda x: sch.linkage(x, 'ward', optimal_ordering=True)`
distance_function: The distance function to use on the c-TF-IDF matrix. Default is:
`lambda x: 1 - cosine_similarity(x)`.
You can pass any function that returns either a square matrix of
shape (n_samples, n_samples) with zeros on the diagonal and
non-negative values or condensed distance matrix of shape
(n_samples * (n_samples - 1) / 2,) containing the upper
triangular of the distance matrix.
@@ -972,7 +975,7 @@ def hierarchical_topics(self,
linkage_function = lambda x: sch.linkage(x, 'ward', optimal_ordering=True)

# Calculate distance
embeddings = self.c_tf_idf_[self._outliers:]
embeddings = select_topic_representation(self.c_tf_idf_, self.topic_embeddings_, use_ctfidf)[0][self._outliers:]
X = distance_function(embeddings)
X = validate_distance_matrix(X, embeddings.shape[0])
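For context, a minimal usage sketch of the changed API. It assumes `topic_model` is an already fitted BERTopic instance and `docs` are the documents it was fitted on; both names are placeholders, not part of this diff.

```python
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist

# Build the topic hierarchy from the semantic topic embeddings instead of c-TF-IDF
hierarchical_topics = topic_model.hierarchical_topics(docs, use_ctfidf=False)

# Custom functions are still supported; a condensed cosine distance matrix is also accepted
hierarchical_topics = topic_model.hierarchical_topics(
    docs,
    distance_function=lambda x: pdist(x, metric="cosine"),          # shape (n*(n-1)/2,)
    linkage_function=lambda x: sch.linkage(x, "average", optimal_ordering=True),
)
```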

@@ -2004,13 +2007,15 @@ def merge_topics(self,
def reduce_topics(self,
docs: List[str],
nr_topics: Union[int, str] = 20,
images: List[str] = None) -> None:
images: List[str] = None,
use_ctfidf: bool = False,
) -> None:
""" Reduce the number of topics to a fixed number of topics
or automatically.

If nr_topics is an integer, then the number of topics is reduced
to nr_topics using `AgglomerativeClustering` on the cosine distance matrix
of the topic embeddings.
of the topic c-TF-IDF or semantic embeddings.

If nr_topics is `"auto"`, then HDBSCAN is used to automatically
reduce the number of topics by running it on the topic embeddings.
@@ -2022,6 +2027,8 @@ def reduce_topics(self,
nr_topics: The number of topics you want reduced to
images: A list of paths to the images used when calling either
`fit` or `fit_transform`
use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, semantic
embeddings are used.

Updates:
topics_ : Assigns topics to their merged representations.
@@ -2050,7 +2057,7 @@ def reduce_topics(self,
documents = pd.DataFrame({"Document": docs, "Topic": self.topics_, "Image": images, "ID": range(len(docs))})

# Reduce number of topics
documents = self._reduce_topics(documents)
documents = self._reduce_topics(documents, use_ctfidf)
self._merged_topics = None
self._save_representative_docs(documents)
self.probabilities_ = self._map_probabilities(self.probabilities_)
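A hedged usage sketch of the new `use_ctfidf` flag on `reduce_topics`, again assuming a previously fitted model; `topic_model` and `docs` are placeholder names.

```python
# Reduce to a fixed number of topics using c-TF-IDF distances between topics
topic_model.reduce_topics(docs, nr_topics=20, use_ctfidf=True)

# Or let HDBSCAN decide automatically, using the semantic topic embeddings
# (use_ctfidf defaults to False for this method)
topic_model.reduce_topics(docs, nr_topics="auto", use_ctfidf=False)
```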
@@ -2821,6 +2828,7 @@ def visualize_hierarchy(self,
orientation: str = "left",
topics: List[int] = None,
top_n_topics: int = None,
use_ctfidf: bool = True,
custom_labels: bool = False,
title: str = "<b>Hierarchical Clustering</b>",
width: int = 1000,
@@ -2833,14 +2841,16 @@ def visualize_hierarchy(self,

A ward linkage function is used to perform the
hierarchical clustering based on the cosine distance
matrix between topic embeddings.
matrix between c-TF-IDF or semantic embeddings of the topics.

Arguments:
topic_model: A fitted BERTopic instance.
orientation: The orientation of the figure.
Either 'left' or 'bottom'
topics: A selection of topics to visualize
top_n_topics: Only select the top n most frequent topics
use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, semantic
embeddings are used.
custom_labels: Whether to use custom topic labels that were defined using
`topic_model.set_topic_labels`.
NOTE: Custom labels are only generated for the original
@@ -2901,6 +2911,7 @@ def visualize_hierarchy(self,
orientation=orientation,
topics=topics,
top_n_topics=top_n_topics,
use_ctfidf=use_ctfidf,
custom_labels=custom_labels,
title=title,
width=width,
@@ -2915,20 +2926,23 @@ def visualize_heatmap(self,
topics: List[int] = None,
top_n_topics: int = None,
n_clusters: int = None,
use_ctfidf: bool = False,
custom_labels: bool = False,
title: str = "<b>Similarity Matrix</b>",
width: int = 800,
height: int = 800) -> go.Figure:
""" Visualize a heatmap of the topic's similarity matrix

Based on the cosine similarity matrix between topic embeddings,
Based on the cosine similarity matrix between c-TF-IDFs or semantic embeddings of the topics,
a heatmap is created showing the similarity between topics.

Arguments:
topics: A selection of topics to visualize.
top_n_topics: Only select the top n most frequent topics.
n_clusters: Create n clusters and order the similarity
matrix by those clusters.
use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, semantic
embeddings are used.
custom_labels: Whether to use custom topic labels that were defined using
`topic_model.set_topic_labels`.
title: Title of the plot.
@@ -2959,6 +2973,7 @@ def visualize_heatmap(self,
topics=topics,
top_n_topics=top_n_topics,
n_clusters=n_clusters,
use_ctfidf=use_ctfidf,
custom_labels=custom_labels,
title=title,
width=width,
@@ -4084,45 +4099,49 @@ def _extract_words_per_topic(self,

return topics

def _reduce_topics(self, documents: pd.DataFrame) -> pd.DataFrame:
def _reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False) -> pd.DataFrame:
""" Reduce topics to self.nr_topics

Arguments:
documents: Dataframe with documents and their corresponding IDs and Topics
use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, semantic
embeddings are used.

Returns:
documents: Updated dataframe with documents and the reduced number of Topics
"""

logger.info("Topic reduction - Reducing number of topics")
initial_nr_topics = len(self.get_topics())

if isinstance(self.nr_topics, int):
if self.nr_topics < initial_nr_topics:
documents = self._reduce_to_n_topics(documents)
documents = self._reduce_to_n_topics(documents, use_ctfidf)
elif isinstance(self.nr_topics, str):
documents = self._auto_reduce_topics(documents)
documents = self._auto_reduce_topics(documents, use_ctfidf)
else:
raise ValueError("nr_topics needs to be an int or 'auto'! ")

logger.info(f"Topic reduction - Reduced number of topics from {initial_nr_topics} to {len(self.get_topic_freq())}")
return documents

def _reduce_to_n_topics(self, documents: pd.DataFrame) -> pd.DataFrame:
def _reduce_to_n_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False) -> pd.DataFrame:
""" Reduce topics to self.nr_topics

Arguments:
documents: Dataframe with documents and their corresponding IDs and Topics
use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, semantic
embeddings are used.

Returns:
documents: Updated dataframe with documents and the reduced number of Topics
"""
topics = documents.Topic.tolist().copy()

# Create topic distance matrix
if self.topic_embeddings_ is not None:
topic_embeddings = self.topic_embeddings_[self._outliers:, ]
else:
topic_embeddings = self.c_tf_idf_[self._outliers:, ].toarray()
topic_embeddings = select_topic_representation(
self.c_tf_idf_, self.topic_embeddings_, use_ctfidf
)[0][self._outliers:]
distance_matrix = 1-cosine_similarity(topic_embeddings)
np.fill_diagonal(distance_matrix, 0)
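The reduction step above boils down to turning the selected topic representation into a cosine distance matrix and clustering it. A minimal sketch of that idea follows; the `AgglomerativeClustering` parameters and shapes are illustrative assumptions, not the library's exact call, and `metric="precomputed"` assumes scikit-learn 1.2 or newer.

```python
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import cosine_similarity

topic_embeddings = np.random.rand(8, 384)           # stand-in for the selected representation
distance_matrix = 1 - cosine_similarity(topic_embeddings)
np.fill_diagonal(distance_matrix, 0)                 # guard against floating-point noise

# Cluster topics on the precomputed distances (illustrative parameters)
labels = AgglomerativeClustering(
    n_clusters=4, metric="precomputed", linkage="average"
).fit_predict(distance_matrix)
```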

@@ -4155,11 +4174,13 @@ def _reduce_to_n_topics(self, documents: pd.DataFrame) -> pd.DataFrame:
self._update_topic_size(documents)
return documents

def _auto_reduce_topics(self, documents: pd.DataFrame) -> pd.DataFrame:
def _auto_reduce_topics(self, documents: pd.DataFrame, use_ctfidf: bool = False) -> pd.DataFrame:
""" Reduce the number of topics automatically using HDBSCAN

Arguments:
documents: Dataframe with documents and their corresponding IDs and Topics
use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, semantic
embeddings are used.

Returns:
documents: Updated dataframe with documents and the reduced number of Topics
@@ -4169,10 +4190,7 @@ def _auto_reduce_topics(self, documents: pd.DataFrame) -> pd.DataFrame:
max_topic = unique_topics[-1]

# Find similar topics
if self.topic_embeddings_ is not None:
embeddings = np.array(self.topic_embeddings_)
else:
embeddings = self.c_tf_idf_.toarray()
embeddings = select_topic_representation(self.c_tf_idf_, self.topic_embeddings_, use_ctfidf)[0]
norm_data = normalize(embeddings, norm='l2')
predictions = hdbscan.HDBSCAN(min_cluster_size=2,
metric='euclidean',
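The hunk above is truncated, but the automatic path normalizes the selected embeddings and clusters them with HDBSCAN. A rough, self-contained sketch of that idea: `min_cluster_size=2` and the L2 normalization mirror the diff, while the shapes and remaining parameters are illustrative.

```python
import hdbscan
import numpy as np
from sklearn.preprocessing import normalize

embeddings = np.random.rand(12, 384)                 # stand-in topic representation
norm_data = normalize(embeddings, norm="l2")         # L2-norm so Euclidean distance tracks cosine

# Topics assigned to the same cluster become merge candidates; label -1 means "keep as is"
predictions = hdbscan.HDBSCAN(min_cluster_size=2, metric="euclidean").fit_predict(norm_data)
```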
56 changes: 55 additions & 1 deletion bertopic/_utils.py
@@ -4,7 +4,7 @@
from collections.abc import Iterable
from scipy.sparse import csr_matrix
from scipy.spatial.distance import squareform

from typing import Optional, Tuple, Union

class MyLogger:
def __init__(self, level):
@@ -147,3 +147,57 @@ def validate_distance_matrix(X, n_samples):
raise ValueError("Distance matrix cannot contain negative values.")

return X


def select_topic_representation(
ctfidf_embeddings: Optional[Union[np.ndarray, csr_matrix]] = None,
semantic_embeddings: Optional[np.ndarray] = None,
use_ctfidf: bool = True
) -> Tuple[np.ndarray, bool]:
"""Select the topic representation.

Arguments:
ctfidf_embeddings: The c-TF-IDF embedding matrix
semantic_embeddings: The semantic embedding matrix
use_ctfidf: Whether to use the c-TF-IDF representation. If False, the topic embedding representation is used,
if it exists. Default is True.

Raises:
ValueError:
- If no topic representation was found
- If c-TF-IDF embeddings are not a numpy array or a scipy.sparse.csr_matrix

Returns:
The selected topic representation and a boolean indicating whether it is c-TF-IDF.
"""

def to_ndarray(array: Union[np.ndarray, csr_matrix]) -> np.ndarray:
if isinstance(array, csr_matrix):
return array.toarray()
elif not isinstance(array, np.ndarray):
raise ValueError("The embeddings should be either a type of `numpy.ndarray` or `scipy.sparse.csr_matrix`")
return array

logger = MyLogger("WARNING")

if ctfidf_embeddings is None and semantic_embeddings is None:
raise ValueError("No topic representation was found.")

if ctfidf_embeddings is not None:
ctfidf_embeddings = to_ndarray(ctfidf_embeddings)

if use_ctfidf:
if ctfidf_embeddings is None:
logger.warning(
"No c-TF-IDF matrix was found despite it is supposed to be used (`use_ctfidf` is True). "
"Defaulting to semantic embeddings."
)
return semantic_embeddings, False
return ctfidf_embeddings, True
else:
if semantic_embeddings is None:
logger.warning(
"No topic embeddings were found despite they are supposed to be used (`use_ctfidf` is False). "
"Defaulting to c-TF-IDF representation."
)
return ctfidf_embeddings, True
return semantic_embeddings, False
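A small, hypothetical round trip through the new helper, just to make the return contract concrete; the shapes and random data are made up for illustration.

```python
import numpy as np
from scipy.sparse import csr_matrix
from bertopic._utils import select_topic_representation

ctfidf = csr_matrix(np.random.rand(5, 100))      # sparse c-TF-IDF stand-in
topic_embeddings = np.random.rand(5, 384)        # dense semantic stand-in

# Default: prefer c-TF-IDF; sparse input comes back as a dense ndarray
repr_, is_ctfidf = select_topic_representation(ctfidf, topic_embeddings)
assert is_ctfidf and repr_.shape == (5, 100)

# Ask for the semantic embeddings instead
repr_, is_ctfidf = select_topic_representation(ctfidf, topic_embeddings, use_ctfidf=False)
assert not is_ctfidf and repr_.shape == (5, 384)
```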
14 changes: 8 additions & 6 deletions bertopic/plotting/_heatmap.py
@@ -2,6 +2,7 @@
from typing import List, Union
from scipy.cluster.hierarchy import fcluster, linkage
from sklearn.metrics.pairwise import cosine_similarity
from bertopic._utils import select_topic_representation

import plotly.express as px
import plotly.graph_objects as go
@@ -11,13 +12,14 @@ def visualize_heatmap(topic_model,
topics: List[int] = None,
top_n_topics: int = None,
n_clusters: int = None,
use_ctfidf: bool = False,
custom_labels: Union[bool, str] = False,
title: str = "<b>Similarity Matrix</b>",
width: int = 800,
height: int = 800) -> go.Figure:
""" Visualize a heatmap of the topic's similarity matrix

Based on the cosine similarity matrix between topic embeddings,
Based on the cosine similarity matrix between topic embeddings (either c-TF-IDF or semantic embeddings),
a heatmap is created showing the similarity between topics.

Arguments:
Expand All @@ -26,6 +28,8 @@ def visualize_heatmap(topic_model,
top_n_topics: Only select the top n most frequent topics.
n_clusters: Create n clusters and order the similarity
matrix by those clusters.
use_ctfidf: Whether to calculate distances between topics based on c-TF-IDF embeddings. If False, semantic
embeddings are used.
custom_labels: If bool, whether to use custom topic labels that were defined using
`topic_model.set_topic_labels`.
If `str`, it uses labels from other aspects, e.g., "Aspect1".
@@ -55,11 +59,9 @@
style="width:1000px; height: 720px; border: 0px;""></iframe>
"""

# Select topic embeddings
if topic_model.topic_embeddings_ is not None:
embeddings = np.array(topic_model.topic_embeddings_)[topic_model._outliers:]
else:
embeddings = topic_model.c_tf_idf_[topic_model._outliers:]
embeddings = select_topic_representation(
topic_model.c_tf_idf_, topic_model.topic_embeddings_, use_ctfidf
)[0][topic_model._outliers:]

# Select topics based on top_n and topics args
freq_df = topic_model.get_topic_freq()
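Finally, a hedged sketch of how the new flag surfaces in the visualizations, assuming a fitted BERTopic instance; `topic_model` and the output file names are placeholders.

```python
# Hierarchy computed from semantic topic embeddings rather than c-TF-IDF
fig = topic_model.visualize_hierarchy(use_ctfidf=False)
fig.write_html("hierarchy.html")

# Heatmap of topic similarities based on c-TF-IDF (this method defaults to False)
fig = topic_model.visualize_heatmap(use_ctfidf=True)
fig.write_html("heatmap.html")
```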