-
Notifications
You must be signed in to change notification settings - Fork 357
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing 10 changed files with 140 additions and 50 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
from keybert._model import KeyBERT | ||
from keybert._llm import KeyLLM | ||
from keybert._model import KeyBERT | ||
|
||
__version__ = "0.7.0" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,38 +1,20 @@ | ||
import pandas as pd | ||
from scipy.sparse import csr_matrix | ||
from sklearn.base import BaseEstimator | ||
from typing import Mapping, List, Tuple | ||
from typing import List | ||
|
||
|
||
class BaseRepresentation(BaseEstimator): | ||
class BaseLLM(BaseEstimator): | ||
""" The base representation model for fine-tuning topic representations """ | ||
def extract_topics(self, | ||
topic_model, | ||
documents: pd.DataFrame, | ||
c_tf_idf: csr_matrix, | ||
topics: Mapping[str, List[Tuple[str, float]]] | ||
) -> Mapping[str, List[Tuple[str, float]]]: | ||
def extract_keywords(self, documents: List[str], candidate_keywords: List[List[str]] = None): | ||
""" Extract topics | ||
Each representation model that inherits this class will have | ||
its arguments (topic_model, documents, c_tf_idf, topics) | ||
automatically passed. Therefore, the representation model | ||
will only have access to the information about topics related | ||
to those arguments. | ||
Arguments: | ||
topic_model: The BERTopic model that is fitted until topic | ||
representations are calculated. | ||
documents: A dataframe with columns "Document" and "Topic" | ||
that contains all documents with each corresponding | ||
topic. | ||
c_tf_idf: A c-TF-IDF representation that is typically | ||
identical to `topic_model.c_tf_idf_` except for | ||
dynamic, class-based, and hierarchical topic modeling | ||
where it is calculated on a subset of the documents. | ||
topics: A dictionary with topic (key) and tuple of word and | ||
weight (value) as calculated by c-TF-IDF. This is the | ||
default topics that are returned if no representation | ||
model is used. | ||
documents: The documents to extract keywords from | ||
candidate_keywords: A list of candidate keywords that the LLM will fine-tune | ||
For example, it will create a nicer representation of | ||
the candidate keywords, remove redundant keywords, or | ||
shorten them depending on the input prompt. | ||
Returns: | ||
all_keywords: All keywords for each document | ||
""" | ||
return topic_model.topic_representations_ | ||
return [None for document in documents] |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters