Commit
clean up unnecessary overrides
Some overrides were not needed; they were introduced while working out the best place to save the BERTopic results.

This commit cleans them up.
varun646 committed Feb 25, 2025
1 parent 320d510 commit 8152e29
Showing 1 changed file with 8 additions and 266 deletions.
274 changes: 8 additions & 266 deletions src/PatientX/models/BERTopicModel.py
@@ -146,7 +146,13 @@ def _extract_words_per_topic(
c_tf_idf: csr_matrix = None,
calculate_aspects: bool = True,
) -> Mapping[str, List[Tuple[str, float]]]:
"""Based on tf_idf scores per topic, extract the top n words per topic.
"""
NOTE: this function overrides bertopic._extract_words_per_topic().
The only difference is that we explicitly save the representative words to
self.bertopic_representative_words so that the intermediate BERTopic results can be saved later.
Based on tf_idf scores per topic, extract the top n words per topic.
If the top words per topic need to be extracted, then only the `words` parameter
needs to be passed. If the top words per topic in a specific timestamp, then it
@@ -185,6 +191,7 @@ def _extract_words_per_topic(
for index, label in enumerate(labels)
}

# NOTE: this is the only change from bertopic._extract_words_per_topic()
self.bertopic_representative_words = {label: values[: self.top_n_words] for label, values in base_topics.items()}

# Fine-tune the topic representations
@@ -234,271 +241,6 @@ def _extract_words_per_topic(

return topics

@override
def fit(self,
documents: List[str],
embeddings: np.ndarray = None,
images: List[str] = None,
y: Union[List[int], np.ndarray] = None,):
"""Fit the models on a collection of documents, generate topics,
and return the probabilities and topic per document.
Arguments:
documents: A list of documents to fit on
embeddings: Pre-trained document embeddings. These can be used
instead of the sentence-transformer model
images: A list of paths to the images to fit on or the images themselves
y: The target class for (semi)-supervised modeling. Use -1 if no class for a
specific instance is specified.
Returns:
predictions: Topic predictions for each document
probabilities: The probability of the assigned topic per document.
If `calculate_probabilities` in BERTopic is set to True, then
it calculates the probabilities of all topics across all documents
instead of only the assigned topic. This, however, slows down
computation and may increase memory usage.
Examples:
```python
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
docs = fetch_20newsgroups(subset='all')['data']
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs)
```
If you want to use your own embeddings, use it as follows:
```python
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
from sentence_transformers import SentenceTransformer
# Create embeddings
docs = fetch_20newsgroups(subset='all')['data']
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=True)
# Create topic model
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs, embeddings)
```
"""
if documents is not None:
check_documents_type(documents)
check_embeddings_shape(embeddings, documents)

doc_ids = range(len(documents)) if documents is not None else range(len(images))
documents = pd.DataFrame({"Document": documents, "ID": doc_ids, "Topic": None, "Image": images})

# Extract embeddings
if embeddings is None:
logger.info("Embedding - Transforming documents to embeddings.")
self.embedding_model = select_backend(self.embedding_model, language=self.language, verbose=self.verbose)
embeddings = self._extract_embeddings(
documents.Document.values.tolist(),
images=images,
method="document",
verbose=self.verbose,
)
logger.info("Embedding - Completed \u2713")
else:
if self.embedding_model is not None:
self.embedding_model = select_backend(
self.embedding_model, language=self.language, verbose=self.verbose
)

# Guided Topic Modeling
if self.seed_topic_list is not None and self.embedding_model is not None:
y, embeddings = self._guided_topic_modeling(embeddings)

# Reduce dimensionality and fit UMAP model
umap_embeddings = self._reduce_dimensionality(embeddings, y)

# Zero-shot Topic Modeling
if self._is_zeroshot():
documents, embeddings, assigned_documents, assigned_embeddings = self._zeroshot_topic_modeling(
documents, embeddings
)

# Filter UMAP embeddings to only non-assigned embeddings to be used for clustering
if len(documents) > 0:
umap_embeddings = self.umap_model.transform(embeddings)

if len(documents) > 0:
# Cluster reduced embeddings
documents, probabilities = self._cluster_embeddings(umap_embeddings, documents, y=y)
if self._is_zeroshot() and len(assigned_documents) > 0:
documents, embeddings = self._combine_zeroshot_topics(
documents, embeddings, assigned_documents, assigned_embeddings
)
else:
# All documents match zero-shot topics
documents = assigned_documents
embeddings = assigned_embeddings

# Sort and Map Topic IDs by their frequency
if not self.nr_topics:
documents = self._sort_mappings_by_frequency(documents)

# Create documents from images if we have images only
if documents.Document.values[0] is None:
custom_documents = self._images_to_text(documents, embeddings)

# Extract topics by calculating c-TF-IDF
self._extract_topics(custom_documents, embeddings=embeddings)
self._create_topic_vectors(documents=documents, embeddings=embeddings)

# Reduce topics
if self.nr_topics:
custom_documents = self._reduce_topics(custom_documents)

# Save the top representative documents per topic
self._save_representative_docs(custom_documents)
else:
# Extract topics by calculating c-TF-IDF
self._extract_topics(documents, embeddings=embeddings, verbose=self.verbose)

# Reduce topics
if self.nr_topics:
documents = self._reduce_topics(documents)

# Save the top 3 most representative documents per topic
self._save_representative_docs(documents)

# In the case of zero-shot topics, probability will come from cosine similarity,
# and the HDBSCAN model will be removed
if self._is_zeroshot() and len(assigned_documents) > 0:
self.hdbscan_model = BaseCluster()
sim_matrix = cosine_similarity(embeddings, np.array(self.topic_embeddings_))

if self.calculate_probabilities:
self.probabilities_ = sim_matrix
else:
self.probabilities_ = np.max(sim_matrix, axis=1)
else:
self.probabilities_ = self._map_probabilities(probabilities, original_topics=True)
predictions = documents.Topic.to_list()

# TODO: save bertopic only results

return predictions, self.probabilities_

@override
def update_topics(
self,
docs: List[str],
images: List[str] = None,
topics: List[int] = None,
top_n_words: int = 10,
n_gram_range: tuple[int, int] = None,
vectorizer_model: CountVectorizer = None,
ctfidf_model: ClassTfidfTransformer = None,
representation_model: BaseRepresentation = None,
):
"""Updates the topic representation by recalculating c-TF-IDF with the new
parameters as defined in this function.
When you have trained a model and viewed the topics and the words that represent them,
you might not be satisfied with the representation. Perhaps you forgot to remove
stop_words or you want to try out a different n_gram_range. This function allows you
to update the topic representation after they have been formed.
Arguments:
docs: The documents you used when calling either `fit` or `fit_transform`
images: The images you used when calling either `fit` or `fit_transform`
topics: A list of topics where each topic is related to a document in `docs`.
Use this variable to change or map the topics.
NOTE: Using a custom list of topic assignments may lead to errors if
topic reduction techniques are used afterwards. Make sure that
manually assigning topics is the last step in the pipeline
top_n_words: The number of words per topic to extract. Setting this
too high can negatively impact topic embeddings as topics
are typically best represented by at most 10 words.
n_gram_range: The n-gram range for the CountVectorizer.
vectorizer_model: Pass in your own CountVectorizer from scikit-learn
ctfidf_model: Pass in your own c-TF-IDF model to update the representations
representation_model: Pass in a model that fine-tunes the topic representations
calculated through c-TF-IDF. Models from `bertopic.representation`
are supported.
Examples:
In order to update the topic representation, you will need to first fit the topic
model and extract topics from them. Based on these, you can update the representation:
```python
topic_model.update_topics(docs, n_gram_range=(2, 3))
```
You can also use a custom vectorizer to update the representation:
```python
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
topic_model.update_topics(docs, vectorizer_model=vectorizer_model)
```
You can also use this function to change or map the topics to something else.
You can update them as follows:
```python
topic_model.update_topics(docs, my_updated_topics)
```
"""
check_documents_type(docs)
check_is_fitted(self)
if not n_gram_range:
n_gram_range = self.n_gram_range

if top_n_words > 100:
logger.warning(
"Note that extracting more than 100 words from a sparse " "can slow down computation quite a bit."
)
self.top_n_words = top_n_words
self.vectorizer_model = vectorizer_model or CountVectorizer(ngram_range=n_gram_range)
self.ctfidf_model = ctfidf_model or ClassTfidfTransformer()
self.representation_model = representation_model

if topics is None:
topics = self.topics_
else:
logger.warning(
"Using a custom list of topic assignments may lead to errors if "
"topic reduction techniques are used afterwards. Make sure that "
"manually assigning topics is the last step in the pipeline."
"Note that topic embeddings will also be created through weighted"
"c-TF-IDF embeddings instead of centroid embeddings."
)

documents = pd.DataFrame({"Document": docs, "Topic": topics, "ID": range(len(docs)), "Image": images})
documents_per_topic = documents.groupby(["Topic"], as_index=False).agg({"Document": " ".join})

# Update topic sizes and assignments
self._update_topic_size(documents)

# Extract words and update topic labels
self.c_tf_idf_, words = self._c_tf_idf(documents_per_topic)
self.topic_representations_ = self._extract_words_per_topic(words, documents)

# Update topic vectors
if set(topics) != self.topics_:
# Remove outlier topic embedding if all that has changed is the outlier class
same_position = all(
[
True if old_topic == new_topic else False
for old_topic, new_topic in zip(self.topics_, topics)
if old_topic != -1
]
)
if same_position and -1 not in topics and -1 in self.topics_:
self.topic_embeddings_ = self.topic_embeddings_[1:]
else:
self._create_topic_vectors()



def visualize_document_datamap(
self,
docs: List[str],
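As context for the retained override, here is a minimal sketch (assuming the stock `bertopic` API and the `BERTopicModel` subclass shown in the diff) of the pattern this commit keeps: only `_extract_words_per_topic` stays overridden, and it snapshots the top-n words per topic so the intermediate BERTopic results can be written out later. Unlike the actual override above, which captures the words before representation fine-tuning, this sketch delegates to the parent first and captures them afterwards.

```python
from typing import List, Mapping, Tuple

import pandas as pd
from bertopic import BERTopic
from scipy.sparse import csr_matrix


class BERTopicModel(BERTopic):
    """Sketch: keep only the override that snapshots intermediate top words."""

    def _extract_words_per_topic(
        self,
        words: List[str],
        documents: pd.DataFrame,
        c_tf_idf: csr_matrix = None,
        calculate_aspects: bool = True,
    ) -> Mapping[str, List[Tuple[str, float]]]:
        # Delegate to stock BERTopic, then keep a copy of the top-n words per
        # topic. Note: the real override in this repo captures the words
        # *before* representation fine-tuning; capturing them afterwards is a
        # simplification for this sketch.
        topics = super()._extract_words_per_topic(words, documents, c_tf_idf, calculate_aspects)
        self.bertopic_representative_words = {
            label: values[: self.top_n_words] for label, values in topics.items()
        }
        return topics
```

With `fit` and `update_topics` no longer overridden, training goes through stock BERTopic unchanged, and the snapshot is available afterwards as `model.bertopic_representative_words`.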
