Commit
clean up unnecessary overrides
Some overrides were not needed; they were introduced while working out the best place to save the BERTopic results.

This commit cleans them up.
varun646 committed Feb 25, 2025
1 parent 320d510 commit 8152e29
Showing 1 changed file with 8 additions and 266 deletions.
274 changes: 8 additions & 266 deletions src/PatientX/models/BERTopicModel.py
@@ -146,7 +146,13 @@ def _extract_words_per_topic(
c_tf_idf: csr_matrix = None,
calculate_aspects: bool = True,
) -> Mapping[str, List[Tuple[str, float]]]:
"""Based on tf_idf scores per topic, extract the top n words per topic.
"""
NOTE: this function overrides bertopic._extract_words_per_topic().
The only difference is that we explicitly save the representative words to
self.bertopic_representative_words so that the intermediate BERTopic results can be saved later.
Based on tf_idf scores per topic, extract the top n words per topic.
If the top words per topic need to be extracted, then only the `words` parameter
needs to be passed. If the top words per topic in a specific timestamp, then it
@@ -185,6 +191,7 @@ def _extract_words_per_topic(
for index, label in enumerate(labels)
}

# NOTE: this is the only change from bertopic._extract_words_per_topic()
self.bertopic_representative_words = {label: values[: self.top_n_words] for label, values in base_topics.items()}

# Fine-tune the topic representations
@@ -234,271 +241,6 @@ def _extract_words_per_topic(

return topics

@override
def fit(self,
documents: List[str],
embeddings: np.ndarray = None,
images: List[str] = None,
y: Union[List[int], np.ndarray] = None,):
"""Fit the models on a collection of documents, generate topics,
and return the probabilities and topic per document.
Arguments:
documents: A list of documents to fit on
embeddings: Pre-trained document embeddings. These can be used
instead of the sentence-transformer model
images: A list of paths to the images to fit on or the images themselves
y: The target class for (semi)-supervised modeling. Use -1 if no class for a
specific instance is specified.
Returns:
predictions: Topic predictions for each document
probabilities: The probability of the assigned topic per document.
If `calculate_probabilities` in BERTopic is set to True, then
it calculates the probabilities of all topics across all documents
instead of only the assigned topic. This, however, slows down
computation and may increase memory usage.
Examples:
```python
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
docs = fetch_20newsgroups(subset='all')['data']
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs)
```
If you want to use your own embeddings, use it as follows:
```python
from bertopic import BERTopic
from sklearn.datasets import fetch_20newsgroups
from sentence_transformers import SentenceTransformer
# Create embeddings
docs = fetch_20newsgroups(subset='all')['data']
sentence_model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = sentence_model.encode(docs, show_progress_bar=True)
# Create topic model
topic_model = BERTopic()
topics, probs = topic_model.fit_transform(docs, embeddings)
```
"""
if documents is not None:
check_documents_type(documents)
check_embeddings_shape(embeddings, documents)

doc_ids = range(len(documents)) if documents is not None else range(len(images))
documents = pd.DataFrame({"Document": documents, "ID": doc_ids, "Topic": None, "Image": images})

# Extract embeddings
if embeddings is None:
logger.info("Embedding - Transforming documents to embeddings.")
self.embedding_model = select_backend(self.embedding_model, language=self.language, verbose=self.verbose)
embeddings = self._extract_embeddings(
documents.Document.values.tolist(),
images=images,
method="document",
verbose=self.verbose,
)
logger.info("Embedding - Completed \u2713")
else:
if self.embedding_model is not None:
self.embedding_model = select_backend(
self.embedding_model, language=self.language, verbose=self.verbose
)

# Guided Topic Modeling
if self.seed_topic_list is not None and self.embedding_model is not None:
y, embeddings = self._guided_topic_modeling(embeddings)

# Reduce dimensionality and fit UMAP model
umap_embeddings = self._reduce_dimensionality(embeddings, y)

# Zero-shot Topic Modeling
if self._is_zeroshot():
documents, embeddings, assigned_documents, assigned_embeddings = self._zeroshot_topic_modeling(
documents, embeddings
)

# Filter UMAP embeddings to only non-assigned embeddings to be used for clustering
if len(documents) > 0:
umap_embeddings = self.umap_model.transform(embeddings)

if len(documents) > 0:
# Cluster reduced embeddings
documents, probabilities = self._cluster_embeddings(umap_embeddings, documents, y=y)
if self._is_zeroshot() and len(assigned_documents) > 0:
documents, embeddings = self._combine_zeroshot_topics(
documents, embeddings, assigned_documents, assigned_embeddings
)
else:
# All documents match zero-shot topics
documents = assigned_documents
embeddings = assigned_embeddings

# Sort and Map Topic IDs by their frequency
if not self.nr_topics:
documents = self._sort_mappings_by_frequency(documents)

# Create documents from images if we have images only
if documents.Document.values[0] is None:
custom_documents = self._images_to_text(documents, embeddings)

# Extract topics by calculating c-TF-IDF
self._extract_topics(custom_documents, embeddings=embeddings)
self._create_topic_vectors(documents=documents, embeddings=embeddings)

# Reduce topics
if self.nr_topics:
custom_documents = self._reduce_topics(custom_documents)

# Save the top representative documents per topic
self._save_representative_docs(custom_documents)
else:
# Extract topics by calculating c-TF-IDF
self._extract_topics(documents, embeddings=embeddings, verbose=self.verbose)

# Reduce topics
if self.nr_topics:
documents = self._reduce_topics(documents)

# Save the top 3 most representative documents per topic
self._save_representative_docs(documents)

# In the case of zero-shot topics, probability will come from cosine similarity,
# and the HDBSCAN model will be removed
if self._is_zeroshot() and len(assigned_documents) > 0:
self.hdbscan_model = BaseCluster()
sim_matrix = cosine_similarity(embeddings, np.array(self.topic_embeddings_))

if self.calculate_probabilities:
self.probabilities_ = sim_matrix
else:
self.probabilities_ = np.max(sim_matrix, axis=1)
else:
self.probabilities_ = self._map_probabilities(probabilities, original_topics=True)
predictions = documents.Topic.to_list()

# TODO: save bertopic only results

return predictions, self.probabilities_

@override
def update_topics(
self,
docs: List[str],
images: List[str] = None,
topics: List[int] = None,
top_n_words: int = 10,
n_gram_range: tuple[int, int] = None,
vectorizer_model: CountVectorizer = None,
ctfidf_model: ClassTfidfTransformer = None,
representation_model: BaseRepresentation = None,
):
"""Updates the topic representation by recalculating c-TF-IDF with the new
parameters as defined in this function.
When you have trained a model and viewed the topics and the words that represent them,
you might not be satisfied with the representation. Perhaps you forgot to remove
stop_words or you want to try out a different n_gram_range. This function allows you
to update the topic representation after they have been formed.
Arguments:
docs: The documents you used when calling either `fit` or `fit_transform`
images: The images you used when calling either `fit` or `fit_transform`
topics: A list of topics where each topic is related to a document in `docs`.
Use this variable to change or map the topics.
NOTE: Using a custom list of topic assignments may lead to errors if
topic reduction techniques are used afterwards. Make sure that
manually assigning topics is the last step in the pipeline
top_n_words: The number of words per topic to extract. Setting this
too high can negatively impact topic embeddings as topics
are typically best represented by at most 10 words.
n_gram_range: The n-gram range for the CountVectorizer.
vectorizer_model: Pass in your own CountVectorizer from scikit-learn
ctfidf_model: Pass in your own c-TF-IDF model to update the representations
representation_model: Pass in a model that fine-tunes the topic representations
calculated through c-TF-IDF. Models from `bertopic.representation`
are supported.
Examples:
In order to update the topic representation, you will need to first fit the topic
model and extract topics from them. Based on these, you can update the representation:
```python
topic_model.update_topics(docs, n_gram_range=(2, 3))
```
You can also use a custom vectorizer to update the representation:
```python
from sklearn.feature_extraction.text import CountVectorizer
vectorizer_model = CountVectorizer(ngram_range=(1, 2), stop_words="english")
topic_model.update_topics(docs, vectorizer_model=vectorizer_model)
```
You can also use this function to change or map the topics to something else.
You can update them as follows:
```python
topic_model.update_topics(docs, my_updated_topics)
```
"""
check_documents_type(docs)
check_is_fitted(self)
if not n_gram_range:
n_gram_range = self.n_gram_range

if top_n_words > 100:
logger.warning(
"Note that extracting more than 100 words from a sparse " "can slow down computation quite a bit."
)
self.top_n_words = top_n_words
self.vectorizer_model = vectorizer_model or CountVectorizer(ngram_range=n_gram_range)
self.ctfidf_model = ctfidf_model or ClassTfidfTransformer()
self.representation_model = representation_model

if topics is None:
topics = self.topics_
else:
logger.warning(
"Using a custom list of topic assignments may lead to errors if "
"topic reduction techniques are used afterwards. Make sure that "
"manually assigning topics is the last step in the pipeline."
"Note that topic embeddings will also be created through weighted"
"c-TF-IDF embeddings instead of centroid embeddings."
)

documents = pd.DataFrame({"Document": docs, "Topic": topics, "ID": range(len(docs)), "Image": images})
documents_per_topic = documents.groupby(["Topic"], as_index=False).agg({"Document": " ".join})

# Update topic sizes and assignments
self._update_topic_size(documents)

# Extract words and update topic labels
self.c_tf_idf_, words = self._c_tf_idf(documents_per_topic)
self.topic_representations_ = self._extract_words_per_topic(words, documents)

# Update topic vectors
if set(topics) != self.topics_:
# Remove outlier topic embedding if all that has changed is the outlier class
same_position = all(
[
True if old_topic == new_topic else False
for old_topic, new_topic in zip(self.topics_, topics)
if old_topic != -1
]
)
if same_position and -1 not in topics and -1 in self.topics_:
self.topic_embeddings_ = self.topic_embeddings_[1:]
else:
self._create_topic_vectors()



def visualize_document_datamap(
self,
docs: List[str],
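As context for the retained override, here is a minimal sketch (assuming the stock `bertopic` API and the `BERTopicModel` subclass shown in the diff) of the pattern this commit keeps: only `_extract_words_per_topic` stays overridden, and it snapshots the top-n words per topic so the intermediate BERTopic results can be written out later. Unlike the actual override above, which captures the words before representation fine-tuning, this sketch delegates to the parent first and captures them afterwards.

```python
from typing import List, Mapping, Tuple

import pandas as pd
from bertopic import BERTopic
from scipy.sparse import csr_matrix


class BERTopicModel(BERTopic):
    """Sketch: keep only the override that snapshots intermediate top words."""

    def _extract_words_per_topic(
        self,
        words: List[str],
        documents: pd.DataFrame,
        c_tf_idf: csr_matrix = None,
        calculate_aspects: bool = True,
    ) -> Mapping[str, List[Tuple[str, float]]]:
        # Delegate to stock BERTopic, then keep a copy of the top-n words per
        # topic. Note: the real override in this repo captures the words
        # *before* representation fine-tuning; capturing them afterwards is a
        # simplification for this sketch.
        topics = super()._extract_words_per_topic(words, documents, c_tf_idf, calculate_aspects)
        self.bertopic_representative_words = {
            label: values[: self.top_n_words] for label, values in topics.items()
        }
        return topics
```

With `fit` and `update_topics` no longer overridden, training goes through stock BERTopic unchanged, and the snapshot is available afterwards as `model.bertopic_representative_words`.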
