@@ -175,6 +175,11 @@ class Top2Vec:
             Ignores all words with total frequency lower than this. For smaller
             corpora a smaller min_count will be necessary.
 
+        topic_merge_delta: float (default 0.1)
+            Merges topic vectors which have a cosine distance smaller than
+            topic_merge_delta using dbscan. The epsilon parameter of dbscan is
+            set to topic_merge_delta.
+
         ngram_vocab: bool (Optional, default False)
             Add phrases to topic descriptions.
@@ -369,6 +374,7 @@ class Top2Vec:
     def __init__(self,
                  documents,
                  min_count=50,
+                 topic_merge_delta=0.1,
                  ngram_vocab=False,
                  ngram_vocab_args=None,
                  embedding_model='doc2vec',
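
The new parameter slots in at construction time. A minimal usage sketch, assuming the top2vec package is installed (the corpus and threshold value here are illustrative):

    from top2vec import Top2Vec

    docs = ["some document text"] * 1000  # toy corpus; any list of strings works

    # A larger merge threshold than the 0.1 default collapses more
    # near-duplicate topic vectors into a single topic.
    model = Top2Vec(documents=docs,
                    min_count=50,
                    topic_merge_delta=0.2,
                    embedding_model='doc2vec')
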
@@ -609,7 +615,7 @@ def return_doc(doc):
 
         # embed words
         self.word_indexes = dict(zip(self.vocab, range(len(self.vocab))))
-        self.word_vectors = self._l2_normalize(np.array(self.embed(self.vocab)))
+        self.word_vectors = self._embed_documents(self.vocab, embedding_batch_size)
 
         # embed documents
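
Routing word vectors through _embed_documents gives the vocabulary the same batched, L2-normalized path that documents use, instead of one unbatched embed call. A sketch of the batching idea, assuming embed is any callable mapping a list of strings to a 2-D array (embed_in_batches is a hypothetical helper, not part of the library):

    import numpy as np

    def embed_in_batches(texts, embed, batch_size=32):
        # Embed a long list of strings in fixed-size chunks to bound memory use.
        chunks = [texts[i:i + batch_size] for i in range(0, len(texts), batch_size)]
        return np.vstack([np.array(embed(chunk)) for chunk in chunks])
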
@@ -657,56 +663,7 @@ def return_doc(doc):
         else:
             raise ValueError(f"{embedding_model} is an invalid embedding model.")
 
-        # create 5D embeddings of documents
-        logger.info('Creating lower dimension embedding of documents')
-
-        if umap_args is None:
-            umap_args = {'n_neighbors': 15,
-                         'n_components': 5,
-                         'metric': 'cosine'}
-
-        umap_model = umap.UMAP(**umap_args).fit(self.document_vectors)
-
-        # find dense areas of document vectors
-        logger.info('Finding dense areas of documents')
-
-        if hdbscan_args is None:
-            hdbscan_args = {'min_cluster_size': 15,
-                            'metric': 'euclidean',
-                            'cluster_selection_method': 'eom'}
-
-        cluster = hdbscan.HDBSCAN(**hdbscan_args).fit(umap_model.embedding_)
-
-        # calculate topic vectors from dense areas of documents
-        logger.info('Finding topics')
-
-        # create topic vectors
-        self._create_topic_vectors(cluster.labels_)
-
-        # deduplicate topics
-        self._deduplicate_topics()
-
-        # find topic words and scores
-        self.topic_words, self.topic_word_scores = self._find_topic_words_and_scores(topic_vectors=self.topic_vectors)
-
-        # assign documents to topic
-        self.doc_top, self.doc_dist = self._calculate_documents_topic(self.topic_vectors,
-                                                                      self.document_vectors)
-
-        # calculate topic sizes
-        self.topic_sizes = self._calculate_topic_sizes(hierarchy=False)
-
-        # re-order topics
-        self._reorder_topics(hierarchy=False)
-
-        # initialize variables for hierarchical topic reduction
-        self.topic_vectors_reduced = None
-        self.doc_top_reduced = None
-        self.doc_dist_reduced = None
-        self.topic_sizes_reduced = None
-        self.topic_words_reduced = None
-        self.topic_word_scores_reduced = None
-        self.hierarchy = None
+        self.compute_topics(umap_args=umap_args, hdbscan_args=hdbscan_args, topic_merge_delta=topic_merge_delta)
 
         # initialize document indexing variables
         self.document_index = None
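
The block removed above is not deleted outright; it moves, essentially unchanged, into the new compute_topics method further down, so __init__ now just delegates. The extracted pipeline is the usual Top2Vec recipe: UMAP reduces document vectors to 5 dimensions, HDBSCAN finds dense clusters, and each cluster's centroid becomes a topic vector. A stand-alone sketch of that recipe, assuming umap-learn and hdbscan are installed and doc_vectors is an (n_docs, dim) numpy array (the function name is hypothetical):

    import numpy as np
    import umap
    import hdbscan

    def topics_from_vectors(doc_vectors):
        # Reduce to 5 dimensions; cosine matches the embedding space.
        embedding = umap.UMAP(n_neighbors=15, n_components=5,
                              metric='cosine').fit_transform(doc_vectors)

        # Dense regions of the 5D space are candidate topics; label -1 is noise.
        labels = hdbscan.HDBSCAN(min_cluster_size=15, metric='euclidean',
                                 cluster_selection_method='eom').fit_predict(embedding)

        # A topic vector is the centroid of its cluster's document vectors.
        return np.vstack([doc_vectors[labels == label].mean(axis=0)
                          for label in np.unique(labels) if label != -1])
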
@@ -841,7 +798,7 @@ def _embed_documents(self, train_corpus, batch_size):
             document_vectors = self._l2_normalize(np.array(np.vstack(document_vectors)))
 
         else:
-            document_vectors = self.embed(train_corpus, batch_size=batch_size)
+            document_vectors = self._l2_normalize(self.embed(train_corpus, batch_size=batch_size))
 
         return document_vectors
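
Normalizing here fixes a subtle inconsistency: downstream similarity code treats dot products as cosine similarities, which only holds when every vector has unit length. The other branch of this function already normalized; this change makes the batched path match. A minimal demonstration of the property (l2_normalize is a stand-alone stand-in for the private _l2_normalize helper):

    import numpy as np

    def l2_normalize(vectors):
        # Divide each row by its Euclidean norm; dot products between
        # unit-length rows then equal cosine similarities.
        return vectors / np.linalg.norm(vectors, axis=1, keepdims=True)

    v = l2_normalize(np.array([[3.0, 4.0], [1.0, 0.0]]))
    print(np.linalg.norm(v, axis=1))  # [1. 1.]
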
@@ -859,9 +816,9 @@ def _create_topic_vectors(self, cluster_labels):
                             np.vstack([self.document_vectors[np.where(cluster_labels == label)[0]]
                                       .mean(axis=0) for label in unique_labels]))
 
-    def _deduplicate_topics(self):
+    def _deduplicate_topics(self, topic_merge_delta):
         core_samples, labels = dbscan(X=self.topic_vectors,
-                                      eps=0.1,
+                                      eps=topic_merge_delta,
                                       min_samples=2,
                                       metric="cosine")
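
Deduplication clusters the topic vectors themselves: any group of vectors within cosine distance topic_merge_delta of one another is flagged as duplicates, which the method then collapses into single topics. A small sketch of the idea using scikit-learn's DBSCAN class (the vectors are toy values):

    import numpy as np
    from sklearn.cluster import DBSCAN

    # Three unit vectors; the first two are nearly identical topics.
    topic_vectors = np.array([[1.0, 0.0],
                              [0.999, 0.045],
                              [0.0, 1.0]])

    # eps is the cosine-distance threshold (topic_merge_delta);
    # min_samples=2 means any pair of near-duplicates forms a cluster.
    labels = DBSCAN(eps=0.1, min_samples=2,
                    metric="cosine").fit_predict(topic_vectors)
    print(labels)  # [ 0  0 -1]: the first two merge, the third stays unique
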
@@ -1261,6 +1218,84 @@ def _validate_vector(self, vector):
         if not vector.shape[0] == vec_size:
             raise ValueError(f"Vector needs to be of {vec_size} dimensions.")
 
+    def compute_topics(self, umap_args=None, hdbscan_args=None, topic_merge_delta=0.1):
+        """
+        Computes topics from current document vectors.
+
+        New topic vectors will be computed along with new topic descriptions.
+        Documents will be reassigned to new topics. If topics were previously
+        reduced they will be removed. You will need to call
+        hierarchical_topic_reduction to recompute them.
+
+        This is useful for experimenting with different umap and hdbscan
+        parameters and also if many new documents were added since
+        training the initial model.
+
+        Parameters
+        ----------
+        umap_args: dict (Optional, default None)
+            Pass custom arguments to UMAP.
+
+        hdbscan_args: dict (Optional, default None)
+            Pass custom arguments to HDBSCAN.
+
+        topic_merge_delta: float (default 0.1)
+            Merges topic vectors which have a cosine distance smaller than
+            topic_merge_delta using dbscan. The epsilon parameter of dbscan is
+            set to topic_merge_delta.
+        """
+
+        # create 5D embeddings of documents
+        logger.info('Creating lower dimension embedding of documents')
+
+        if umap_args is None:
+            umap_args = {'n_neighbors': 15,
+                         'n_components': 5,
+                         'metric': 'cosine'}
+
+        umap_model = umap.UMAP(**umap_args).fit(self.document_vectors)
+
+        # find dense areas of document vectors
+        logger.info('Finding dense areas of documents')
+
+        if hdbscan_args is None:
+            hdbscan_args = {'min_cluster_size': 15,
+                            'metric': 'euclidean',
+                            'cluster_selection_method': 'eom'}
+
+        cluster = hdbscan.HDBSCAN(**hdbscan_args).fit(umap_model.embedding_)
+
+        # calculate topic vectors from dense areas of documents
+        logger.info('Finding topics')
+
+        # create topic vectors
+        self._create_topic_vectors(cluster.labels_)
+
+        # deduplicate topics
+        self._deduplicate_topics(topic_merge_delta)
+
+        # find topic words and scores
+        self.topic_words, self.topic_word_scores = self._find_topic_words_and_scores(topic_vectors=self.topic_vectors)
+
+        # assign documents to topic
+        self.doc_top, self.doc_dist = self._calculate_documents_topic(self.topic_vectors,
+                                                                      self.document_vectors)
+
+        # calculate topic sizes
+        self.topic_sizes = self._calculate_topic_sizes(hierarchy=False)
+
+        # re-order topics
+        self._reorder_topics(hierarchy=False)
+
+        # initialize variables for hierarchical topic reduction
+        self.topic_vectors_reduced = None
+        self.doc_top_reduced = None
+        self.doc_dist_reduced = None
+        self.topic_sizes_reduced = None
+        self.topic_words_reduced = None
+        self.topic_word_scores_reduced = None
+        self.hierarchy = None
+
     def index_document_vectors(self, ef_construction=200, M=64):
         """
         Creates an index of the document vectors using hnswlib. This will
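
With compute_topics public, topics can be rebuilt on an already-trained model without re-embedding anything. A usage sketch, assuming model is an existing Top2Vec instance (all parameter values are illustrative):

    # Rebuild topics with a tighter clustering and a looser merge threshold.
    model.compute_topics(umap_args={'n_neighbors': 10,
                                    'n_components': 5,
                                    'metric': 'cosine'},
                         hdbscan_args={'min_cluster_size': 30,
                                       'metric': 'euclidean',
                                       'cluster_selection_method': 'eom'},
                         topic_merge_delta=0.2)

    # compute_topics clears any reduced hierarchy, so recompute it if needed.
    model.hierarchical_topic_reduction(num_topics=20)
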
@@ -1596,7 +1631,7 @@ def delete_documents(self, doc_ids):
         # update index
         if self.documents_indexed:
             # delete doc_ids from index
-            index_ids = [self.doc_id2index_id(doc_id) for doc_id in doc_ids]
+            index_ids = [self.doc_id2index_id[doc_id] for doc_id in doc_ids]
             for index_id in index_ids:
                 self.document_index.mark_deleted(index_id)
            # update index_id and doc_ids
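
This last hunk is a genuine bugfix: doc_id2index_id is a plain dict (it maps document ids to positions in the hnswlib index), so the old call syntax raised a TypeError whenever documents were deleted from an indexed model. A minimal reproduction (the mapping values are toy data):

    mapping = {"doc-1": 0, "doc-7": 1}  # stand-in for doc_id2index_id

    print(mapping["doc-7"])  # 1 -- subscripting a dict works
    try:
        mapping("doc-7")     # the old code path
    except TypeError as err:
        print(err)           # 'dict' object is not callable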