19
19
from sklearn .preprocessing import normalize
20
20
from scipy .special import softmax
21
21
22
+ try :
23
+ from cuml .manifold .umap import UMAP as cuUMAP
24
+
25
+ _HAVE_CUMAP = True
26
+ except ImportError :
27
+ _HAVE_CUMAP = False
28
+
22
29
try :
23
30
import hnswlib
24
31
@@ -364,6 +371,12 @@ class Top2Vec:
364
371
umap_args: dict (Optional, default None)
365
372
Pass custom arguments to UMAP.
366
373
374
+ gpu_umap: bool (default False)
375
+ If True, UMAP will use the RAPIDS cuML library to perform the
376
+ dimensionality reduction on the GPU, which can significantly reduce
377
+ computation time. To install RAPIDS cuML, follow the
378
+ instructions here: https://docs.rapids.ai/install
379
+
367
380
hdbscan_args: dict (Optional, default None)
368
381
Pass custom arguments to HDBSCAN.
369
382
@@ -377,7 +390,7 @@ def __init__(self,
377
390
topic_merge_delta = 0.1 ,
378
391
ngram_vocab = False ,
379
392
ngram_vocab_args = None ,
380
- embedding_model = 'doc2vec ' ,
393
+ embedding_model = 'universal-sentence-encoder-multilingual ' ,
381
394
embedding_model_path = None ,
382
395
embedding_batch_size = 32 ,
383
396
split_documents = False ,
@@ -395,6 +408,7 @@ def __init__(self,
395
408
tokenizer = None ,
396
409
use_embedding_model_tokenizer = False ,
397
410
umap_args = None ,
411
+ gpu_umap = False ,
398
412
hdbscan_args = None ,
399
413
verbose = True
400
414
):
@@ -663,7 +677,10 @@ def return_doc(doc):
663
677
else :
664
678
raise ValueError (f"{ embedding_model } is an invalid embedding model." )
665
679
666
- self .compute_topics (umap_args = umap_args , hdbscan_args = hdbscan_args , topic_merge_delta = topic_merge_delta )
680
+ self .compute_topics (umap_args = umap_args ,
681
+ hdbscan_args = hdbscan_args ,
682
+ topic_merge_delta = topic_merge_delta ,
683
+ gpu_umap = gpu_umap )
667
684
668
685
# initialize document indexing variables
669
686
self .document_index = None
@@ -1218,7 +1235,7 @@ def _validate_vector(self, vector):
1218
1235
if not vector .shape [0 ] == vec_size :
1219
1236
raise ValueError (f"Vector needs to be of { vec_size } dimensions." )
1220
1237
1221
- def compute_topics (self , umap_args = None , hdbscan_args = None , topic_merge_delta = 0.1 ):
1238
+ def compute_topics (self , umap_args = None , hdbscan_args = None , topic_merge_delta = 0.1 , gpu_umap = False ):
1222
1239
"""
1223
1240
Computes topics from current document vectors.
1224
1241
@@ -1243,6 +1260,12 @@ def compute_topics(self, umap_args=None, hdbscan_args=None, topic_merge_delta=0.
1243
1260
Merges topic vectors which have a cosine distance smaller than
1244
1261
topic_merge_delta using dbscan. The epsilon parameter of dbscan is
1245
1262
set to the topic_merge_delta.
1263
+
1264
+ gpu_umap: bool (default False)
1265
+ If True, UMAP will use the RAPIDS cuML library to perform the
1266
+ dimensionality reduction on the GPU, which can significantly reduce
1267
+ computation time. To install RAPIDS cuML, follow the
1268
+ instructions here: https://docs.rapids.ai/install
1246
1269
"""
1247
1270
1248
1271
# create 5D embeddings of documents
@@ -1253,7 +1276,10 @@ def compute_topics(self, umap_args=None, hdbscan_args=None, topic_merge_delta=0.
1253
1276
'n_components' : 5 ,
1254
1277
'metric' : 'cosine' }
1255
1278
1256
- umap_model = umap .UMAP (** umap_args ).fit (self .document_vectors )
1279
+ if gpu_umap :
1280
+ pass
1281
+ else :
1282
+ umap_model = umap .UMAP (** umap_args ).fit (self .document_vectors )
1257
1283
1258
1284
# find dense areas of document vectors
1259
1285
logger .info ('Finding dense areas of documents' )
0 commit comments