Skip to content

Commit 0bcdb43

Browse files
committed
gpu umap
1 parent d70dab7 commit 0bcdb43

File tree

1 file changed

+30
-4
lines changed

1 file changed

+30
-4
lines changed

top2vec/Top2Vec.py

+30-4
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,13 @@
1919
from sklearn.preprocessing import normalize
2020
from scipy.special import softmax
2121

22+
try:
23+
from cuml.manifold.umap import UMAP as cuUMAP
24+
25+
_HAVE_CUMAP = True
26+
except ImportError:
27+
_HAVE_CUMAP = False
28+
2229
try:
2330
import hnswlib
2431

@@ -364,6 +371,12 @@ class Top2Vec:
364371
umap_args: dict (Optional, default None)
365372
Pass custom arguments to UMAP.
366373
374+
gpu_umap: bool (default False)
375+
If True, UMAP will use the RAPIDS cuML library to perform the
376+
dimensionality reduction. This will lead to a significant speedup
377+
in the computation time. To install RAPIDS cuML, follow the
378+
instructions here: https://docs.rapids.ai/install
379+
367380
hdbscan_args: dict (Optional, default None)
368381
Pass custom arguments to HDBSCAN.
369382
@@ -377,7 +390,7 @@ def __init__(self,
377390
topic_merge_delta=0.1,
378391
ngram_vocab=False,
379392
ngram_vocab_args=None,
380-
embedding_model='doc2vec',
393+
embedding_model='universal-sentence-encoder-multilingual',
381394
embedding_model_path=None,
382395
embedding_batch_size=32,
383396
split_documents=False,
@@ -395,6 +408,7 @@ def __init__(self,
395408
tokenizer=None,
396409
use_embedding_model_tokenizer=False,
397410
umap_args=None,
411+
gpu_umap=False,
398412
hdbscan_args=None,
399413
verbose=True
400414
):
@@ -663,7 +677,10 @@ def return_doc(doc):
663677
else:
664678
raise ValueError(f"{embedding_model} is an invalid embedding model.")
665679

666-
self.compute_topics(umap_args=umap_args, hdbscan_args=hdbscan_args, topic_merge_delta=topic_merge_delta)
680+
self.compute_topics(umap_args=umap_args,
681+
hdbscan_args=hdbscan_args,
682+
topic_merge_delta=topic_merge_delta,
683+
gpu_umap=gpu_umap)
667684

668685
# initialize document indexing variables
669686
self.document_index = None
@@ -1218,7 +1235,7 @@ def _validate_vector(self, vector):
12181235
if not vector.shape[0] == vec_size:
12191236
raise ValueError(f"Vector needs to be of {vec_size} dimensions.")
12201237

1221-
def compute_topics(self, umap_args=None, hdbscan_args=None, topic_merge_delta=0.1):
1238+
def compute_topics(self, umap_args=None, hdbscan_args=None, topic_merge_delta=0.1, gpu_umap=False):
12221239
"""
12231240
Computes topics from current document vectors.
12241241
@@ -1243,6 +1260,12 @@ def compute_topics(self, umap_args=None, hdbscan_args=None, topic_merge_delta=0.
12431260
Merges topic vectors which have a cosine distance smaller than
12441261
topic_merge_delta using dbscan. The epsilon parameter of dbscan is
12451262
set to the topic_merge_delta.
1263+
1264+
gpu_umap: bool (default False)
1265+
If True, UMAP will use the RAPIDS cuML library to perform the
1266+
dimensionality reduction. This will lead to a significant speedup
1267+
in the computation time. To install RAPIDS cuML, follow the
1268+
instructions here: https://docs.rapids.ai/install
12461269
"""
12471270

12481271
# create 5D embeddings of documents
@@ -1253,7 +1276,10 @@ def compute_topics(self, umap_args=None, hdbscan_args=None, topic_merge_delta=0.
12531276
'n_components': 5,
12541277
'metric': 'cosine'}
12551278

1256-
umap_model = umap.UMAP(**umap_args).fit(self.document_vectors)
1279+
if gpu_umap:
1280+
pass
1281+
else:
1282+
umap_model = umap.UMAP(**umap_args).fit(self.document_vectors)
12571283

12581284
# find dense areas of document vectors
12591285
logger.info('Finding dense areas of documents')

0 commit comments

Comments
 (0)