
Commit b84cbda

adding compute_topics method and bug fixes

Parent: 3a13fac

File tree

5 files changed (+111, -58 lines)


docs/conf.py (+1, -1)

@@ -29,7 +29,7 @@
 author = 'Dimo Angelov'

 # The full version, including alpha/beta/rc tags
-release = '1.0.28 '
+release = '1.0.29'


 # -- General configuration ---------------------------------------------------

setup.py (+1, -1)

@@ -6,7 +6,7 @@
 setuptools.setup(
     name="top2vec",
     packages=["top2vec"],
-    version="1.0.28",
+    version="1.0.29",
     author="Dimo Angelov",
     author_email="dimo.angelov@gmail.com",
     description="Top2Vec learns jointly embedded topic, document and word vectors.",

top2vec/Top2Vec.py (+90, -55)

@@ -175,6 +175,11 @@ class Top2Vec:
         Ignores all words with total frequency lower than this. For smaller
         corpora a smaller min_count will be necessary.

+    topic_merge_delta: float (default 0.1)
+        Merges topic vectors which have a cosine distance smaller than
+        topic_merge_delta using dbscan. The epsilon parameter of dbscan is
+        set to the topic_merge_delta.
+
     ngram_vocab: bool (Optional, default False)
         Add phrases to topic descriptions.

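For intuition, the merge step this parameter controls can be sketched with the same sklearn.cluster.dbscan helper the file calls below; the random vectors here are a hypothetical stand-in for real topic vectors:

    import numpy as np
    from sklearn.cluster import dbscan

    # hypothetical stand-in for model topic vectors, unit-normalized
    rng = np.random.default_rng(0)
    topic_vectors = rng.random((10, 300))
    topic_vectors /= np.linalg.norm(topic_vectors, axis=1, keepdims=True)

    # eps plays the role of topic_merge_delta: topic pairs within this cosine
    # distance land in the same dbscan cluster and become merge candidates
    core_samples, labels = dbscan(X=topic_vectors, eps=0.1,
                                  min_samples=2, metric="cosine")
    # label -1 marks a topic with no near-duplicate; topics sharing a
    # non-negative label are treated as duplicates of one another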
@@ -369,6 +374,7 @@ class Top2Vec:
     def __init__(self,
                  documents,
                  min_count=50,
+                 topic_merge_delta=0.1,
                  ngram_vocab=False,
                  ngram_vocab_args=None,
                  embedding_model='doc2vec',
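A hypothetical construction call exercising the new parameter (the toy corpus is illustrative only; real corpora should be larger and more varied):

    from top2vec import Top2Vec

    docs = ["the cat sat on the mat"] * 500 + ["stocks rallied after earnings"] * 500
    model = Top2Vec(documents=docs,
                    min_count=50,
                    topic_merge_delta=0.1,  # new in this commit: eps for the dbscan merge step
                    embedding_model='doc2vec')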
@@ -609,7 +615,7 @@ def return_doc(doc):

             # embed words
             self.word_indexes = dict(zip(self.vocab, range(len(self.vocab))))
-            self.word_vectors = self._l2_normalize(np.array(self.embed(self.vocab)))
+            self.word_vectors = self._embed_documents(self.vocab, embedding_batch_size)

             # embed documents

@@ -657,56 +663,7 @@ def return_doc(doc):
         else:
             raise ValueError(f"{embedding_model} is an invalid embedding model.")

-        # create 5D embeddings of documents
-        logger.info('Creating lower dimension embedding of documents')
-
-        if umap_args is None:
-            umap_args = {'n_neighbors': 15,
-                         'n_components': 5,
-                         'metric': 'cosine'}
-
-        umap_model = umap.UMAP(**umap_args).fit(self.document_vectors)
-
-        # find dense areas of document vectors
-        logger.info('Finding dense areas of documents')
-
-        if hdbscan_args is None:
-            hdbscan_args = {'min_cluster_size': 15,
-                            'metric': 'euclidean',
-                            'cluster_selection_method': 'eom'}
-
-        cluster = hdbscan.HDBSCAN(**hdbscan_args).fit(umap_model.embedding_)
-
-        # calculate topic vectors from dense areas of documents
-        logger.info('Finding topics')
-
-        # create topic vectors
-        self._create_topic_vectors(cluster.labels_)
-
-        # deduplicate topics
-        self._deduplicate_topics()
-
-        # find topic words and scores
-        self.topic_words, self.topic_word_scores = self._find_topic_words_and_scores(topic_vectors=self.topic_vectors)
-
-        # assign documents to topic
-        self.doc_top, self.doc_dist = self._calculate_documents_topic(self.topic_vectors,
-                                                                      self.document_vectors)
-
-        # calculate topic sizes
-        self.topic_sizes = self._calculate_topic_sizes(hierarchy=False)
-
-        # re-order topics
-        self._reorder_topics(hierarchy=False)
-
-        # initialize variables for hierarchical topic reduction
-        self.topic_vectors_reduced = None
-        self.doc_top_reduced = None
-        self.doc_dist_reduced = None
-        self.topic_sizes_reduced = None
-        self.topic_words_reduced = None
-        self.topic_word_scores_reduced = None
-        self.hierarchy = None
+        self.compute_topics(umap_args=umap_args, hdbscan_args=hdbscan_args, topic_merge_delta=topic_merge_delta)

         # initialize document indexing variables
         self.document_index = None
@@ -841,7 +798,7 @@ def _embed_documents(self, train_corpus, batch_size):
             document_vectors = self._l2_normalize(np.array(np.vstack(document_vectors)))

         else:
-            document_vectors = self.embed(train_corpus, batch_size=batch_size)
+            document_vectors = self._l2_normalize(self.embed(train_corpus, batch_size=batch_size))

         return document_vectors

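The fix above routes externally embedded document vectors through L2 normalization, matching the doc2vec and word-vector paths shown earlier. Assuming _l2_normalize follows the usual row-wise convention, a hypothetical equivalent looks like:

    import numpy as np

    def l2_normalize_rows(vectors):
        # scale each row to unit length so cosine similarity reduces to a dot product
        norms = np.linalg.norm(vectors, axis=1, keepdims=True)
        return vectors / norms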
@@ -859,9 +816,9 @@ def _create_topic_vectors(self, cluster_labels):
             np.vstack([self.document_vectors[np.where(cluster_labels == label)[0]]
                       .mean(axis=0) for label in unique_labels]))

-    def _deduplicate_topics(self):
+    def _deduplicate_topics(self, topic_merge_delta):
         core_samples, labels = dbscan(X=self.topic_vectors,
-                                      eps=0.1,
+                                      eps=topic_merge_delta,
                                       min_samples=2,
                                       metric="cosine")

@@ -1261,6 +1218,84 @@ def _validate_vector(self, vector):
         if not vector.shape[0] == vec_size:
             raise ValueError(f"Vector needs to be of {vec_size} dimensions.")

+    def compute_topics(self, umap_args=None, hdbscan_args=None, topic_merge_delta=0.1):
+        """
+        Computes topics from current document vectors.
+
+        New topic vectors will be computed along with new topic descriptions.
+        Documents will be reassigned to new topics. If topics were previously
+        reduced they will be removed. You will need to call
+        hierarchical_topic_reduction to recompute them.
+
+        This is useful for experimenting with different umap and hdbscan
+        parameters and also if many new documents were added since
+        training the initial model.
+
+        Parameters
+        ----------
+        umap_args: dict (Optional, default None)
+            Pass custom arguments to UMAP.
+
+        hdbscan_args: dict (Optional, default None)
+            Pass custom arguments to HDBSCAN.
+
+        topic_merge_delta: float (default 0.1)
+            Merges topic vectors which have a cosine distance smaller than
+            topic_merge_delta using dbscan. The epsilon parameter of dbscan is
+            set to the topic_merge_delta.
+        """
+
+        # create 5D embeddings of documents
+        logger.info('Creating lower dimension embedding of documents')
+
+        if umap_args is None:
+            umap_args = {'n_neighbors': 15,
+                         'n_components': 5,
+                         'metric': 'cosine'}
+
+        umap_model = umap.UMAP(**umap_args).fit(self.document_vectors)
+
+        # find dense areas of document vectors
+        logger.info('Finding dense areas of documents')
+
+        if hdbscan_args is None:
+            hdbscan_args = {'min_cluster_size': 15,
+                            'metric': 'euclidean',
+                            'cluster_selection_method': 'eom'}
+
+        cluster = hdbscan.HDBSCAN(**hdbscan_args).fit(umap_model.embedding_)
+
+        # calculate topic vectors from dense areas of documents
+        logger.info('Finding topics')
+
+        # create topic vectors
+        self._create_topic_vectors(cluster.labels_)
+
+        # deduplicate topics
+        self._deduplicate_topics(topic_merge_delta)
+
+        # find topic words and scores
+        self.topic_words, self.topic_word_scores = self._find_topic_words_and_scores(topic_vectors=self.topic_vectors)
+
+        # assign documents to topic
+        self.doc_top, self.doc_dist = self._calculate_documents_topic(self.topic_vectors,
+                                                                      self.document_vectors)
+
+        # calculate topic sizes
+        self.topic_sizes = self._calculate_topic_sizes(hierarchy=False)
+
+        # re-order topics
+        self._reorder_topics(hierarchy=False)
+
+        # initialize variables for hierarchical topic reduction
+        self.topic_vectors_reduced = None
+        self.doc_top_reduced = None
+        self.doc_dist_reduced = None
+        self.topic_sizes_reduced = None
+        self.topic_words_reduced = None
+        self.topic_word_scores_reduced = None
+        self.hierarchy = None
+
     def index_document_vectors(self, ef_construction=200, M=64):
         """
         Creates an index of the document vectors using hnswlib. This will
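A usage sketch for the new method on an already trained model; the parameter values below are illustrative, not recommendations:

    # model is a trained Top2Vec instance; document vectors are reused as-is
    model.compute_topics(umap_args={'n_neighbors': 15,
                                    'n_components': 5,
                                    'metric': 'cosine'},
                         hdbscan_args={'min_cluster_size': 30,  # larger -> fewer, denser topics
                                       'metric': 'euclidean',
                                       'cluster_selection_method': 'eom'},
                         topic_merge_delta=0.1)

    # any previously reduced topics are discarded; recompute them if needed
    model.hierarchical_topic_reduction(num_topics=10)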
@@ -1596,7 +1631,7 @@ def delete_documents(self, doc_ids):
         # update index
         if self.documents_indexed:
             # delete doc_ids from index
-            index_ids = [self.doc_id2index_id(doc_id) for doc_id in doc_ids]
+            index_ids = [self.doc_id2index_id[doc_id] for doc_id in doc_ids]
             for index_id in index_ids:
                 self.document_index.mark_deleted(index_id)
             # update index_id and doc_ids
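For clarity, a minimal reproduction of the bug fixed above, assuming doc_id2index_id is a plain dict mapping document ids to index ids (the values here are hypothetical):

    doc_id2index_id = {'doc_a': 0, 'doc_b': 1}

    doc_id2index_id('doc_a')   # TypeError: 'dict' object is not callable
    doc_id2index_id['doc_a']   # 0 -- the corrected subscript lookup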

top2vec/__init__.py (+1, -1)

@@ -1,3 +1,3 @@
 from top2vec.Top2Vec import Top2Vec

-__version__ = '1.0.28'
+__version__ = '1.0.29'

top2vec/tests/test_top2vec.py (+18, -0)

@@ -120,6 +120,24 @@ def test_add_documents_original(top2vec_model):
     assert num_docs_new == len(top2vec_model.documents)


+@pytest.mark.parametrize('top2vec_model', models)
+def test_compute_topics(top2vec_model):
+    top2vec_model.compute_topics()
+
+    num_topics = top2vec_model.get_num_topics()
+    words, word_scores, topic_nums = top2vec_model.get_topics()
+
+    # check that for each topic there are words, word_scores and topic_nums
+    assert len(words) == len(word_scores) == len(topic_nums) == num_topics
+
+    # check that for each word there is a score
+    assert len(words[0]) == len(word_scores[0])
+
+    # check that topic words are returned in decreasing order
+    topic_words_scores = word_scores[0]
+    assert all(topic_words_scores[i] >= topic_words_scores[i + 1] for i in range(len(topic_words_scores) - 1))
+
+
 @pytest.mark.parametrize('top2vec_model', models)
 def test_hierarchical_topic_reduction(top2vec_model):
     num_topics = top2vec_model.get_num_topics()
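The new test can be run on its own with pytest's keyword filter, e.g. pytest top2vec/tests/test_top2vec.py -k test_compute_topics; like the existing tests, it is parametrized over the prebuilt models list.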
