Skip to content

Commit 8b55182

Browse files
committed
adding query_topics
1 parent 63ca01f commit 8b55182

File tree

4 files changed

+156
-19
lines changed

4 files changed

+156
-19
lines changed

docs/conf.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
author = 'Dimo Angelov'
3030

3131
# The full version, including alpha/beta/rc tags
32-
release = '1.0.24'
32+
release = '1.0.25'
3333

3434

3535
# -- General configuration ---------------------------------------------------

setup.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
setuptools.setup(
77
name="top2vec",
88
packages=["top2vec"],
9-
version="1.0.24",
9+
version="1.0.25",
1010
author="Dimo Angelov",
1111
author_email="dimo.angelov@gmail.com",
1212
description="Top2Vec learns jointly embedded topic, document and word vectors.",

top2vec/Top2Vec.py

+153-16
Original file line numberDiff line numberDiff line change
@@ -1680,6 +1680,82 @@ def query_documents(self, query, num_docs, return_documents=True, use_index=Fals
16801680
return self.search_documents_by_vector(query_vec, num_docs, return_documents=return_documents,
16811681
use_index=use_index, ef=ef)
16821682

1683+
def query_topics(self, query, num_topics, reduced=False, tokenizer=None):
    """
    Semantic search of topics using a text query.

    The query text is embedded into the shared vector space and the topics
    closest to the resulting vector are returned. Topics are ordered by
    proximity to the vector; successive topics in the list are less
    semantically similar to the vector.

    Parameters
    ----------
    query: string
        Any sequence of text. This could be an actual question, a sentence,
        a paragraph or a document.

    num_topics: int
        Number of topics to return.

    reduced: bool (Optional, default False)
        Original topics are searched by default. If True the
        reduced topics will be searched.

    tokenizer: callable (Optional, default None)

        ** For doc2vec embedding model only **

        Override the default tokenization method. If None then
        gensim.utils.simple_preprocess will be used.

    Returns
    -------
    topics_words: array of shape (num_topics, 50)
        For each topic the top 50 words are returned, in order of semantic
        similarity to topic.

        Example:
        [['data', 'deep', 'learning' ... 'artificial'],         <Topic 0>
         ['environment', 'warming', 'climate ... 'temperature']  <Topic 1>
         ...]

    word_scores: array of shape (num_topics, 50)
        For each topic the cosine similarity scores of the top 50 words
        to the topic are returned.

        Example:
        [[0.7132, 0.6473, 0.5700 ... 0.3455],  <Topic 0>
         [0.7818, 0.7671, 0.7603 ... 0.6769]   <Topic 1>
         ...]

    topic_scores: array of float, shape(num_topics)
        For each topic the cosine similarity to the query will be
        returned.

    topic_nums: array of int, shape(num_topics)
        The unique number of every topic will be returned.
    """
    self._validate_query(query)

    if self.embedding_model != "doc2vec":
        # Universal-sentence-encoder / transformer path: embed the raw text.
        query_vec = self._embed_documents(query)[0]
    else:
        # doc2vec path: tokenize first, then infer a document vector.
        # If no tokenizer was passed, use the default.
        if tokenizer is None:
            tokenizer = default_tokenizer

        tokenized_query = tokenizer(query)

        query_vec = self.model.infer_vector(doc_words=tokenized_query,
                                            alpha=0.025,
                                            min_alpha=0.01,
                                            epochs=100)

    # Delegate the actual topic lookup to the shared vector-search method.
    return self.search_topics_by_vector(query_vec, num_topics=num_topics, reduced=reduced)
1758+
16831759
def search_documents_by_vector(self, vector, num_docs, return_documents=True, use_index=False, ef=None):
16841760
"""
16851761
Semantic search of documents using a vector.
@@ -1731,6 +1807,8 @@ def search_documents_by_vector(self, vector, num_docs, return_documents=True, us
17311807
self._validate_vector(vector)
17321808
self._validate_num_docs(num_docs)
17331809

1810+
vector = self._l2_normalize(vector)
1811+
17341812
if use_index:
17351813
self._check_document_index_status()
17361814

@@ -1794,6 +1872,10 @@ def search_words_by_vector(self, vector, num_words, use_index=False, ef=None):
17941872
the word and vector.
17951873
"""
17961874

1875+
self._validate_vector(vector)
1876+
1877+
vector = self._l2_normalize(vector)
1878+
17971879
if use_index:
17981880
self._check_word_index_status()
17991881

@@ -1815,6 +1897,76 @@ def search_words_by_vector(self, vector, num_words, use_index=False, ef=None):
18151897

18161898
return words, word_scores
18171899

1900+
def search_topics_by_vector(self, vector, num_topics, reduced=False):
    """
    Semantic search of topics using a vector.

    These are the topics closest to the vector. Topics are ordered by
    proximity to the vector. Successive topics in the list are less
    semantically similar to the vector.

    Parameters
    ----------
    vector: array of shape(vector dimension, 1)
        The vector dimension should be the same as the vectors in
        the topic_vectors variable. (i.e. model.topic_vectors.shape[1])

    num_topics: int
        Number of topics to return.

    reduced: bool (Optional, default False)
        Original topics are searched by default. If True the
        reduced topics will be searched.

    Returns
    -------
    topics_words: array of shape (num_topics, 50)
        For each topic the top 50 words are returned, in order of semantic
        similarity to topic.

        Example:
        [['data', 'deep', 'learning' ... 'artificial'],         <Topic 0>
         ['environment', 'warming', 'climate ... 'temperature']  <Topic 1>
         ...]

    word_scores: array of shape (num_topics, 50)
        For each topic the cosine similarity scores of the top 50 words
        to the topic are returned.

        Example:
        [[0.7132, 0.6473, 0.5700 ... 0.3455],  <Topic 0>
         [0.7818, 0.7671, 0.7603 ... 0.6769]   <Topic 1>
         ...]

    topic_scores: array of float, shape(num_topics)
        For each topic the cosine similarity to the search vector will be
        returned.

    topic_nums: array of int, shape(num_topics)
        The unique number of every topic will be returned.
    """
    self._validate_vector(vector)
    self._validate_num_topics(num_topics, reduced)

    # Topic vectors are unit-length; normalize the query vector so dot
    # products behave as cosine similarities.
    vector = self._l2_normalize(vector)

    if reduced:
        self._validate_hierarchical_reduction()

        topic_nums, topic_scores = self._search_vectors_by_vector(self.topic_vectors_reduced,
                                                                  vector, num_topics)
        topic_words = [self.topic_words_reduced[topic] for topic in topic_nums]
        word_scores = [self.topic_word_scores_reduced[topic] for topic in topic_nums]
    else:
        topic_nums, topic_scores = self._search_vectors_by_vector(self.topic_vectors,
                                                                  vector, num_topics)
        topic_words = [self.topic_words[topic] for topic in topic_nums]
        word_scores = [self.topic_word_scores[topic] for topic in topic_nums]

    return topic_words, word_scores, topic_scores, topic_nums
1969+
18181970
def search_documents_by_topic(self, topic_num, num_docs, return_documents=True, reduced=False):
18191971
"""
18201972
Get the most semantically similar documents to the topic.
@@ -2102,27 +2254,12 @@ def search_topics(self, keywords, num_topics, keywords_neg=None, reduced=False):
21022254
if keywords_neg is None:
21032255
keywords_neg = []
21042256

2105-
self._validate_num_topics(num_topics, reduced)
21062257
keywords, keywords_neg = self._validate_keywords(keywords, keywords_neg)
21072258
word_vecs = self._words2word_vectors(keywords)
21082259
neg_word_vecs = self._words2word_vectors(keywords_neg)
21092260
combined_vector = self._get_combined_vec(word_vecs, neg_word_vecs)
21102261

2111-
if reduced:
2112-
self._validate_hierarchical_reduction()
2113-
2114-
topic_nums, topic_scores = self._search_vectors_by_vector(self.topic_vectors_reduced,
2115-
combined_vector, num_topics)
2116-
topic_words = [self.topic_words_reduced[topic] for topic in topic_nums]
2117-
word_scores = [self.topic_word_scores_reduced[topic] for topic in topic_nums]
2118-
2119-
else:
2120-
topic_nums, topic_scores = self._search_vectors_by_vector(self.topic_vectors,
2121-
combined_vector, num_topics)
2122-
topic_words = [self.topic_words[topic] for topic in topic_nums]
2123-
word_scores = [self.topic_word_scores[topic] for topic in topic_nums]
2124-
2125-
return topic_words, word_scores, topic_scores, topic_nums
2262+
return self.search_topics_by_vector(combined_vector, num_topics=num_topics, reduced=reduced)
21262263

21272264
def search_documents_by_documents(self, doc_ids, num_docs, doc_ids_neg=None, return_documents=True,
21282265
use_index=False, ef=None):

top2vec/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,3 @@
11
from top2vec.Top2Vec import Top2Vec
22

3-
__version__ = '1.0.24'
3+
__version__ = '1.0.25'

0 commit comments

Comments
 (0)