@@ -1680,6 +1680,82 @@ def query_documents(self, query, num_docs, return_documents=True, use_index=Fals
1680
1680
return self .search_documents_by_vector (query_vec , num_docs , return_documents = return_documents ,
1681
1681
use_index = use_index , ef = ef )
1682
1682
1683
def query_topics(self, query, num_topics, reduced=False, tokenizer=None):
    """
    Semantic search of topics using a text query.

    The query text is converted to a vector and handed to
    search_topics_by_vector. The topics returned are those closest to
    that vector, ordered by proximity: successive topics in the list
    are less semantically similar to the query.

    Parameters
    ----------
    query: string
        Any sequence of text. This could be an actual question, a
        sentence, a paragraph or a document.

    num_topics: int
        Number of topics to return.

    reduced: bool (Optional, default False)
        Original topics are searched by default. If True the
        reduced topics will be searched.

    tokenizer: callable (Optional, default None)

        ** For doc2vec embedding model only **

        Override the default tokenization method. If None then the
        module-level default_tokenizer is used. The tokenizer is
        called with the query and its result is passed to the doc2vec
        model as the words to infer a vector from.

    Returns
    -------
    topics_words: array of shape (num_topics, 50)
        For each topic the top 50 words are returned, in order of
        semantic similarity to topic.

        Example:
        [['data', 'deep', 'learning' ... 'artificial'],           <Topic 0>
         ['environment', 'warming', 'climate' ... 'temperature']  <Topic 1>
         ...]

    word_scores: array of shape (num_topics, 50)
        For each topic the cosine similarity scores of the top 50
        words to the topic are returned.

        Example:
        [[0.7132, 0.6473, 0.5700 ... 0.3455],  <Topic 0>
         [0.7818, 0.7671, 0.7603 ... 0.6769]   <Topic 1>
         ...]

    topic_scores: array of float, shape(num_topics)
        For each topic the cosine similarity to the query will be
        returned.

    topic_nums: array of int, shape(num_topics)
        The unique number of every topic will be returned.
    """

    self._validate_query(query)

    if self.embedding_model == "doc2vec":
        # doc2vec has no sentence encoder: tokenize the query and let the
        # trained model infer a vector for the resulting words.
        tokenize = default_tokenizer if tokenizer is None else tokenizer
        query_tokens = tokenize(query)
        query_vec = self.model.infer_vector(doc_words=query_tokens,
                                            alpha=0.025,
                                            min_alpha=0.01,
                                            epochs=100)
    else:
        # Every other embedding model goes through the shared embedding
        # helper; only the first row of its output is used.
        query_vec = self._embed_documents(query)[0]

    return self.search_topics_by_vector(query_vec, num_topics=num_topics, reduced=reduced)
1758
+
1683
1759
def search_documents_by_vector (self , vector , num_docs , return_documents = True , use_index = False , ef = None ):
1684
1760
"""
1685
1761
Semantic search of documents using a vector.
@@ -1731,6 +1807,8 @@ def search_documents_by_vector(self, vector, num_docs, return_documents=True, us
1731
1807
self ._validate_vector (vector )
1732
1808
self ._validate_num_docs (num_docs )
1733
1809
1810
+ vector = self ._l2_normalize (vector )
1811
+
1734
1812
if use_index :
1735
1813
self ._check_document_index_status ()
1736
1814
@@ -1794,6 +1872,10 @@ def search_words_by_vector(self, vector, num_words, use_index=False, ef=None):
1794
1872
the word and vector.
1795
1873
"""
1796
1874
1875
+ self ._validate_vector (vector )
1876
+
1877
+ vector = self ._l2_normalize (vector )
1878
+
1797
1879
if use_index :
1798
1880
self ._check_word_index_status ()
1799
1881
@@ -1815,6 +1897,76 @@ def search_words_by_vector(self, vector, num_words, use_index=False, ef=None):
1815
1897
1816
1898
return words , word_scores
1817
1899
1900
def search_topics_by_vector(self, vector, num_topics, reduced=False):
    """
    Semantic search of topics using a vector.

    These are the topics closest to the vector. Topics are ordered by
    proximity to the vector. Successive topics in the list are less
    semantically similar to the vector.

    Parameters
    ----------
    vector: array of shape(vector dimension, 1)
        The vector dimension should be the same as the vectors in
        the topic_vectors variable. (i.e. model.topic_vectors.shape[1])
        The vector is L2-normalized before the search is run.

    num_topics: int
        Number of topics to return.

    reduced: bool (Optional, default False)
        Original topics are searched by default. If True the
        reduced topics will be searched.

    Returns
    -------
    topics_words: array of shape (num_topics, 50)
        For each topic the top 50 words are returned, in order of
        semantic similarity to topic.

        Example:
        [['data', 'deep', 'learning' ... 'artificial'],           <Topic 0>
         ['environment', 'warming', 'climate' ... 'temperature']  <Topic 1>
         ...]

    word_scores: array of shape (num_topics, 50)
        For each topic the cosine similarity scores of the top 50
        words to the topic are returned.

        Example:
        [[0.7132, 0.6473, 0.5700 ... 0.3455],  <Topic 0>
         [0.7818, 0.7671, 0.7603 ... 0.6769]   <Topic 1>
         ...]

    topic_scores: array of float, shape(num_topics)
        For each topic the cosine similarity to the search vector will
        be returned.

    topic_nums: array of int, shape(num_topics)
        The unique number of every topic will be returned.
    """

    self._validate_vector(vector)
    self._validate_num_topics(num_topics, reduced)

    vector = self._l2_normalize(vector)

    # Choose which set of topic vectors to search. The reduced set is only
    # consulted after the hierarchical-reduction check has passed.
    if reduced:
        self._validate_hierarchical_reduction()
        candidate_vectors = self.topic_vectors_reduced
    else:
        candidate_vectors = self.topic_vectors

    topic_nums, topic_scores = self._search_vectors_by_vector(candidate_vectors,
                                                              vector, num_topics)

    # Look up the word and word-score tables that belong to the same topic
    # set, then gather one row per matched topic in result order.
    if reduced:
        words_table = self.topic_words_reduced
        scores_table = self.topic_word_scores_reduced
    else:
        words_table = self.topic_words
        scores_table = self.topic_word_scores

    topic_words = [words_table[topic] for topic in topic_nums]
    word_scores = [scores_table[topic] for topic in topic_nums]

    return topic_words, word_scores, topic_scores, topic_nums
1969
+
1818
1970
def search_documents_by_topic (self , topic_num , num_docs , return_documents = True , reduced = False ):
1819
1971
"""
1820
1972
Get the most semantically similar documents to the topic.
@@ -2102,27 +2254,12 @@ def search_topics(self, keywords, num_topics, keywords_neg=None, reduced=False):
2102
2254
if keywords_neg is None :
2103
2255
keywords_neg = []
2104
2256
2105
- self ._validate_num_topics (num_topics , reduced )
2106
2257
keywords , keywords_neg = self ._validate_keywords (keywords , keywords_neg )
2107
2258
word_vecs = self ._words2word_vectors (keywords )
2108
2259
neg_word_vecs = self ._words2word_vectors (keywords_neg )
2109
2260
combined_vector = self ._get_combined_vec (word_vecs , neg_word_vecs )
2110
2261
2111
- if reduced :
2112
- self ._validate_hierarchical_reduction ()
2113
-
2114
- topic_nums , topic_scores = self ._search_vectors_by_vector (self .topic_vectors_reduced ,
2115
- combined_vector , num_topics )
2116
- topic_words = [self .topic_words_reduced [topic ] for topic in topic_nums ]
2117
- word_scores = [self .topic_word_scores_reduced [topic ] for topic in topic_nums ]
2118
-
2119
- else :
2120
- topic_nums , topic_scores = self ._search_vectors_by_vector (self .topic_vectors ,
2121
- combined_vector , num_topics )
2122
- topic_words = [self .topic_words [topic ] for topic in topic_nums ]
2123
- word_scores = [self .topic_word_scores [topic ] for topic in topic_nums ]
2124
-
2125
- return topic_words , word_scores , topic_scores , topic_nums
2262
+ return self .search_topics_by_vector (combined_vector , num_topics = num_topics , reduced = reduced )
2126
2263
2127
2264
def search_documents_by_documents (self , doc_ids , num_docs , doc_ids_neg = None , return_documents = True ,
2128
2265
use_index = False , ef = None ):
0 commit comments