Add the last chunk of the text corpus to the output

uhh-lt · Jul 30, 2018 · 11b5bc6 · 11b5bc6
1 parent b02a88d
commit 11b5bc6
Showing 1 changed file with 9 additions and 5 deletions.
diff --git a/word_embeddings.py b/word_embeddings.py
@@ -4,20 +4,21 @@
 from gensim.models.phrases import Phrases, Phraser
 from gensim.models import Word2Vec
 from time import time
-from os.path import exists, isfile, isdir
+from os import listdir
+from os.path import exists, isdir, join
 from tqdm import tqdm
 from multiprocessing import cpu_count, Pool
 from collections import defaultdict
-from glob import glob
 
 
 class GzippedCorpusStreamer(object):
     def __init__(self, corpus_fpath):
         self._corpus_fpath = corpus_fpath
 
     def __iter__(self):
-        if isdir(self._corpus_fpath) and not isfile(self._corpus_fpath):
-            for corpus_fpath in glob(self._corpus_fpath + "/*"):
+        if isdir(self._corpus_fpath):
+            for fname in listdir(self._corpus_fpath):
+                corpus_fpath = join(self._corpus_fpath, fname)
                 print("Reading from file:", corpus_fpath)
                 yield from self._read_file(corpus_fpath)
         else:
@@ -172,6 +173,9 @@ def detect_phrases(corpus_fpath, phrases_fpath, batch_size=500000):
 
                 s_batch = []
 
+        for s in pool.map(pd.add_phrases, s_batch): out.write("{}\n".format(" ".join(s)))
+
+
     pd.print_stats()
 
     return output_fpath
@@ -185,7 +189,7 @@ def learn_word_embeddings(corpus_fpath, vectors_fpath, cbow, window, iter_num, s
     if exists(phrases_fpath):
         tic = time()
         print("Finding phrases from the input dictionary:", phrases_fpath)
-        corpus_fpath = detect_phrases(corpus_fpath, phrases_fpath, batch_size=500000)
+        corpus_fpath = detect_phrases(corpus_fpath, phrases_fpath, batch_size=5000)
         print("Time, sec.: {}".format(time() - tic))
 
     sentences = GzippedCorpusStreamer(corpus_fpath)