From 11b5bc6d1ffba60af3bd8bf72336bde4e94e7451 Mon Sep 17 00:00:00 2001
From: Alexander Panchenko <panchenko.alexander@gmail.com>
Date: Mon, 30 Jul 2018 19:14:41 +0200
Subject: [PATCH] Add the last chunk of the text corpus to the output

---
 word_embeddings.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/word_embeddings.py b/word_embeddings.py
index 8cfcad6..502fcbb 100644
--- a/word_embeddings.py
+++ b/word_embeddings.py
@@ -4,11 +4,11 @@
 from gensim.models.phrases import Phrases, Phraser
 from gensim.models import Word2Vec
 from time import time
-from os.path import exists, isfile, isdir
+from os import listdir
+from os.path import exists, isdir, join
 from tqdm import tqdm
 from multiprocessing import cpu_count, Pool
 from collections import defaultdict
-from glob import glob
 
 
 class GzippedCorpusStreamer(object):
@@ -16,8 +16,9 @@ def __init__(self, corpus_fpath):
         self._corpus_fpath = corpus_fpath
         
     def __iter__(self):
-        if isdir(self._corpus_fpath) and not isfile(self._corpus_fpath):
-            for corpus_fpath in glob(self._corpus_fpath + "/*"):
+        if isdir(self._corpus_fpath):
+            for fname in listdir(self._corpus_fpath):
+                corpus_fpath = join(self._corpus_fpath, fname)
                 print("Reading from file:", corpus_fpath)
                 yield from self._read_file(corpus_fpath)
         else:
@@ -172,6 +173,9 @@ def detect_phrases(corpus_fpath, phrases_fpath, batch_size=500000):
 
                 s_batch = []
 
+        for s in pool.map(pd.add_phrases, s_batch): out.write("{}\n".format(" ".join(s)))
+
+
     pd.print_stats()
 
     return output_fpath
@@ -185,7 +189,7 @@ def learn_word_embeddings(corpus_fpath, vectors_fpath, cbow, window, iter_num, s
     if exists(phrases_fpath):
         tic = time()
         print("Finding phrases from the input dictionary:", phrases_fpath)
-        corpus_fpath = detect_phrases(corpus_fpath, phrases_fpath, batch_size=500000)
+        corpus_fpath = detect_phrases(corpus_fpath, phrases_fpath, batch_size=5000)
         print("Time, sec.: {}".format(time() - tic))
 
     sentences = GzippedCorpusStreamer(corpus_fpath)