Skip to content

Commit

Permalink
add the last chunk of text corpus to the output
Browse files Browse the repository at this point in the history
  • Loading branch information
alexanderpanchenko committed Jul 30, 2018
1 parent b02a88d commit 11b5bc6
Showing 1 changed file with 9 additions and 5 deletions.
14 changes: 9 additions & 5 deletions word_embeddings.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,21 @@
from gensim.models.phrases import Phrases, Phraser
from gensim.models import Word2Vec
from time import time
from os.path import exists, isfile, isdir
from os import listdir
from os.path import exists, isdir, join
from tqdm import tqdm
from multiprocessing import cpu_count, Pool
from collections import defaultdict
from glob import glob


class GzippedCorpusStreamer(object):
# Constructor: remembers where the corpus lives. The path is not opened or
# validated here; __iter__ later checks whether it is a directory (and then
# reads the entries inside it one by one) or something else.
def __init__(self, corpus_fpath):
# Keep the path as given so that every new iteration re-reads from disk.
self._corpus_fpath = corpus_fpath

def __iter__(self):
if isdir(self._corpus_fpath) and not isfile(self._corpus_fpath):
for corpus_fpath in glob(self._corpus_fpath + "/*"):
if isdir(self._corpus_fpath):
for fname in listdir(self._corpus_fpath):
corpus_fpath = join(self._corpus_fpath, fname)
print("Reading from file:", corpus_fpath)
yield from self._read_file(corpus_fpath)
else:
Expand Down Expand Up @@ -172,6 +173,9 @@ def detect_phrases(corpus_fpath, phrases_fpath, batch_size=500000):

s_batch = []

for s in pool.map(pd.add_phrases, s_batch): out.write("{}\n".format(" ".join(s)))


pd.print_stats()

return output_fpath
Expand All @@ -185,7 +189,7 @@ def learn_word_embeddings(corpus_fpath, vectors_fpath, cbow, window, iter_num, s
if exists(phrases_fpath):
tic = time()
print("Finding phrases from the input dictionary:", phrases_fpath)
corpus_fpath = detect_phrases(corpus_fpath, phrases_fpath, batch_size=500000)
corpus_fpath = detect_phrases(corpus_fpath, phrases_fpath, batch_size=5000)
print("Time, sec.: {}".format(time() - tic))

sentences = GzippedCorpusStreamer(corpus_fpath)
Expand Down

0 comments on commit 11b5bc6

Please sign in to comment.