#!/bin/bash

# download the latest English Wikipedia dump (pages-articles, bzip2-compressed)
curl -L -O "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2"
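# note: the compressed dump is large (on the order of 20 GB); if the download is interrupted,
# it can be resumed by re-running curl with the -C - flag, e.g.:
# curl -C - -L -O "https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2"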
# extract and clean the article text from the downloaded Wikipedia dump with WikiExtractor
mkdir -p extracted
python WikiExtractor.py enwiki-latest-pages-articles.xml.bz2 -b 1G -o extracted --no-templates --processes 24
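# note: depending on the WikiExtractor version, the extracted chunks may be written into
# subdirectories (e.g. extracted/AA/wiki_00); if so, adjust the wiki_$i paths used below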
# process the extracted wiki text corpus: sentence splitting, sentence tokenization,
# removing sentences with fewer than 20 characters or fewer than 5 tokens,
# and mapping every digit to 0 (e.g. 1993 becomes 0000)
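# (illustration only, not part of the pipeline) the digit-mapping step alone is roughly
# equivalent to: sed 's/[0-9]/0/g' input.txt > output.txt; the sentence splitting,
# tokenization, and length filtering are handled inside wiki-corpus-prepare.py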
mkdir -p processed
# run the per-chunk preprocessing jobs in parallel, one background process per extracted file
for i in {00..11}; do
    python wiki-corpus-prepare.py extracted/wiki_$i processed/wiki_$i &
done
# wait for all background preprocessing jobs to finish before merging
wait
# merge all the processed Wikipedia text files into a single training corpus
cat processed/wiki_* > processed/wiki
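# optional sanity check on the size of the merged corpus (line and word counts):
# wc -lw processed/wiki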
# train the word embeddings with the word2vec skip-gram model
mkdir -p results
cd word2vec
./word2vec -train ../processed/wiki -output ../results/enwiki.skip.size300.win10.neg15.sample1e-5.min15.bin -cbow 0 -size 300 -window 10 -negative 15 -hs 0 -sample 1e-5 -threads 24 -binary 1 -min-count 15
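# hyperparameters above: -cbow 0 selects skip-gram, -size 300 sets the embedding dimensionality,
# -window 10 the context window, -negative 15 the number of negative samples (with -hs 0, i.e.
# no hierarchical softmax), -sample 1e-5 the subsampling threshold for frequent words, and
# -min-count 15 discards words that occur fewer than 15 times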
# test the performance of the word embeddings on the word analogy test set (questions-words.txt)
./compute-accuracy ../results/enwiki.skip.size300.win10.neg15.sample1e-5.min15.bin < questions-words.txt
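# compute-accuracy also accepts an optional vocabulary-size threshold that restricts the
# evaluation to the most frequent words and speeds it up considerably, e.g.:
# ./compute-accuracy ../results/enwiki.skip.size300.win10.neg15.sample1e-5.min15.bin 30000 < questions-words.txt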