-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathprep_data.py
37 lines (28 loc) · 999 Bytes
/
prep_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import argparse
import gzip
import os
import shutil
import tarfile
import zipfile

import wget
# Archive (in the current working directory) that holds the "sumdata" tree.
DATA_ARCHIVE = "summary.tar.gz"

# Gzip-compressed training files, inside the extracted archive, that are
# decompressed next to themselves.
TRAIN_GZ_FILES = (
    "sumdata/train/train.article.txt.gz",
    "sumdata/train/train.title.txt.gz",
)

# Where the optional GloVe vectors are downloaded from and stored.
GLOVE_DIR = "glove"
GLOVE_URL = "https://nlp.stanford.edu/data/wordvecs/glove.42B.300d.zip"


def _extract_tar(archive_path):
    """Extract the gzip-compressed tar *archive_path* into the current directory."""
    with tarfile.open(archive_path, "r:gz") as tar:
        # Newer Python versions provide extraction filters; "data" refuses
        # members that would be written outside the destination directory.
        # Older interpreters lack the keyword, so fall back to the plain call.
        if hasattr(tarfile, "data_filter"):
            tar.extractall(filter="data")
        else:
            tar.extractall()


def _gunzip(gz_path):
    """Decompress *gz_path* to the same path without its ``.gz`` suffix.

    The data is streamed in chunks rather than read in one piece, so memory
    use does not grow with the size of the file.

    Returns the path of the decompressed file.
    """
    out_path = gz_path[: -len(".gz")]
    with gzip.open(gz_path, "rb") as src, open(out_path, "wb") as dst:
        shutil.copyfileobj(src, dst)
    return out_path


def _fetch_glove(glove_dir=GLOVE_DIR, glove_url=GLOVE_URL):
    """Download the GloVe zip from *glove_url* into *glove_dir* and unpack it there."""
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(glove_dir, exist_ok=True)

    # Download glove vector
    wget.download(glove_url, out=glove_dir)

    # Extract glove file
    # NOTE(review): assumes wget stores the download under the URL's base name
    # inside glove_dir, as the original hard-coded path did -- confirm.
    zip_path = os.path.join(glove_dir, os.path.basename(glove_url))
    with zipfile.ZipFile(zip_path, "r") as archive:
        archive.extractall(glove_dir)


def main(argv=None):
    """Prepare the data files; with ``--glove`` also download GloVe vectors.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--glove", action="store_true")
    args = parser.parse_args(argv)

    # Extract data file
    _extract_tar(DATA_ARCHIVE)
    for gz_path in TRAIN_GZ_FILES:
        _gunzip(gz_path)

    if args.glove:
        _fetch_glove()


if __name__ == "__main__":
    main()