# en_postprocessing.py
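# Post-processes the English texts in ./en_text/: each line is tokenized with
# spaCy, punctuation is dropped, tokens are lowercased, and the concatenated
# corpus is written to ./normalized/en/.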
import spacy
import logging
from pathlib import Path
from horology import timed

# Log to en_norm.log, overwriting the file on each run.
logging.basicConfig(filename="en_norm.log", level=logging.DEBUG, filemode="w")


def tokenize_line(line, nlp):
    """Tokenize one line with spaCy, drop punctuation, and lowercase the remaining tokens."""
    normalized_line = ''
    doc = nlp(line)
    for token in doc:
        # Variant that also drops stop words:
        # if token.is_stop or token.is_punct:
        if token.is_punct:
            continue
        # Alternative normalizations (lemma, raw text, norm) used for the other output files:
        # normalized_line += f'{token.lemma_.lower()} '
        # normalized_line += f'{token.text} '
        # normalized_line += f'{token.norm_} '
        normalized_line += f'{token.lower_} '
    normalized_line = normalized_line.strip()
    return normalized_line + '\n'


@timed(unit='min', name='Tokenizing took ')
def tokenize_text(text_name, text, nlp):
    """Tokenize every line of a text and return the lines joined back into one string."""
    tokenized_text = ''
    lines = text.splitlines()
    for line in lines:
        tokenized_text += tokenize_line(line, nlp)
    logging.info(f'Number of lines in text {text_name} is {len(lines)}')
    return tokenized_text


if __name__ == "__main__":
    # Process every file in ./en_text/ in sorted order.
    text_paths = list(Path('./en_text/').iterdir())
    text_paths.sort()
    nlp = spacy.load('en_core_web_sm')
    corpus = ''
    for text_path in text_paths:
        text = text_path.read_text(encoding='utf8')
        corpus += tokenize_text(text_path.stem, text, nlp)
        print(f'{text_path.stem} completed')
    # Make sure the output directory exists before writing.
    Path('./normalized/en/').mkdir(parents=True, exist_ok=True)
    # One output file per normalization variant; only the punctuation-stripped,
    # lowercased variant is active here.
    # Path('./normalized/en/en_corpus-punct-stopword_lower.txt').write_text(corpus, encoding='utf-8')
    # Path('./normalized/en/en_corpus-punct-stopword_lemma_lower.txt').write_text(corpus, encoding='utf-8')
    # Path('./normalized/en/en_corpus-punct-stopword_norm.txt').write_text(corpus, encoding='utf-8')
    # Path('./normalized/en/en_corpus-punct-stopword_text.txt').write_text(corpus, encoding='utf-8')
    Path('./normalized/en/en_corpus-punct_lower.txt').write_text(corpus, encoding='utf-8')
    # Path('./normalized/en/en_corpus-punct_text.txt').write_text(corpus, encoding='utf-8')
    # Path('./normalized/en/en_corpus-punct_lemma_lower.txt').write_text(corpus, encoding='utf-8')
    # Path('./normalized/en/en_corpus-punct_norm.txt').write_text(corpus, encoding='utf-8')