-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparsewiktionary.py
153 lines (107 loc) · 3.63 KB
/
parsewiktionary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import re
# Language tags. A tag is appended to every word written to the output
# (e.g. 'book#en'); parse_translation also derives the language code it looks
# for in a translation line from the tag by dropping the leading '#'.
TAG_EN = '#en'
TAG_ZH = '#cmn'
TAG_TR = '#tr'
TAG_UZ = '#uz'
TAG_HA = '#ha'
# Lowercased titles that parse_wiktionary has already written without any
# translation; consulted so that such a title is written only once.
singleton = []
def include_chinese(part):
    """
    Extract the Chinese characters contained in a string.

    Only code points in the range U+4E00..U+9FFF are kept, so this function
    does not discriminate simplified and traditional Chinese.

    Args:
        part: the text to scan.

    Returns:
        The kept characters, joined in their original order, or '' when
        ``part`` contains none. '' is also returned as soon as ``part``
        contains any digit (``str.isdigit``), even when Chinese characters
        are present as well.
    """
    # One digit anywhere disqualifies the whole string.
    if any(ch.isdigit() for ch in part):
        return ''
    # Single pass + join instead of growing a string with repeated '+='.
    return ''.join(ch for ch in part if 0x4e00 <= ord(ch) <= 0x9fff)
def _extract_term(segment, start=0):
    """Return the term in ``segment``: the text from ``start`` up to the first
    '}}' (or the end), with '[' and ']' removed, lowercased."""
    ending = segment.find('}}')
    if ending == -1:
        ending = len(segment)
    return segment[start:ending].replace('[', '').replace(']', '').lower()


def parse_translation(language, line):
    """
    Given a language and a line from Wiktionary XML,
    return a list of all the translations of this language found on the line.

    The line is split on ',' and every piece is split on '|'.

    * Chinese (``language == TAG_ZH``): every '|'-separated segment is passed
      through ``include_chinese``; each non-empty result is one translation.
    * Any other language: the language code is ``language`` without its
      leading '#'. The segment that follows a segment equal to that code is
      the translation. A segment containing 'alt=' (but not 'alt=;')
      contributes the text after its first four characters as a further
      translation.

    Args:
        language: a language tag such as '#tr' or '#cmn'.
        line: one translation line, e.g. '* Turkish: {{t+|tr|kitap}}'.

    Returns:
        A list of strings, each a translation immediately followed by
        ``language`` (e.g. 'kitap#tr'). Non-Chinese translations are
        lowercased and stripped of '[' and ']'. Empty translations are
        never returned.
    """
    translations = []
    groups = line.split(',')

    # Chinese
    if language == TAG_ZH:
        for group in groups:
            for segment in group.split('|'):
                chinese = include_chinese(segment)
                if chinese:
                    translations.append(chinese + language)
        return translations

    # Turkish, Uzbek and Hausa
    code = language[1:]
    for group in groups:
        expect_term = False
        for segment in group.split('|'):
            if expect_term:
                # Tested before the code check on purpose: a term that merely
                # contains the language code (e.g. 'tren' for 'tr') must be
                # kept as a translation, not mistaken for the code field.
                term = _extract_term(segment)
                if term:
                    translations.append(term + language)
                expect_term = False
            elif segment.strip() == code:
                # Exact match only, so that 'tr=...' or an ordinary word
                # containing the code does not arm the flag.
                expect_term = True
            elif 'alt=' in segment and 'alt=;' not in segment:
                # The slice assumes the segment starts with 'alt='.
                term = _extract_term(segment, 4)
                if term:
                    translations.append(term + language)
    return translations
def _translation_tag(line):
    """Return the tag of the language whose translation line ``line`` is,
    or None when it is not a wanted translation line."""
    if 't-needed' in line:
        return None
    for name, tag in (('Turkish', TAG_TR), ('Uzbek', TAG_UZ),
                      ('Hausa', TAG_HA), ('Mandarin', TAG_ZH)):
        if ('* ' + name + ':') in line or ('*: ' + name + ':') in line:
            return tag
    return None


def parse_wiktionary(input_dir, output_dir):
    """
    Build a vocabulary file from a Wiktionary XML file.

    The input is scanned line by line. A page is written when its '<title>'
    line contains no ':' and a later line contains '==English=='. The output
    line for such a page is the lowercased title followed by TAG_EN and, for
    every distinct translation collected on the page, ':' plus that
    translation:

        book#en:kitap#tr:kitob#uz

    Translations are collected from the '* <Language>:' / '*: <Language>:'
    lines for Turkish, Uzbek, Hausa and Mandarin; lines containing
    't-needed' are ignored. A title that has no translation is written only
    the first time it is met and is recorded in the module-level
    ``singleton`` list.

    input_dir should be the path of the Wiktionary XML file.
    output_dir is the path of the output file.

    Both files are handled as UTF-8. A counter is printed after each entry
    that is written with translations.
    """
    # Set mirror of ``singleton`` for O(1) membership tests; the list itself
    # is still appended to so other readers of ``singleton`` see every title.
    seen = set(singleton)
    loadtrans = False
    loadtitle = False
    title = ''
    parsed = []
    a = 0
    # 'with' closes both files even if parsing raises.
    with open(input_dir, 'r', encoding='utf-8') as handle, \
            open(output_dir, 'w', encoding='utf-8') as writer:
        for line in handle:
            if '<title>' in line:
                # Mark the start of a new page. Per-page state is cleared
                # here so that translations gathered on a page that was never
                # written cannot leak into the next entry.
                loadtrans = False
                parsed = []
                # Exclude irrelevant pages (titles containing ':').
                loadtitle = ':' not in line
                if loadtitle:
                    title = re.split('<title>|</title>', line)[1]
                continue

            # Pass the non-English words
            if '==English==' in line and loadtitle:
                loadtrans = True
                continue

            tag = _translation_tag(line)
            if tag is not None:
                parsed += parse_translation(tag, line)
                continue

            if '</page>' in line and loadtrans:
                word = title.lower()
                if not parsed:
                    if word not in seen:
                        seen.add(word)
                        singleton.append(word)
                        writer.write(word + TAG_EN + '\n')
                else:
                    # Sorted so the output is the same on every run.
                    members = sorted({member.lower() for member in parsed})
                    writer.write(word + TAG_EN)
                    writer.write(''.join(':' + member for member in members))
                    writer.write('\n')
                    print(a)
                    a += 1
                loadtrans = False
                loadtitle = False
                parsed = []
if __name__ == "__main__":
    # Script entry point: read the Wiktionary XML file and write the
    # vocabulary file into the current directory.
    vocabulary_dir = 'vocabulary.txt'
    wiktionary_dir = '../dataset/enwiktionary-latest-pages-articles.xml'
    parse_wiktionary(input_dir=wiktionary_dir, output_dir=vocabulary_dir)