-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathwellington.py
executable file
·125 lines (104 loc) · 3.17 KB
/
wellington.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/python
import re
import os, sys
from collections import namedtuple
from voxutils.dictionaries import lookup_words
def utterance_generator(f):
    """Yield ``(uid, text)`` pairs, one per utterance found in *f*.

    *f* is any iterable of lines.  A line starting with ``<WSC#`` opens a
    new utterance; its id is the rest of that line minus the final
    character.  Lines that are exactly ``<I>`` or ``</I>`` are skipped.
    Every other line is collected and the lines of one utterance are joined
    with single spaces.

    Lines seen before the first ``<WSC#`` header are discarded.
    """
    utt = []
    uid = None
    for line in f:
        line = line.strip()
        if line.startswith('<WSC#'):
            # New utterance header: emit the one collected so far, if any.
            if uid:
                yield (uid, ' '.join(utt))
            utt = []
            uid = line[5:-1]
        elif line in ('<I>', '</I>'):
            continue
        else:
            utt.append(line)
    # Flush the final utterance; without this the last one in the input
    # would never be emitted, because nothing follows it to trigger a yield.
    if uid:
        yield (uid, ' '.join(utt))
def entaggen(s):
    """Turn tag content into a ``++LABEL++`` token.

    The content ``'tut'`` becomes ``'++SMACK++'``; any other content
    becomes ``'++NOISE++'``.  Returns ``(token, None)``; the second element
    is the (absent) metadata slot shared by all tag filters.
    """
    label = 'SMACK' if s == 'tut' else 'NOISE'
    return ('++%s++' % label), None
def drop_singles(s):
    """Blank out content shorter than two characters.

    Returns ``(text, None)`` where *text* is *s* unchanged when it has at
    least two characters and the empty string otherwise.
    """
    kept = s if len(s) >= 2 else ''
    return kept, None
def time_or_comment(s):
    """Interpret tag content as an ``M:SS`` time stamp, else discard it.

    The returned text is always the empty string, so the content never
    reaches the transcript.  When *s* consists solely of one or more digits,
    a colon and exactly two digits, the metadata is ``{'time': t}`` with *t*
    expressed in minutes (``minutes + seconds / 60``).  Anything else yields
    ``None`` as metadata.
    """
    # Raw string: '\d' in a plain literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    m = re.match(r'^(\d+):(\d\d)$', s)
    if m is None:
        return '', None
    minutes, seconds = m.group(1, 2)
    return '', {'time': int(minutes) + float(seconds) / 60.}
def echo(s):
    """Pass tag content through untouched, with no metadata."""
    meta = None
    return s, meta
def drop(s):
    """Discard tag content: always returns an empty string and no metadata."""
    nothing, meta = '', None
    return nothing, meta
# One row of markup-tag configuration:
#   tag      -- the tag name as it appears between '<' and '>'
#   solitary -- True when the tag is never pushed on the open-tag stack
#   ignore   -- True when the tag is not tracked on the stack at all
#   filter   -- callable applied to the text that follows the tag; returns
#               a (text, metadata-or-None) pair
Tag = namedtuple('tag', 'tag solitary ignore filter')

TAG_DATA = [
    # tag, solitary, ignore, filter
    [",", True, False, echo],
    [".", False, False, drop_singles],
    ["[", False, False, echo],
    ["{", False, True, echo],
    ["&", False, False, time_or_comment],
    ["I", False, True, echo],
    ["laughs", False, False, echo],
    ["O", False, False, entaggen],
]

# Lookup from tag name to its Tag record.
TAG_MAP = dict((row[0], Tag(*row)) for row in TAG_DATA)
# Plain words that are replaced by ++...++ tokens in the transcript.
ENTAGGEN_WORDS = dict(
    er='++UH++',
    um='++UM++',
)
def comment_parser(g):
    """Strip markup tags from utterances and collect time stamps.

    *g* yields ``(uid, utt)`` pairs.  Each utterance is split on ``<``; the
    text in front of the first tag is kept as is, and every following piece
    is split into a tag name and the text that follows it.  That text is run
    through a filter chosen as follows:

    * opening (or solitary) tag: the tag's own filter from ``TAG_MAP``;
      the tag is pushed on a stack unless it is flagged ``ignore`` or
      ``solitary``;
    * closing tag: the filter of the tag that is now innermost on the
      stack, or ``echo`` when the stack is empty.  Closing tags flagged
      ``ignore`` leave the stack alone.

    Words listed in ``ENTAGGEN_WORDS`` are replaced by their token.

    Yields ``(uid, text, timepoints)`` where *timepoints* is a list of
    ``(time, word_index, char_offset)`` tuples: the time reported by a
    filter, the number of words emitted before it, and the length of the
    space-joined text emitted before it.

    Raises ``TabError`` when a closing tag does not match the innermost
    open tag, ``KeyError`` for a tag missing from ``TAG_MAP`` and
    ``ValueError`` when a ``<`` has no matching ``>``.
    """
    entaggen_word = ENTAGGEN_WORDS.get
    for uid, utt in g:
        ubits = utt.split('<')
        done = [entaggen_word(w, w) for w in ubits[0].split()]
        stack = []
        timepoints = []
        for bit in ubits[1:]:
            tag, content = bit.split('>', 1)
            if tag[0] != '/':
                # opening or single tag
                t = TAG_MAP[tag]
                if not t.ignore and not t.solitary:
                    stack.append(t)
                content_filter = t.filter
            else:
                tag = tag[1:]
                if not TAG_MAP[tag].ignore:
                    if not stack or stack[-1].tag != tag:
                        raise TabError("found </%s> but stack is %s"
                                       % (tag, stack))
                    stack.pop()
                # An ignored closing tag used to skip the rest of the loop
                # body, which threw away the text following it.  It now
                # falls through so that text is filtered like any other.
                content_filter = stack[-1].filter if stack else echo
            text, meta = content_filter(content)
            if meta and 'time' in meta:
                timepoints.append(
                    (meta['time'], len(done), len(' '.join(done))))
            done.extend(entaggen_word(w, w) for w in text.split())
        yield uid, ' '.join(done), timepoints
def cmu_lookup(g):
    """Report dictionary misses for each utterance and pass it through.

    For every ``(uid, utt, timepoints)`` triple from *g*, the upper-cased
    words of *utt* are handed to ``lookup_words`` and the second element of
    its result is written to standard error on its own line.  The triple is
    then yielded unchanged.
    """
    for uid, utt, timepoints in g:
        words = utt.upper().split()
        # presumably (found-entries, words-not-found); only the second
        # element is used here -- verify against voxutils.dictionaries
        _, missing = lookup_words(words)
        # A plain write instead of the Python-2-only "print >>" statement,
        # so the function parses on both Python 2 and Python 3.
        sys.stderr.write('%s\n' % (missing,))
        yield uid, utt, timepoints
def print_transcription(g):
    """Print one ``<s> TEXT </s> (ID)`` line per utterance to stdout.

    *g* yields ``(uid, utt, timepoints)`` triples.  The text is upper-cased
    and every ``:`` in the id is replaced by ``-``.  The time points are
    not used.
    """
    for uid, utt, _ in g:
        # Single-argument call form: prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("<s> %s </s> (%s)" % (utt.upper(), uid.replace(':', '-')))
def main(path='corpora/wellington/DGI038.TXT'):
    """Run the whole pipeline on one transcript file and print the result.

    *path* is the transcript to read; it defaults to the file the script
    was originally hard-wired to.  The stages are chained lazily:
    utterance splitting, tag stripping, dictionary lookup, printing.
    """
    # The stages are generators, so the file must stay open until
    # print_transcription has consumed everything; the with-block also
    # guarantees the handle is closed afterwards.
    with open(path) as f:
        g = utterance_generator(f)
        h = comment_parser(g)
        i = cmu_lookup(h)
        print_transcription(i)
# Run the pipeline when the file is executed.
# NOTE(review): this call is unguarded, so it also runs when the module is
# imported -- consider an `if __name__ == '__main__':` guard if nothing
# relies on the import-time behaviour.
main()