-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwords.rkt
executable file
·469 lines (427 loc) · 14.6 KB
/
words.rkt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
#! /usr/bin/env racket
#lang racket
; @todo: build sparse matrix object
(require htdp/matrix)
; filter out duplicate values
; @in
; lst: a list
; @out
; lst with unique values
(define filter-unique
(lambda (lst)
(let ([dict (make-hash)])
(filter (lambda (arg)
(if (dict-has-key? dict arg)
#f
(begin
(dict-set! dict arg 1)
#t)))
lst))))
; basic testing
(let ([lst '(1 1 2 3 4 2 5 1 42)])
(displayln lst)
(displayln (filter-unique lst)))
; retrieve list of word/tag
; @in
; ins: list of files
; @out
; wts: list of word/tag
(define get-wts
(lambda ([ins (find-files (lambda (arg)
;(regexp-match-exact? #px".+c[a-r][0-9]{2}" arg))
(regexp-match-exact? #px".+ca01" arg))
(build-path (current-directory) "brown"))]
[wts '()])
(if (null? ins)
(flatten wts)
(get-wts (cdr ins) (list wts (filter (lambda (arg)
(> (string-length arg) 2))
(regexp-split #rx"[ \n\t]+" (file->string (car ins)))))))))
; basic testing
(displayln (get-wts))
; add start-of-sentence symbols
; @in
; wts: list of word/tag
; @out
; wts-markers: list of word/tag with start-of-sentence symbols
(define add-sos
(lambda (wts
[wts-markers '("<s>/<s>")])
(if (null? wts)
(flatten wts-markers)
(add-sos (cdr wts) (cons wts-markers
(if (string=? (car wts) "./.")
(cons (car wts) "<s>/<s>")
(car wts)))))))
; basic testing
(displayln (add-sos (get-wts)))
; filter out start-of-sentence symbols
; @in
; tags-sos: list of tags with start-of-sentence symbols
; @out
; tags: list of tags, without start-of-sentence symbols
(define filter-sos
(lambda (tags-sos
[tags '()])
(if (null? tags-sos)
(flatten tags)
(filter-sos (cdr tags-sos) (if (string=? (car tags-sos) "<s>")
tags
(cons tags (car tags-sos)))))))
; basic testing
(displayln (filter-sos '("<s>" "vb" "nn" "jj" "<s>" "whatever" "<s>")))
; count occurrences
; @in
; lst: a list
; @out
; dict: dictionary of item -> number of occurrences
(define list->dict
(lambda (lst
[dict '()])
(if (null? lst)
dict
(list->dict (cdr lst) (dict-update dict (car lst) add1 0)))))
; basic testing
(let ([lst '(1 1 2 3 4 2 5 1 42)])
(displayln lst)
(displayln (list->dict lst)))
; extract words and tags in two lists
; @in
; wts: list of word/tag
; @out
; lsts: pair of (words list, tags list)
; @todo: make it generic with delimiter as input
(define explode-list
(lambda (wts
[lsts '(() ())])
(if (null? wts)
(list (flatten (car lsts)) (flatten (cadr lsts)))
(let ([pair (regexp-split #rx"/" (car wts))])
(explode-list (cdr wts) (list
(cons (car lsts) (car pair))
(cons (cadr lsts) (cadr pair))))))))
; basic testing
(let ([lst '("1/2" "3/4" "5/6" "7/8")])
(displayln lst)
(displayln (explode-list lst)))
; build a list of pairs (item, index)
; @in
; lst: a list
; @out
; a list of pairs (item, index)
(define index-list
(lambda (lst
[i 0])
(if (null? lst)
'()
(cons (cons (car lst) i) (index-list (cdr lst) (+ i 1))))))
; basic testing
(displayln (index-list '("t1" "t2" "t3" "t4")))
; build the row labelled tag for the B matrix
; @in
; wts-dict: dictionary of (word/tag, count)
; words: list of unique words
; tag: tag labelling the row
; tag-count: number of occurrences of tag
; @out
; row: the row labelled tag in B matrix
(define get-b-row
(lambda (wts-dict words tag tag-count
[row '()])
(if (null? words)
(flatten row)
(get-b-row wts-dict (cdr words) tag tag-count (list row (/ (+ 1 (dict-ref wts-dict (string-append (car words) "/" tag) 0)) (+ (dict-count wts-dict) tag-count)))))))
; basic testing
(let ([wts-dict '(("w1/t1" . 1) ("w2/t1" . 2) ("w3/t2" . 1) ("w1/t2" . 4) ("w2/t2" . 5) ("w4/t3" . 2))]
[words '("w1" "w2" "w3" "w4")]
[tag "t2"]
[tag-count 10])
(displayln (get-b-row wts-dict words tag tag-count)))
; build B matrix
; @in
; wts-dict: dictionary of (word/tag, count)
; tags-dict: dictionary of (tag, count)
; words: list of unique words
; tags: list of unique tags
; @out
; matrix: the matrix of probabilities of words appearing
(define get-b-matrix
(lambda (wts-dict tags-dict words tags
[matrix '()])
(if (null? tags)
(begin
(printf "(~a x ~a) matrix --> ~a\n" (dict-count tags-dict) (length words) (length (flatten matrix)))
(make-matrix (- (dict-count tags-dict) 1) (length words) (flatten matrix)))
(get-b-matrix wts-dict tags-dict words (cdr tags) (list matrix (get-b-row wts-dict words (car tags) (dict-ref tags-dict (car tags))))))))
; basic testing
(let ([wts-dict '(("w1/t1" . 1) ("w2/t1" . 2) ("w3/t2" . 1) ("w1/t2" . 4) ("w2/t2" . 5) ("w4/t3" . 2))]
[tags-dict '(("<s>" . 1) ("t1" . 3) ("t2" . 10) ("t3" . 2))]
[words '("w1" "w2" "w3" "w4")]
[tags '("t1" "t2" "t3")])
(displayln (matrix-render (get-b-matrix wts-dict tags-dict words tags))))
; count pairs of following tags
; @in
; tag1: the first tag
; tag2: the following tag
; tags: the tags list
; @out
; count: the amount of times tag1 is followed by tag2
(define count-next
(lambda (tag1 tag2 tags
[count 0])
(if (null? (cdr tags))
count
(count-next tag1 tag2 (cdr tags)
(if (and
(string=? (car tags) tag1)
(string=? (cadr tags) tag2))
(+ count 1)
count)))))
; basic testing
(let ([tag1 "t1"]
[tag2 "t2"]
[tags '("t1" "t1" "t2" "t3" "t2" "t4" "t5" "t1" "t2" "t2" "t3" "t1" "t2" "t2" "t5")])
(displayln (count-next tag1 tag2 tags)))
; build the row labelled tag for the A matrix
; @in
; tags: list of tags as in text
; tag: tag labelling the row
; tag-count: number of occurrences of tag
; taglist: list of unique tags
; @out
; row: the row labelled tag in A matrix
(define get-a-row
(lambda (tags tag tag-count taglist
[row '()])
(if (null? taglist)
(flatten row)
(get-a-row tags tag tag-count (cdr taglist) (list row (/ (count-next tag (car taglist) tags) tag-count))))))
; basic testing
(let ([tags '("t1" "t2" "t2" "t3" "t1" "t4" "t2" "t5" "t1" "t1" "t2" "t3" "t5")]
[tag "t1"]
[tag-count 4]
[taglist '("t1" "t2" "t3" "t4" "t5")])
(displayln (get-a-row tags tag tag-count taglist)))
; @todo: (?) add start and end transition probabilities for beginning and final states
; build the A matrix
; @in
; tags-dict: dictionary of (tag, count)
; tags: list of tags
; taglist: index of (unique) tags
; taglist-left: rows left to build
; @out
; matrix: the matrix of probabilities of tags appearing
(define get-a-matrix
(lambda (tags-dict tags taglist
[taglist-left (flatten (cons "<s>" taglist))]
[matrix '()])
(if (null? taglist-left)
(make-matrix (+ (length taglist) 1) (length taglist) (flatten matrix))
(get-a-matrix tags-dict tags taglist (cdr taglist-left) (list matrix (get-a-row tags (car taglist-left) (dict-ref tags-dict (car taglist-left)) taglist))))))
; basic testing
(let ([tags-dict '(("<s>" . 1) ("t1" . 5) ("t2" . 3) ("t3" . 1) ("t4" . 3) ("t5" . 2))]
[tags '("t1" "t4" "t5" "t2" "t1" "t1" "t3" "t2" "t1" "t4" "t5" "t1" "t4" "t2")]
[taglist '("t1" "t2" "t3" "t4" "t5")])
(displayln (matrix-render (get-a-matrix tags-dict tags taglist))))
; @todo: address the unknown words issue
; @todo: optimize by building a list instead of operating on the matrix itself
; perform the initialization step of the Viterbi algorithm
; @in
; a-matrix: the A matrix
; b-matrix: the B matrix
; o1: col index of the first word (observation) in the B matrix
; tags-index: list of (tag, index) pairs
; viterbi: a (len(tags) + 2) x len(obs) matrix
; @out
; viterbi: a matrix whose first column is initialized
(define viterbi-init
(lambda (a-matrix b-matrix o1 tags-index viterbi)
(if (null? tags-index)
viterbi
(let ([i (cdar tags-index)])
(viterbi-init a-matrix b-matrix o1
(cdr tags-index)
(matrix-set viterbi (+ i 1) 1 (* (matrix-ref a-matrix 0 i)
(matrix-ref b-matrix i o1))))))))
; perform the recursion step of the Viterbi algorithm
; @in
; a-matrix: the A matrix
; b-matrix: the B matrix
; obs: col indexes of observations in the B matrix
; tags-index: list of (tag, index) pairs
; viterbi: a (len(tags) + 2) x len(obs) initialized matrix
; backptr: a (len(tags) + 2) x len(obs) initialized matrix
; t: observation index
; @out
; viterbi: an initialized matrix, but the last col
; backptr: an initialized matrix, but the last col
(define viterbi-loop
(lambda (a-matrix b-matrix obs tags-index viterbi backptr
[t 2])
(if (null? obs)
(list viterbi backptr)
(letrec ([inner-loop (lambda (vit back
[tags-left tags-index])
(if (null? tags-left)
(list vit back)
(let ([s (cdar tags-left)])
(inner-loop
(matrix-set vit (+ s 1) t (get-max vit s t a-matrix b-matrix tags-index (car obs)))
(matrix-set back (+ s 1) t (cdr (get-arg-max vit s t a-matrix tags-index)))
(cdr tags-left)))))]
[pair (inner-loop viterbi backptr)])
(viterbi-loop a-matrix b-matrix (cdr obs) tags-index (car pair) (cadr pair) (+ t 1))))))
(define viterbi-end
(lambda (pair a-matrix s t tags-index)
(let ([vit (car pair)]
[back (cadr pair)])
(displayln (matrix-render vit))
(displayln (matrix-render back))
(list
(matrix-set vit (+ s 1) t (get-max-end vit s t a-matrix tags-index))
(matrix-set back (+ s 1) t (get-arg-max-end vit s t a-matrix tags-index))))))
; @todo: combine get-max and get-arg-max in one proc
; @todo: ditto for get-max-end and get-arg-max-end
; return the maximum probability
; @in
; viterbi: the Viterbi matrix
; s: the current state (tag)
; t: the current step (observation)
; a-matrix: the A matrix
; b-matrix: the B matrix
; tags-index: list of (tag, index) pairs
; obs: the index of observation at step t in matrix B
; @out
; res: the maximum probability
(define get-max
(lambda (viterbi s t a-matrix b-matrix tags-index obs
[res 0])
(if (null? tags-index)
res
(let* ([ss (cdar tags-index)]
[cur (* (* (matrix-ref viterbi (+ ss 1) (- t 1))
(matrix-ref a-matrix (+ ss 1) s))
(matrix-ref b-matrix s obs))])
(get-max viterbi s t a-matrix b-matrix (cdr tags-index) obs (if (< res cur)
cur
res))))))
(define get-max-end
(lambda (viterbi s t a-matrix tags-index
[res 0])
(if (null? tags-index)
res
(let* ([ss (cdar tags-index)]
[cur (* (matrix-ref viterbi (+ ss 1) t)
(matrix-ref a-matrix (+ ss 1) s))])
(get-max-end viterbi s t a-matrix (cdr tags-index) (if (< res cur)
cur
res))))))
; return a (max, index) pair of the maximum probability
; @in
; viterbi: the Viterbi matrix
; s: the current state (tag)
; t: the current step (observation)
; a-matrix: the A matrix
; tags-index: list of (tag, index) pairs
; @out
; res: a (max, index) pair
(define get-arg-max
(lambda (viterbi s t a-matrix tags-index
[res '(0 . 0)])
(if (null? tags-index)
res
(let* ([ss (cdar tags-index)]
[cur (* (matrix-ref viterbi (+ ss 1) (- t 1))
(matrix-ref a-matrix (+ ss 1) s))])
(get-arg-max viterbi s t a-matrix (cdr tags-index) (if (< (car res) cur)
(cons cur (+ ss 1))
res))))))
(define get-arg-max-end
(lambda (viterbi s t a-matrix tags-index
[res '(0 . 0)])
(if (null? tags-index)
res
(let* ([ss (cdar tags-index)]
[cur (* (matrix-ref viterbi (+ ss 1) t)
(matrix-ref a-matrix ss s))])
(get-arg-max-end viterbi s t a-matrix (cdr tags-index) (if (< (car res) cur)
(cons cur (+ ss 1))
res))))))
; @todo: address the unknown word issue
; build a list of observations' indexes
; @in
; obs: list of observations
; words-index: hashmap of (word, index)
; @out
; obs-index: list of observations' indexes
(define index-words
(lambda (obs words-index
[obs-index '()])
(if (null? obs)
(flatten obs-index)
(index-words (cdr obs) words-index (list obs-index (hash-ref words-index (car obs) 0))))))
; apply part-of-speech tagging on text
; @in
; obs: text to tag
; @out
; pos: list of pairs (observation, tag)
(define tag-it
(lambda (obs)
'()))
; @todo: optimize vars
; retrieve list of word/tag
(let* ([wts (add-sos (get-wts))]
; make a dictionary word/tag -> count
[wts-dict (list->dict wts)]
; extract words and tags in two lists
[wts-exploded (explode-list wts)]
[words (filter-sos (car wts-exploded))]
[tags-sos (cadr wts-exploded)]
; make a dictionary tag -> count
[tags-dict (list->dict tags-sos)]
; retrieve its size, minus start-of-sentence symbol
[tags-size (- (dict-count tags-dict) 1)]
; filter out duplicates of words list
[words-unique (filter-unique words)]
; retrieve its size
[words-size (length words-unique)]
; remove start-of-sentence symbols
[tags (filter-sos tags-sos)]
; filter out duplicates of tags list
[tags-unique (filter-unique tags)]
; build tags index (make-hash)?
[tags-index (index-list tags-unique)]
; build words index
[words-index (make-hash (index-list words-unique))]
; build A matrix
[a-matrix (get-a-matrix tags-dict tags-sos tags-unique)]
; build B matrix
[b-matrix (get-b-matrix wts-dict tags-dict words tags-unique)]
; the observations (words) to tag
[obs '("I" "want" "to" "try" "it")]
; number of rows of viterbi and backptr matrices
[n (+ (length tags-unique) 2)]
; number of cols of viterbi and backptr matrices
[m (+ (length obs) 1)]
; 0-filled list to populate viterbi and backptr matrices
[lst-0 (build-list (* n m) (lambda (x) 0))])
(printf "a-matrix: ~a\n b-matrix: ~a\n" (matrix-render a-matrix) (matrix-render b-matrix))
(printf "vit-init: ~a\n" (matrix-render (viterbi-init a-matrix b-matrix (hash-ref words-index (car obs)) tags-index (make-matrix n m lst-0))))
(printf "I: ~a want: ~a to: ~a try: ~a it ~a\n" (hash-ref words-index "I" -1) (hash-ref words-index "want" -1) (hash-ref words-index "to" -1) (hash-ref words-index "try" -1) (hash-ref words-index "it" -1))
(displayln (matrix-render (car (viterbi-end
(viterbi-loop
a-matrix
b-matrix
(index-words (cdr obs) words-index)
tags-index
(viterbi-init
a-matrix
b-matrix
(hash-ref words-index (car obs))
tags-index
(make-matrix n m lst-0))
(make-matrix n m lst-0))
a-matrix
(- n 3)
(- m 1)
tags-index)))))