-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathlexcatter.py
111 lines (99 loc) · 4.84 KB
/
lexcatter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from supersenses import NSS, VSS, PSS
def compute_lexcat(tokNum, smwe, smweGroupToks, ss, lexlemma, poses, rels):
"""
The lexical category, or LexCat, is the syntactic category of a strong
lexical expression, which may be a single-word or multiword expression.
The value of LexCat is determined in part from the UPOS (Universal POS)
and XPOS (for English, PTB tag), as follows:
1. If the expression has been annotated as ... the LexCat is ...
`c CCONJ
`j ADJ
`r ADV
`n NOUN
`o PRON
`v VERB
`d DISC
`i INF
`a AUX (note: the UPOS should usually be AUX)
2. If the expression has been annotated with a ... supersense, the LexCat is ...
noun N
verb V
3. If the expression has been annotated with an adposition supersense or `$:
a. If XPOS is ... the LexCat is ...
PRP$ PRON.POSS
WP$ PRON.POSS
POS POSS
TO INF.P
(all `$ tokens should be matched by one of these conditions)
b. If a multiword expression:
i. If the last token of the MWE has UPOS of `ADP` or `SCONJ`, LexCat is `P`.
ii. Otherwise, LexCat is `PP`.
c. Otherwise, LexCat is `P`.
4. Other tokens with UPOS of `NOUN`, `VERB`, `ADP`, or XPOS of `POS`, `PRP$`, `WP$`, `TO`: need examination. [N.B. `PART` = `TO` + `POS` + negative markers]
5. Otherwise, if the UPOS is `PART`, LexCat is `ADV` (negative markers).
6. Other strong MWEs need examination. Some special cases are handled.
7. Otherwise, the LexCat is the same as the UPOS.
The script that performs automatic assignment should have an option to suffix any default (non-human) annotations with `@` and any problematic cases with `!`.
"""
if smwe!='_' and not smwe.endswith(':1'):
# non-initial token in MWE
return '_'
lc = {'`a': 'AUX', '`c': 'CCONJ', '`d': 'DISC', '`i': 'INF', '`j': 'ADJ',
'`n': 'N', '`o': 'PRON', '`r': 'ADV', '`v': 'V', '??': '??'}.get(ss)
if lc is not None: return lc
if ss.isalpha() and ss.isupper(): return 'N'
if ss.isalpha() and ss.islower(): return 'V'
upos, xpos = poses[tokNum-1]
if ss=='`$' or (ss[0].isupper() and ss[1].islower()):
lc = {'PRP$': 'PRON.POSS', 'WP$': 'PRON.POSS', 'POS': 'POSS', 'TO': 'INF.P'}.get(xpos)
if lc is not None: return lc
assert ss!='`$'
if smwe!='_':
if poses[smweGroupToks[-1]-1][0] in ('ADP','SCONJ'):
return 'P'
return 'PP'
return 'P'
if upos in ('NOUN', 'VERB', 'ADP') or xpos in ('POS', 'PRP$', 'WP$', 'TO'):
return '!!@'
if upos=='PART':
return 'ADV'
if smwe!='_':
if upos=='DET':
if lexlemma in ('a lot', 'a couple', 'a few', 'a little', 'a bit', 'a number', 'a bunch'):
return 'DET'
elif lexlemma in ('no one', 'every one', 'every thing', 'each other', 'some place'):
return 'PRON'
if upos=='AUX':
if lexlemma in ('might as well',):
return 'AUX'
head,rel = rels[tokNum-1]
if head in smweGroupToks:
return compute_lexcat(head, '_', smweGroupToks, ss, lexlemma, poses, rels)
"""if upos=='ADJ' and lc=='!!':
# ADJ-NOUN compounds functioning as adjectives: the following occur in Reviews
if lexlemma in ('first - class', 'first class', 'first - time', 'first time',
'full - time', 'full time', 'high - dollar', 'high dollar',
'high - end', 'high end', 'high - tech', 'high tech',
'old - time', 'old time', 'top - notch', 'top notch'):
return 'ADJ'
if lexlemma in ('bottom line', 'parisian croissant', 'straight edge'):
return 'N'
if lexlamma in ('happy new year', 'good luck', 'good job',
'sad face', 'holy cow'):
return 'DISC'
"""
else:
assert upos!='X' # X is used for goeswith (also 'sub <advmod par').
# In those cases the head should be in the MWE.
return '!@'
return upos
def supersenses_for_lexcat(lc): # specific to English
if lc=='N': return NSS
if lc=='V' or lc.startswith('V.'):
if lc!='V':
assert lc in {'V.VID', 'V.VPC.full', 'V.VPC.semi', 'V.LVC.full', 'V.LVC.cause', 'V.IAV'},lc # PARSEME 1.1 verbal MWE subtypes
return VSS
if lc in ('P', 'PP', 'INF.P'): return (PSS - {'p.Content'})
if lc in ('POSS', 'PRON.POSS'): return (PSS - {'p.Content'}) | {'`$'}
ALL_LEXCATS = {'N', 'PRON', 'V', 'P', 'PP', 'INF', 'INF.P', 'POSS', 'PRON.POSS', 'DISC', 'AUX',
'ADJ', 'ADV', 'DET', 'CCONJ', 'SCONJ', 'INTJ', 'NUM', 'SYM', 'PUNCT', 'X'}