-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutils_nlp.py
118 lines (106 loc) · 3.52 KB
/
utils_nlp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import os
import random
import nltk
import wn
# Global singletons
# Shared WordNet handle: stays None until get_word_synonyms() creates it on
# its first call, after which the same object is reused.
wordnet = None
def get_word_synonyms(word, pos, limit: int = None):
    """
    Collects the synonyms of a word from the 'oewn:2023' WordNet lexicon.

    The WordNet handle is created on the first call and cached in the
    module-level `wordnet` singleton, so later calls reuse it.

    Args:
        word: The word to look up.
        pos: WordNet part-of-speech tag; exactly one of 'a', 'n', 'v', 'r'.
        limit: Maximum number of synonyms to return. When given, a random
            selection of at most `limit` synonyms is returned (in random
            order); when None, every synonym is returned in sorted order.

    Returns:
        A list of the lemmas found in the synsets of `word` for the given
        part of speech, without duplicates and without `word` itself.

    Raises:
        AssertionError: If `pos` is not one of the four accepted tags.
    """
    # Compare against the individual tags: a substring test on 'anvr' would
    # also accept '' and multi-character values such as 'an'.
    # AssertionError is kept for backward compatibility, but raised explicitly
    # so the check is not stripped when Python runs with -O.
    if pos not in ('a', 'n', 'v', 'r'):
        raise AssertionError("Invalid POS tag")

    global wordnet

    # Load the wordnet
    if wordnet is None:
        wordnet = wn.Wordnet('oewn:2023')

    # Find the set of synonyms. Each synset already exposes the lemmas of all
    # of its senses, so there is no need to go through the senses one by one.
    synonyms = {
        lemma
        for synset in wordnet.synsets(word, pos=pos)
        for lemma in synset.lemmas()
    }
    synonyms.discard(word)

    # Sort so the result does not depend on set iteration order; this also
    # makes the random subset below reproducible under a seeded `random`.
    synonyms = sorted(synonyms)

    # Return a subset
    if limit is not None:
        random.shuffle(synonyms)
        synonyms = synonyms[:limit]

    return synonyms
# UPenn tags that have a WordNet counterpart, mapped to the WordNet POS tag.
_UPENN_TO_WN_TAG = {
    # Adjectives
    'JJ': 'a',    # adjective or numeral, ordinal
    'JJR': 'a',   # adjective, comparative
    'JJS': 'a',   # adjective, superlative
    # Nouns
    'NN': 'n',    # noun, common, singular or mass
    'NNP': 'n',   # noun, proper, singular
    'NNPS': 'n',  # noun, proper, plural
    'NNS': 'n',   # noun, common, plural
    # Adverbs
    'RB': 'r',    # adverb
    'RBR': 'r',   # adverb, comparative
    'RBS': 'r',   # adverb, superlative
    # Verbs
    'VB': 'v',    # verb, base form
    'VBD': 'v',   # verb, past tense
    'VBG': 'v',   # verb, present participle or gerund
    'VBN': 'v',   # verb, past participle
    'VBP': 'v',   # verb, present tense, not 3rd person singular
    'VBZ': 'v',   # verb, present tense, 3rd person singular
}

# Valid UPenn tags that have no WordNet counterpart.
_UPENN_IRRELEVANT_TAGS = frozenset(
    # Single-character punctuation tags
    list("$`'(),-.:") + [
        # Multi-character punctuation tags
        "''",    # closing quotation mark
        '--',    # dash
        '``',    # opening quotation mark
        'CC',    # coordinating conjunction (and, both, ...)
        'CD',    # numeral, cardinal
        'DT',    # determiner
        'EX',    # existential there
        'FW',    # foreign word
        'IN',    # preposition or conjunction, subordinating
        'LS',    # list item marker
        'MD',    # modal auxiliary
        'PDT',   # pre-determiner
        'POS',   # genitive marker
        'PRP',   # pronoun, personal (hers, herself, him, himself, ...)
        'PRP$',  # pronoun, possessive (her, his, mine, ...)
        'RP',    # particle (aboard, about, across, upon, ...)
        'SYM',   # symbol
        'TO',    # "to" as preposition or infinitive marker
        'UH',    # interjection (Goodbye, Goody, Gosh, Wow, ...)
        'WDT',   # WH-determiner (that, what, whatever, ...)
        'WP',    # WH-pronoun (that, what, whatever, whatsoever, ...)
        'WP$',   # WH-pronoun, possessive (whose)
        'WRB',   # Wh-adverb (how, however, whence, whenever, ...)
    ]
)


def upenn_to_wn_tag(upenn_tag: str, quiet=True):
    """
    Converts a UPenn tag to one of the four accepted tags by WordNet (a, v, n, r).

    Returns None if the tag is irrelevant in the context of WordNet (for
    example punctuation, determiners or pronouns) or if it is not a UPenn tag
    at all. In the latter case a warning is printed to stdout unless quiet
    mode is enabled.

    More info on UPenn tags:
    ```
    import nltk
    nltk.help.upenn_tagset()
    ```

    Args:
        upenn_tag: The UPenn part-of-speech tag to convert.
        quiet: When False, print a warning for tags that are not part of the
            UPenn tagset.

    Returns:
        'a', 'n', 'r' or 'v' for adjective, noun, adverb and verb tags
        respectively; None otherwise.
    """
    wn_tag = _UPENN_TO_WN_TAG.get(upenn_tag)
    if wn_tag is not None:
        return wn_tag

    # Only tags outside the UPenn tagset deserve a warning; known tags without
    # a WordNet counterpart are silently mapped to None.
    if upenn_tag not in _UPENN_IRRELEVANT_TAGS and not quiet:
        # TODO: Log instead of print
        print(f'Warning: The given tag ({upenn_tag}) is not a valid UPenn tag.')
    return None
if __name__ == '__main__':
    # One-off setup: fetch the data files that this module's helpers rely on.

    # Download WordNet database.
    # Use the wn API directly rather than shelling out to `python -m wn`:
    # a bare `python` on PATH may belong to a different environment than the
    # interpreter running this script, and os.system() hides any failure.
    wn.download('oewn:2023')

    # Download NLTK packages
    nltk.download('averaged_perceptron_tagger')
    nltk.download('tagsets')
    nltk.download('punkt')