-
Notifications
You must be signed in to change notification settings - Fork 21
/
Copy pathtweetprocess.py
84 lines (67 loc) · 2.84 KB
/
tweetprocess.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
"""
preprocess-twitter.py
python preprocess-twitter.py "Some random text with #hashtags, @mentions and http://t.co/kdjfkdjf (links). :)"
Script for preprocessing tweets by Romain Paulus
with small modifications by Jeffrey Pennington
with translation to Python by Motoki Wu
Translation of Ruby script to create features for GloVe vectors for Twitter data.
http://nlp.stanford.edu/projects/glove/preprocess-twitter.rb
"""
import sys
import re
# Flags passed to the regex calls in this module: DOTALL lets "." match
# newlines too, MULTILINE makes "^" and "$" match at every line boundary.
FLAGS = re.DOTALL | re.MULTILINE
def hashtag2(text):
    """Regex replacement callback that expands a hashtag into tagged words.

    ``text`` is the match object for a whole hashtag (leading ``#`` included).

    * An all-uppercase body becomes ``"<hashtag> BODY <allcaps>"``.
    * Any other body is broken in front of every capital letter, so
      ``#HelloWorld`` becomes ``"<hashtag> Hello World"``.

    Returns the replacement string.
    """
    hashtag_body = text.group()[1:]  # drop the leading "#"
    if hashtag_body.isupper():
        return "<hashtag> {} <allcaps>".format(hashtag_body)
    # re.split() on the zero-width pattern (?=[A-Z]) does not split at all
    # before Python 3.7 and yields a leading empty piece afterwards (which
    # produced a double space).  Collecting the pieces with findall behaves
    # the same on every version and never returns empty strings.
    pieces = re.findall(r"[A-Z][^A-Z]*|[^A-Z]+", hashtag_body)
    return " ".join(["<hashtag>"] + pieces)
def hashtag(text):
    """Regex replacement callback that deletes the matched hashtag.

    ``text`` (the match object) is ignored; the replacement is always the
    empty string.
    """
    return ""
def allcaps(text):
    """Regex replacement callback for runs of capital letters.

    ``text`` is the match object; the matched text is lowercased and the
    ``<allcaps>`` tag is appended directly after it (no separating space).
    """
    matched = text.group()
    return "{}<allcaps>".format(matched.lower())
def tokenize(text):
    """Normalise a tweet and return it as a list of lowercase tokens.

    Steps, in order:

    * URLs (``http(s)://...`` or the bare prefix ``www.``) are swapped for a
      private placeholder so the later rules cannot mangle them.
    * ``/`` characters are removed.
    * Emoticons become ``<smile>``, ``<lolface>``, ``<sadface>`` or
      ``<neutralface>``; ``<3`` becomes ``<heart>``.
    * Numbers become ``<number>``.
    * Hashtags are passed to ``hashtag``.
    * Runs of ``!``, ``?`` and ``.`` are removed.
    * A character repeated three or more times at the end of a word is
      collapsed to one occurrence followed by ``<elong>``.
    * ``(``, ``)``, ``"`` and ``:`` are removed.
    * Runs of two or more capital letters are passed to ``allcaps``.
    * The text is lowercased and split on whitespace.
    * Every token that is exactly a URL placeholder is replaced by the
      original URL text (original case preserved).  A placeholder that ended
      up glued to other characters is emitted as the literal ``<url>``.

    ``@mentions`` are not rewritten by any rule.

    Known limitation: a URL glued directly behind a hashtag is swallowed by
    the hashtag rule, which can shift later URL restorations by one.

    :param text: the raw tweet.
    :return: list of token strings.
    """
    # Different regex parts for smiley faces
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    # NOTE(review): the "www\." alternative matches only the prefix, so the
    # rest of such a host name stays in the text, glued to the placeholder.
    url_pattern = r"https?:\/\/\S+\b|www\."
    # Private stand-in for a URL while the other rules run.  It is not the
    # visible "<url>" tag, so a literal "<url>" typed in the tweet can never
    # be mistaken for a captured URL (previously that raised IndexError or
    # stole another URL's text).
    url_mark = "\x00url\x00"
    urls = []

    # function so code less repetitive
    def re_sub(pattern, repl):
        return re.sub(pattern, repl, text, flags=FLAGS)

    def stash_url(match):
        # Record the URL in the same pass that inserts its placeholder so the
        # two can never get out of step.
        urls.append(match.group())
        return url_mark

    text = re_sub(url_pattern, stash_url)
    text = re_sub(r"/", "")
    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
    text = re_sub(r"<3", "<heart>")
    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
    text = re_sub(r"#\S+", hashtag)
    text = re_sub(r"([!?.]){1,}", r"")
    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2<elong>")
    text = re_sub(r"[()\":]", "")
    # The Ruby script tags almost everything as <allcaps>; here the selection
    # is limited to runs of two or more capital letters.
    text = re_sub(r"([A-Z]){2,}", allcaps)

    tokens = []
    cursor = 0  # index into urls of the next placeholder to account for
    for token in text.lower().split():
        hits = token.count(url_mark)
        if token == url_mark and cursor < len(urls):
            token = urls[cursor]
        elif hits:
            # Placeholder glued to other text: emit the visible tag, exactly
            # as before, but still advance the cursor below so later URLs
            # are restored from the right entry.
            token = token.replace(url_mark, "<url>")
        cursor += hits
        tokens.append(token)
    return tokens
if __name__ == '__main__':
    # Usage: pass the tweet as the first command-line argument.  With no
    # argument (or the literal argument "test") a built-in sample is used.
    #
    # Other sample inputs that are handy for manual experiments:
    #   "I TEST alllll kinds of #hashtags and #HASHTAGS, @mentions and 3000 (http://t.co/dkfjkdf). w/ <3 :) haha!!!!!"
    #   "on 1 Fav Source+5 others like CNET News-Why Google Android is winning http://bit.ly/aW9QWn"
    text = sys.argv[1] if len(sys.argv) > 1 else "test"
    if text == "test":
        text = "on 1 Fav\" Source+5 :others like CNET News-Why Google Android is winning https://docs.python.org/2/library/re.html"
    tokens = tokenize(text)
    # Parenthesised form prints the same on Python 2 and is valid on Python 3.
    print(tokens)