-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathquery_handler.py
123 lines (100 loc) · 3.86 KB
/
query_handler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import json
import os
import shlex

import snscrape.modules.twitter as sntwitter
class Scraper:
    """Collects tweets matching a keyword + date-range query via snscrape.

    Two strategies are provided:
      * the snscrape Python API (``sntwitter.TwitterSearchScraper``);
      * the snscrape command-line tool (shelling out and parsing JSONL).
    Both return a list of plain dicts, one per tweet.
    """

    @staticmethod
    def api_scrape_tweets_by_content(since_date, until_date, keyword, max_tweets):
        """Scrape up to ``max_tweets`` tweets via the snscrape Python API.

        Args:
            since_date: inclusive start date string (``YYYY-MM-DD``).
            until_date: exclusive end date string (``YYYY-MM-DD``).
            keyword: free-text search term (Twitter search syntax allowed).
            max_tweets: maximum number of tweet records to collect.

        Returns:
            list[dict]: per-tweet records with id, author, link, content,
            mentioned users, verified flag and quote/RT information.
        """
        scraped_tweets = []
        query = (keyword + f' since:{since_date} '
                           f'until:{until_date} '
                           f'-filter:links '
                           f'-filter:nativeretweets ')
        for tweet in sntwitter.TwitterSearchScraper(query).get_items():
            # Stop once exactly max_tweets records are collected.  The
            # original post-append `i > max_tweets` check overshot by two.
            if len(scraped_tweets) >= max_tweets:
                break
            mentions = []
            if tweet.mentionedUsers is not None:
                for mentioned_user in tweet.mentionedUsers:
                    mentions.append(mentioned_user.username)
            is_rt = False
            who_was_rt = None
            if tweet.quotedTweet is not None:
                # Bug fix: quotedTweet is itself a Tweet object — the
                # author's handle lives on its `.user`, not on the tweet.
                who_was_rt = tweet.quotedTweet.user.username
                is_rt = True
            scraped_tweets.append({
                'Id': tweet.id,
                'Label': tweet.user.username,
                'link': f'https://twitter.com/{tweet.user.username}',
                'content': tweet.renderedContent,
                'mentioned_users': ' '.join(mentions),
                'verified': tweet.user.verified,
                'is_rt': is_rt,
                'who_was_rt': who_was_rt
            })
        return scraped_tweets

    @staticmethod
    def cli_scrape_tweets_by_content(since_date, until_date, keyword, max_tweets):
        """Scrape tweets by shelling out to the snscrape CLI.

        Writes the CLI's JSONL output to ``tmp_file.json``, parses it into
        dicts (richer than the API variant: adds follower count, protected
        flag and quoted-tweet content), then removes the temp file.

        Args:
            since_date: inclusive start date string (``YYYY-MM-DD``).
            until_date: exclusive end date string (``YYYY-MM-DD``).
            keyword: free-text search term (Twitter search syntax allowed).
            max_tweets: passed to ``--max-results``.

        Returns:
            list[dict]: per-tweet records including RT metadata.
        """
        # shlex.quote guards against shell injection through the
        # caller-supplied keyword/date strings.
        search = shlex.quote(f'{keyword} since:{since_date} until:{until_date}')
        os.system(f'snscrape '
                  f'--jsonl '
                  f'--max-results {max_tweets} '
                  f'twitter-search {search}'
                  f' > tmp_file.json')
        scraped_tweets = []
        # `with` guarantees the handle is closed even if json parsing raises.
        with open('tmp_file.json') as scraped_file:
            for user_tweet in scraped_file:
                data = json.loads(user_tweet)
                username = data['user']['username']
                verified = data['user']['verified']
                protected = data['user']['protected']
                followers = data['user']['followersCount']
                mentioned_users = data['mentionedUsers']
                quoted_tweet = data['quotedTweet']
                mentions = mentions_in_tweet(mentioned_users)
                quoted_tweet, is_rt = is_retweet(quoted_tweet)
                tweet_info = {
                    'Id': data['id'],
                    'Label': username,
                    'link': f'https://twitter.com/{username}',
                    'followers': followers,
                    'content': data['renderedContent'],
                    'mentioned_users': ' '.join(mentions),
                    'verified': verified,
                    'protected': protected,
                    'is_rt': is_rt,
                    'who_was_rt': quoted_tweet['who_was_rt'],
                    'content_of_rt': quoted_tweet['rt_content'],
                    'rt_id': quoted_tweet['rt_id']
                }
                scraped_tweets.append(tweet_info)
        os.remove("tmp_file.json")
        return scraped_tweets
# Aux methods to get specific tweet info
def is_retweet(quoted_tweet):
    """Report whether a tweet quotes (retweets) another tweet.

    Args:
        quoted_tweet: the ``quotedTweet`` payload from snscrape's JSONL
            output (a dict with 'user', 'content' and 'id'), or ``None``
            when nothing was quoted.

    Returns:
        tuple: an info dict with keys 'who_was_rt', 'rt_content' and
        'rt_id', and a bool that is True when a quoted tweet exists.
    """
    if quoted_tweet is None:
        # No quote: keep the same keys (all None) so callers index uniformly.
        return {'who_was_rt': None, 'rt_content': None, 'rt_id': None}, False
    return {
        'who_was_rt': quoted_tweet['user']['username'],
        'rt_content': quoted_tweet['content'],
        'rt_id': quoted_tweet['id'],
    }, True
def mentions_in_tweet(mentioned_users):
    """Return the deduplicated usernames mentioned in a tweet.

    Args:
        mentioned_users: list of user dicts (each with a 'username' key)
            from snscrape's JSONL output, or ``None`` when nobody is
            mentioned.

    Returns:
        list[str]: unique usernames in first-seen order, or the
        placeholder ``['No Mentions']`` when the input is ``None``.
    """
    if mentioned_users is None:
        return ['No Mentions']
    usernames = [user['username'] for user in mentioned_users]
    # dict.fromkeys keeps first-seen order while dropping repeats.
    return list(dict.fromkeys(usernames))
def remove_duplicates(mylist):
    """Return a copy of *mylist* with duplicates removed, order preserved."""
    seen = set()
    unique = []
    for item in mylist:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique