# watchbot.py
import tweepy
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import cookielib
import hashlib
import logging
import os
import re
import urllib2
import sqlite3
from watchconfig import * # import config
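# watchconfig.py is not included here; judging from the names used below it must define the
# Twitter credentials and the search/storage settings. A minimal sketch with illustrative
# placeholder values (the real values are whatever the deployment uses):
#
#   consumer_key = "..."
#   consumer_secret_key = "..."
#   access_token = "..."
#   access_token_secret = "..."
#   google_image_search_url = "https://www.google.com/search?tbm=isch&q=##"  # "##" is replaced with the query
#   user_agent = "Mozilla/5.0"
#   search_text = "some+subject"
#   search_site = "example.com"
#   excluded_sites = ["example.org"]
#   days_to_subtract = 1
#   image_path = "images/"
#   database_file = "watchbot.db"
#   tweet_message = "Today's image"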
# prevent automatic redirection
class NoRedirection(urllib2.HTTPErrorProcessor):
    def http_response(self, request, response):
        return response
    https_response = http_response
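# urllib2's default handlers follow 3xx responses automatically; installing this processor
# makes opener.open() hand back the 301/302 response itself so the Location header can be
# inspected, which is what capture_image_to_file() below relies on. Usage sketch
# (some_url is a placeholder):
#   opener = urllib2.build_opener(NoRedirection)
#   resp = opener.open(some_url)   # resp.code may be 301/302 instead of the final page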
# post actual tweets - can post just text or text and media
def tweet_update(status, filename=None):
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret_key)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth)
    if not status and not filename:
        logging.error("Nothing to tweet")
    else:
        if filename:
            api.update_with_media(filename, status)
        else:
            api.update_status(status)
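# Example calls (illustrative only; the media path is a placeholder):
#   tweet_update("hello world")                    # text-only tweet
#   tweet_update("hello world", "images/foo.jpg")  # tweet with attached media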
# create the google search url used for the image search
def google_search_url(lead_string, site_string):
    dt = datetime.now() - timedelta(days=days_to_subtract)
    try:
        searchstring = lead_string + "+" + dt.strftime("%-d+%B+%Y") + "+site:" + site_string  # %-d (no leading zero) is not supported on Windows
    except ValueError:
        searchstring = lead_string + "+" + dt.strftime("%d+%B+%Y") + "+site:" + site_string
    search_url = google_image_search_url
    search_url = search_url.replace("##", searchstring, 1)
    return search_url.lower()
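# Illustrative example (config values assumed): with search_text = "foo", search_site =
# "example.com" and days_to_subtract = 1, the string substituted into
# google_image_search_url would look like "foo+1+january+2015+site:example.com"
# (the whole URL is lower-cased by the return statement; the actual date depends on
# when the script runs).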
# execute the google search and do a basic link parse looking for hrefs to jpgs
def exec_google_search(url):
    logging.debug(url)
    header = {'User-Agent': user_agent}
    try:
        htmlresp = BeautifulSoup(urllib2.urlopen(urllib2.Request(url, headers=header)), "html.parser")
    except:
        logging.error("Initial Google search failed")
        return []
    imagesresp = [a['href'] for a in htmlresp.find_all("a", {"href": re.compile("jpg")})]
    return imagesresp
# so we can exclude image sites that are troublesome to fetch from
def safe_site(teststring):
    for each in excluded_sites:
        if each in teststring:
            logging.debug("Excluded site: " + teststring)
            return False
    return True
# parse out a list of possible images for downloading
def parse_images(images):
    possible_images = []
    for each in images:
        try:
            output = each.split('=')[1]
            output = output.split('&')[0]
            if safe_site(output):
                possible_images.append(output)
        except IndexError:
            logging.error("Split error")
    return possible_images
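# The splitting above assumes Google image-result links embed the real image URL as the
# first query parameter, something like "/imgres?imgurl=http://example.com/pic.jpg&imgrefurl=...",
# so taking the text between the first "=" and the next "&" isolates that URL. This depends
# on Google's result-page format and may stop working if that format changes.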
# walk the possible images checking each in the DB by URL to find one that isn't already downloaded
def find_unused_image(image_urls, conn):
    db = conn.cursor()
    image_url = None
    for each in image_urls:
        logging.debug(each + "?")
        db.execute("select rowid from images where url = ?", (each,))
        rowid = db.fetchall()
        if len(rowid) == 0:
            image_url = each
            break
    return image_url
# grab the image - handling redirection, then ensure we have a place to put the image, SHA256 hash the image
# to get a unique filename, and store it on disk
def capture_image_to_file(url):
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(NoRedirection, urllib2.HTTPCookieProcessor(cj))
    try:
        response = opener.open(url)
    except:
        return None
    if response.code != 200:
        if response.code == 302 or response.code == 301:
            url = response.headers['Location']
            logging.debug("Redirected to " + url)
        else:
            logging.error("error fetching image")
            logging.error(response.code)
            logging.error(response.headers)
            return None
    try:
        raw_img = urllib2.urlopen(url).read()
    except:
        logging.error("error downloading image")
        return None
    d = os.path.dirname(image_path)
    if not os.path.exists(d):
        try:
            os.makedirs(d)
        except OSError:
            logging.error("cannot create path")
            return None
    filename = image_path + hashlib.sha256(raw_img).hexdigest() + ".jpg"
    f = open(filename, 'wb')
    f.write(raw_img)
    f.close()
    return filename
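# The saved file ends up as image_path plus the SHA-256 hex digest of the image bytes, e.g.
# something like "images/3a7bd3e2....jpg" (digest shown truncated). Because the name is
# derived from the content, downloading identical bytes twice just overwrites the same file
# instead of creating a duplicate.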
# --- main ---
# change to the script's directory so it's easier to get to the db from within cron
abspath = os.path.abspath(__file__)
dname = os.path.dirname(abspath)
os.chdir(dname)
# open our database, and if it's uninitialized, set it up
conn = sqlite3.connect(database_file)
db = conn.cursor()
if os.path.getsize(database_file) < 100:
    db.execute("create table images(filename text, url text, date text)")
# find a unique unused image for posting
images = exec_google_search(google_search_url(search_text, search_site))
image_urls = parse_images(images)
logging.debug(image_urls)
image_url = find_unused_image(image_urls, conn)
logging.debug(image_url)
# if we found an image, store it in the DB and tweet it; if not, end on an error
if image_url:
    image_file = capture_image_to_file(image_url)
    logging.debug(image_file)
    logging.debug(datetime.now())
    db.execute("insert into images values (?, ?, ?)", (image_file, image_url, datetime.now()))
    conn.commit()
    conn.close()
    tweet_update(tweet_message, image_file)
else:
    conn.close()
    logging.error("Could not find a unique image")
    exit(-1)
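# The os.chdir() at the top of main exists so the relative database and image paths resolve
# when the script is run from cron. An example crontab entry (interpreter path and schedule
# are assumptions, not part of this repo):
#   0 12 * * * /usr/bin/python /home/user/watchbot/watchbot.py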