# coding: utf-8
"""
Author: Sparrow
Purpose: download the images of the photo sets on www.du114.com, one image page at a time.
Created: 2016-07-04
Email: sparrow629@163.com
"""
from __future__ import print_function
from bs4 import BeautifulSoup
import urllib
import re
import os

URLs = {1: 'http://www.du114.com/gaoqingtaotu/xiuren/',
        2: 'http://www.du114.com/a/Mygirl/',
        3: 'http://www.du114.com/a/Beautyleg/',
        4: 'http://www.du114.com/a/TGOD/',
        5: 'http://www.du114.com/gaoqingtaotu/TuiGirl/',
        6: 'http://www.du114.com/gaoqingtaotu/youguowang/',
        7: 'http://www.du114.com/a/s-cute/',
        8: 'http://www.du114.com/a/TopQueen/',
        9: 'http://www.du114.com/a/DGC/',
        10: 'http://www.du114.com/a/Bomb.tv/',
        11: 'http://www.du114.com/a/RQ_STARxiezhen/',
        12: 'http://www.du114.com/a/Tpimage/',
        13: 'http://www.du114.com/a/3Agirl/',
        14: 'http://www.du114.com/a/simei/',
        15: 'http://www.du114.com/a/rosi/',
        16: 'http://www.du114.com/a/ligui/',
        17: 'http://www.du114.com/a/disi/',
        18: 'http://www.du114.com/a/Ru1mm/',
        19: 'http://www.du114.com/a/siwameinv/'
        }
Category = {1: '秀人套图',
            2: '美媛馆',
            3: 'Beautyleg写真',
            4: '推女神',
            5: 'TuiGirl推女郎',
            6: '尤果网',
            7: 's-cute',
            8: 'TopQueen',
            9: 'DGC',
            10: 'Bombtv',
            11: 'RQ_STAR写真',
            12: 'Tpimage',
            13: '3Agirl',
            14: '丝魅VIP',
            15: 'ROSI写真',
            16: '丽柜写真',
            17: 'disi印象',
            18: 'Ru1mm如壹',
            19: '丝袜系列套图',
            20: 'download everything',
            0: 'paste the first-page URL of a photo set yourself',
            111: 'download one tag page by URL'
            }
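# Keys 1-19 above are listing categories scraped directly; 20, 0 and 111 are
# special menu entries handled in the __main__ loop below.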
def getHtml(url):
    """Fetch a URL and return the raw HTML."""
    page = urllib.urlopen(url)
    html = page.read()
    return html
def getImage(html, mode):
    """Download the single image shown on one photo page.

    mode 1: save under imgdownload/<category>/
    mode 3: save under imgdownload/<category>/<theme>/ (Themename set by Findallpic)
    mode 0: save under imgdownload/<tag folder>/ (Foldername set by getTagImg)
    """
    Soup = BeautifulSoup(html, 'lxml')
    imagetag = Soup.select('#picBody > p > a > img')[0]
    filenametag = Soup.select('body > div.w1200 > div.w960.r > div.articleBox > div.articleTitle > h1')
    filename = filenametag[0].get_text()
    imgurl = imagetag.get('src')
    if mode == 1:
        foldernames = Soup.select('body > div.w1200 > div.w960.r > div.articleBox > div.position > a')[-1]
        foldername = foldernames.get_text()
        path = 'imgdownload/%s/' % foldername
    elif mode == 3:
        foldernames = Soup.select('body > div.w1200 > div.w960.r > div.articleBox > div.position > a')[-1]
        foldername = foldernames.get_text()
        path = 'imgdownload/%s/%s/' % (foldername, Themename)
    elif mode == 0:
        path = 'imgdownload/%s/' % Foldername
    else:
        raise ValueError('unknown path mode: %s' % mode)
    if not os.path.exists(path):
        os.makedirs(path)
    target = path + '%s.jpg' % filename
    urllib.urlretrieve(imgurl, target)
    print(filename, imgurl, sep='\n')
    print("Downloading file to %s" % target)
def Findallpic(html, firsturl):
    """Return {index: url} for every image page of one photo set.

    Page 1 is firsturl itself; pages 2..N follow the pattern
    <firsturl without '.html'>_<n>.html, with N read from the pagination text.
    """
    Soup = BeautifulSoup(html, 'lxml')
    global Themename
    Themenametag = Soup.select('body > div.w1200 > div.w960.r > div.articleBox > div.articleTitle > h1')
    Themename = Themenametag[0].get_text()
    pagenumtag = Soup.select('body > div.w1200 > div.w960.r > div.articleBox > div.pages > ul > li > a')
    pagenumtext = pagenumtag[0].get_text()
    reg_num = r'[0-9]*'
    finalpagenumber = int(re.findall(reg_num, pagenumtext)[1])  # picks the image count out of the pager text
    PicUrllist = {1: firsturl}
    firsturllink = firsturl[:-5]  # strip the trailing '.html'
    print('-----------------------------------------------------------')
    print('This photo set has %s images' % finalpagenumber)
    for i in range(2, finalpagenumber + 1):
        picurl = firsturllink + '_' + str(i) + '.html'
        PicUrllist[i] = picurl
    return PicUrllist
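
# Option 20 ('download everything') is marked work-in-progress in the menu below.
# The following is a minimal sketch of how one photo set could be downloaded with
# a thread pool; DownloadThemeConcurrently, fetch_one and the worker count are
# hypothetical choices of ours, not part of the original script. Note that
# getImage's os.path.exists/os.makedirs pair is racy under threads, so the
# target folder should ideally be created before the pool starts.
from multiprocessing.dummy import Pool as ThreadPool  # thread-backed Pool from the stdlib

def DownloadThemeConcurrently(theme_url, mode, workers=4):
    """Sketch: download every image page of one photo set in parallel."""
    pics = Findallpic(getHtml(theme_url), theme_url)  # {index: image page url}

    def fetch_one(page_url):
        getImage(getHtml(page_url), mode)

    pool = ThreadPool(workers)
    pool.map(fetch_one, pics.values())
    pool.close()
    pool.join()
    return len(pics)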
def ListCategoryUrl(num):
    """Print one menu entry and return its category URL."""
    print(num, Category[num], URLs[num], sep=' => ')
    return URLs[num]
def FindAllPage(html, currenturl):
    """Return {page_number: page_url} for every listing page of a category."""
    Soup = BeautifulSoup(html, 'lxml')
    finalpagetag = Soup.select('body > div.w1200 > div.w960.r > div.pages > ul > li > a')
    if finalpagetag:
        finalpage = finalpagetag[-1].get('href')
        reg_page = r'list_.*_(.*)\.html'
        finalpagenumber = int(re.findall(reg_page, finalpage)[0])
        reg_list = r'(list_.*_).*\.html'
        page_list = re.findall(reg_list, finalpage)[0]
        PageList = {}
        print("Found %s listing pages" % finalpagenumber, page_list)
        for i in range(1, finalpagenumber + 1):
            page = currenturl + page_list + str(i) + '.html'
            PageList[i] = page
    else:
        PageList = {1: currenturl}
    return PageList
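# Example (illustrative numbers): if the last pager link on a category page is
# 'list_5_12.html', FindAllPage returns {1: currenturl + 'list_5_1.html', ...,
# 12: currenturl + 'list_5_12.html'}; single-page categories map to {1: currenturl}.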
def SelectTheme(html, key):
    """Print and return {index: url} for every photo set on one listing page."""
    Soup = BeautifulSoup(html, 'lxml')
    ThemeTag = Soup.select('#imgList > ul > li > span.both.title > a')
    Themelist = {}
    Themecount = key * 100 + 1  # index sets as page_number * 100 + position
    for tag in ThemeTag:
        Themelist[Themecount] = tag.get('href')
        print(Themecount, tag.get('title'), sep='=>')
        Themecount += 1
    return Themelist
def PathMode():
    """Ask whether images go into one shared folder (1) or per-set subfolders (3)."""
    pathMode = 0
    while pathMode not in (1, 3):
        print("-----------------------------------------\n",
              "Enter 1 to save all images in one directory\n",
              "Enter 3 to save each photo set in its own subdirectory")
        pathMode = int(raw_input('Choose how to organise the downloaded images: '))
    return pathMode
def getTagImg(html):
    """Download every photo set listed on a tag page into one folder."""
    Soup = BeautifulSoup(html, 'lxml')
    global Foldername
    foldernames = Soup.select('body > div.w1200 > div.w960.r > div > div.top_content > h1 > a')[-1]
    Foldername = foldernames.get_text()
    print(foldernames, Foldername, sep='\n')
    Themelist = SelectTheme(html, 0)
    print(Themelist)
    for key1 in Themelist:
        Theme_url = Themelist[key1]
        Pics_url_list = Findallpic(getHtml(Theme_url), Theme_url)
        pic_count = 0
        for key2 in Pics_url_list:
            getImage(getHtml(Pics_url_list[key2]), 0)
            pic_count += 1
        print('-------------------- Downloaded %s images --------------------' % pic_count)
if __name__ == '__main__':
    # 1. Test downloading a single photo set
    # Copy_url = 'http://www.du114.com/gaoqingtaotu/xiuren/109837.html'
    # Pics_url_list = Findallpic(getHtml(Copy_url), Copy_url)
    # pic_count = 0
    # for key in Pics_url_list:
    #     getImage(getHtml(Pics_url_list[key]), 3)
    #     pic_count += 1
    # print('-------------------- Downloaded %s images --------------------' % pic_count)
    # 2. Test downloading a tag page
    # Copy_url = 'http://www.du114.com/tag/1295.html'
    # getTagImg(getHtml(Copy_url))
    Quit_program = 'Y'
    while not Quit_program == 'N':
        for key in URLs:
            print(key, Category[key], URLs[key], sep=' => ')
        print('20 =>', Category[20])
        print('0 =>', Category[0])
        print('111 =>', Category[111])
        num = -1
        while num not in Category:
            num = int(raw_input('Enter the number of a category to download: '))
        if num in URLs:
            category_url = ListCategoryUrl(num)
            category_html = getHtml(category_url)
            Pagelists = FindAllPage(category_html, category_url)
            ThemeLists = {}
            for key in Pagelists:
                print("page %s" % key, Pagelists[key], sep='=>')
                pagehtml = getHtml(Pagelists[key])
                themelist = SelectTheme(pagehtml, key)
                for i in themelist:
                    ThemeLists[i] = themelist[i]  # merge each page's sets so every first-page URL is known
            print("Found %s photo sets in total" % len(ThemeLists))
            print('''
Above are the photo set titles on each page; scroll up to browse them, then
choose a download mode:
1. Download all cover/preview images of this category first
2. Download page by page
3. Download a single photo set
4. Download every photo set in this category
''')
            Mode = {1: 'download all preview images of this category first',
                    2: 'download page by page',
                    3: 'download a single photo set',
                    4: 'download every photo set in this category'
                    }
            select_downloading_mode = 0
            while select_downloading_mode not in Mode:
                select_downloading_mode = int(raw_input('Enter the number of a download mode: '))
            print('You chose %s. %s' % (select_downloading_mode, Mode[select_downloading_mode]))
            if select_downloading_mode == 1:
                for key in ThemeLists:
                    themehtml = getHtml(ThemeLists[key])
                    getImage(themehtml, select_downloading_mode)
                print('Downloaded %s preview images' % len(ThemeLists))
            elif select_downloading_mode == 2:
                chose_quit = 'Y'
                while not chose_quit == 'N':
                    for key in Pagelists:
                        print("page %s" % key, Pagelists[key], sep='=>')
                    pageNumber = 0
                    while pageNumber not in Pagelists:
                        pageNumber = int(raw_input('Enter the page number to download: '))
                    print('Photo sets on page %s:' % pageNumber)
                    pagehtml = getHtml(Pagelists[pageNumber])
                    current_page_themelist = SelectTheme(pagehtml, pageNumber)
                    Path_Mode = PathMode()
                    ent = raw_input('Press [q] to cancel, or Enter to continue: ')
                    if not (ent == 'q'):
                        pic_count = 0  # count actual downloads instead of estimating
                        for theme_key in current_page_themelist:
                            list_url = current_page_themelist[theme_key]
                            PicUrlList = Findallpic(getHtml(list_url), list_url)
                            for pic_key in PicUrlList:
                                pic_url = PicUrlList[pic_key]
                                getImage(getHtml(pic_url), Path_Mode)
                                pic_count += 1
                        print('-------------------- Downloaded %s images --------------------' % pic_count)
                    chose_quit = raw_input('Press [Y] to keep downloading, [N] to quit: ')
            elif select_downloading_mode == 3:
                chose_quit = 'Y'
                while not chose_quit == 'N':
                    Theme_number = 0
                    while Theme_number not in ThemeLists:
                        Theme_number = int(raw_input('Enter the number shown next to a photo set above: '))
                    Theme_url = ThemeLists[Theme_number]
                    Pics_url_list = Findallpic(getHtml(Theme_url), Theme_url)
                    pic_count = 0
                    for key in Pics_url_list:
                        getImage(getHtml(Pics_url_list[key]), select_downloading_mode)
                        pic_count += 1
                    print('-------------------- Downloaded %s images --------------------' % pic_count)
                    chose_quit = raw_input('Press [Y] to keep downloading, [N] to quit: ')
            elif select_downloading_mode == 4:
                Path_Mode = PathMode()
                pic_count = 0
                for key1 in ThemeLists:
                    Theme_url = ThemeLists[key1]
                    Pics_url_list = Findallpic(getHtml(Theme_url), Theme_url)
                    for key2 in Pics_url_list:
                        getImage(getHtml(Pics_url_list[key2]), Path_Mode)
                        pic_count += 1
                print('-------------------- Downloaded %s images --------------------' % pic_count)
        else:
            print(Category[num])
            if num == 0:
                chose_quit = 'Y'
                while not chose_quit == 'N':
                    Copy_url = raw_input('Paste the URL of the photo set first page here: ')
                    Pics_url_list = Findallpic(getHtml(Copy_url), Copy_url)
                    pic_count = 0
                    for key in Pics_url_list:
                        getImage(getHtml(Pics_url_list[key]), 3)
                        pic_count += 1
                    print('-------------------- Downloaded %s images --------------------' % pic_count)
                    chose_quit = raw_input('Press [Y] to keep downloading, [N] to quit: ')
            if num == 111:
                chose_quit = 'Y'
                while not chose_quit == 'N':
                    Copy_url = raw_input('Paste the URL of the tag page here: ')
                    getTagImg(getHtml(Copy_url))
                    chose_quit = raw_input('Press [Y] to keep downloading, [N] to quit: ')
            if num == 20:
                print("Multithreaded download of everything is still a work in progress "
                      "(see the DownloadThemeConcurrently sketch above)")
                # break
        Quit_program = raw_input('Press [Y] to keep downloading, [N] to exit the program: ')