-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMakeHumanClothesAssetsScraper.py
75 lines (67 loc) · 2.55 KB
/
MakeHumanClothesAssetsScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import requests
from bs4 import BeautifulSoup
import re
import os
import urllib.request
# Dictionary handed to every requests.get call in this script.
# NOTE(review): the Access-Control-* names are defined by the CORS
# specification as *response* headers; only User-Agent is a typical request
# header. They are kept here unchanged.
headers = {
    "Access-Control-Allow-Origin": "*",
    "Access-Control-Allow-Methods": "GET",
    "Access-Control-Allow-Headers": "Content-Type",
    "Access-Control-Max-Age": "3600",
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0",
}
def scraplinks(urli, timeout=30):
    """Collect the clothes detail-page URLs linked from one listing page.

    Args:
        urli: Absolute URL of the listing page to fetch.
        timeout: Seconds to wait for the server before requests gives up.

    Returns:
        A list of absolute URLs, in document order, built by prefixing the
        site root to every anchor href that contains "/clothes/" and does
        not contain "page". Duplicate hrefs are kept.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched
            (connection failure or timeout).
    """
    # The dict must be passed by keyword: the second positional parameter of
    # requests.get is `params`, so a positional dict is encoded into the
    # query string instead of being sent as request headers.
    req = requests.get(urli, headers=headers, timeout=timeout)
    soup = BeautifulSoup(req.content, 'html.parser')
    links = []
    for link in soup.find_all('a'):
        href = link.get('href')
        # Anchors without an href yield None (or "") and are skipped.
        if href and "/clothes/" in href and "page" not in href:
            links.append("http://makehumancommunity.org" + href)
    return links
def scrape_download_links(urli, timeout=30):
    """Download every asset file linked from one clothes detail page.

    Fetches "http://makehumancommunity.org/clothes/<urli>.html", looks at
    each anchor on the page, and saves every target whose href contains one
    of ".mhclo", ".obj", ".mhmat" or ".thumb" into "clothes/<urli>/", using
    the last path segment of the href as the file name.

    Args:
        urli: Asset name, i.e. the detail-page file name without ".html".
        timeout: Seconds to wait for the detail page before requests gives
            up. It does not apply to the file downloads themselves.

    Raises:
        requests.exceptions.RequestException: if the detail page cannot be
            fetched.
        urllib.error.URLError: if one of the file downloads fails.
    """
    print(urli)
    url = "http://makehumancommunity.org" + "/clothes/" + urli + ".html"
    # The dict must be passed by keyword: the second positional parameter of
    # requests.get is `params`, so a positional dict is encoded into the
    # query string instead of being sent as request headers.
    req = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(req.content, 'html.parser')
    wanted_markers = (".mhclo", ".obj", ".mhmat", ".thumb")
    target_dir = "clothes/" + urli
    for link in soup.find_all('a'):
        href = link.get('href')
        # Skip anchors without an href and links to anything but asset files.
        if not href or not any(marker in href for marker in wanted_markers):
            continue
        print(href)
        print('Beginning file download with urllib2...')
        filename = href.split("/")[-1]
        print(filename)
        # Make sure the destination exists so the function does not depend on
        # the caller having created it first.
        os.makedirs(target_dir, exist_ok=True)
        urllib.request.urlretrieve(href, target_dir + "/" + filename)
def create_directory(name):
    """Create directory *name*, including any missing parent directories.

    Does nothing when the directory already exists. Unlike a bare
    ``os.mkdir`` this works for nested paths such as "clothes/shirt" even
    when "clothes" has not been created yet, and it has no window between
    an existence check and the creation.

    Args:
        name: Path of the directory to create.

    Raises:
        FileExistsError: if *name* exists but is not a directory.
    """
    os.makedirs(name, exist_ok=True)
def replace_string(string, fromi, toi):
    """Return a copy of *string* with every occurrence of *fromi* replaced by *toi*."""
    replaced = string.replace(fromi, toi)
    return replaced
# Script body: walk the listing pages of the clothes catalogue and download
# the asset files of every item found. Pages 1 through 16 are hard-coded.

# The per-asset folders live under "clothes/"; create that parent first so
# the per-asset directory creation below cannot fail on a missing parent.
create_directory("clothes")

for pageno in range(1, 17):
    print("Scraping Page Number: "+str(pageno))
    scraped_links = scraplinks("http://makehumancommunity.org/clothes.html?page="+str(pageno))

    # Reduce each detail-page URL to the bare asset name by stripping the
    # site prefix and the ".html" suffix.
    fnames = []
    for x in scraped_links:
        g = replace_string(x, "http://makehumancommunity.org/clothes/", "")
        g = replace_string(g, ".html", "")
        # If the listing links to the same asset more than once, process it
        # only once (first occurrence wins, order preserved).
        if g not in fnames:
            fnames.append(g)

    for fname in fnames:
        create_directory("clothes/"+str(fname))
        scrape_download_links(fname)

print("done")