main.py
import os
import threading
import time
from os import listdir
from os.path import exists, isfile, join
from urllib.parse import unquote

import requests
from bs4 import BeautifulSoup

# Constants
DIR_URLS = [
    'http://intranet.daiict.ac.in/~daiict_nt01/Academic/B%20Tech%20Students-Groups/',
    'http://intranet.daiict.ac.in/~daiict_nt01/Academic/Archives/Holidays/'
]
DEST_PATH = '/Users/meetpatel/Downloads'
IGNORE_LIST = ['.DS_Store', 'Thumbs.db']
DOWNLOAD_TIMEOUT = 10  # download timeout threshold, in seconds
CHUNK_SIZE = 1024  # streaming chunk size, in bytes
def scrape_dir(url, dest):
    """Recursively mirror the remote directory listing at `url` into `dest`.

    `url` is a single string (not a list) pointing to a remote directory
    listing; `dest` is the local download destination.
    """
    if not exists(dest):
        os.makedirs(dest)
    response = requests.get(url)
    webpage = BeautifulSoup(response.text, 'lxml')
    table_rows = webpage.find_all('tr')
    dirs_list = []
    dirs_links = []
    files_list = []
    files_links = []
    for item in table_rows:
        file_name = item.find('a')
        if file_name and file_name.text != "Name" and file_name.text != "Parent Directory":
            if file_name.text[-1] == '/':  # the row item is a directory
                dirs_list.append(file_name.text)
                dirs_links.append(url + file_name.get('href'))
            else:
                files_links.append(url + file_name.get('href'))
                files_list.append(file_name.text)
    # Compare the current local directory's contents with the remote listing
    check_current_dir(files_list, files_links, dest)
    # Recurse into subdirectories
    for j in range(len(dirs_links)):
        scrape_dir(dirs_links[j], join(dest, dirs_list[j]))
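# For reference, scrape_dir assumes an Apache-style autoindex page whose table
# rows look roughly like the sketch below (hypothetical markup; the exact HTML
# varies with the server configuration):
#
#   <tr><td><a href="Holidays%202020.pdf">Holidays 2020.pdf</a></td>
#       <td align="right">2020-01-01 10:00</td><td align="right">12K</td></tr>
#
# The header row ("Name") and the "Parent Directory" link are filtered out,
# and a trailing '/' on the link text marks a subdirectory.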
def check_current_dir(fileslist, fileslinks, local_dest):
    """Download every remote file that is not already present in local_dest."""
    available_files = [
        f for f in listdir(local_dest)
        if isfile(join(local_dest, f)) and f not in IGNORE_LIST
    ]
    thread_list = []
    for i in range(len(fileslist)):
        if fileslist[i] not in available_files and fileslist[i] not in IGNORE_LIST:
            t = threading.Thread(target=download_file_to_curr_dir,
                                 args=(fileslinks[i], local_dest))
            t.start()
            thread_list.append(t)
    for thread in thread_list:
        thread.join()
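# Design note: check_current_dir spawns one thread per missing file and joins
# them all before returning, so files within a directory download concurrently
# while the recursive directory walk stays sequential along each branch.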
def download_file_to_curr_dir(link, dest):
    download(link, join(dest, link.split('/')[-1]))

def download(url, file_name):
    file_name = unquote(file_name)  # decode percent-escapes (e.g. %20) in the local name
    try:
        response = requests.get(url, stream=True, timeout=DOWNLOAD_TIMEOUT)
        response.raise_for_status()
        print("downloading " + file_name + "...")
        with open(file_name, "wb") as file:
            for chunk in response.iter_content(chunk_size=CHUNK_SIZE):
                if chunk:  # filter out keep-alive chunks
                    file.write(chunk)
    except (requests.RequestException, OSError):
        if exists(file_name):  # remove any partial download
            os.remove(file_name)
        print("\n**Couldn't download file**: " + file_name)
        print("Download manually at: " + url + '\n')
# EXECUTE
def main():
    threadslist = []
    for dirurl in DIR_URLS:
        newdirname = unquote(dirurl.split('/')[-2])
        newdirname = join(DEST_PATH, newdirname)
        if not exists(newdirname):
            os.makedirs(newdirname)
            print("Made new directory: " + newdirname)
        # scrape each top-level directory on its own thread
        t = threading.Thread(target=scrape_dir, args=(dirurl, newdirname))
        t.start()
        threadslist.append(t)
    for thread in threadslist:
        thread.join()

if __name__ == '__main__':
    starttime = time.time()
    main()
    print("\nFinal execution time: " + str(time.time() - starttime) + " s")