-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathweb_spider.py
93 lines (76 loc) · 2.11 KB
/
web_spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# Author: Mystik Developed
#
# Date: 1/3/2019
#
# Scan a target website
#
#!/usr/bin/env python
# Dependencies: requests is third-party; re, urlparse (Python 2 name) and argparse are stdlib
import requests
import re
import urlparse
import argparse
# Get necessary args
def getArgs(argv=None):
    """Parse and validate the command-line arguments.

    Args:
        argv: Optional list of argument strings to parse. When None
            (the default) argparse reads the process command line.

    Returns:
        The argparse namespace; its ``target`` attribute holds the URL
        to spider.

    Reports a usage error through ``parser.error`` (which raises
    SystemExit) when no target URL is supplied.
    """
    parser = argparse.ArgumentParser()

    # URL the spider starts from.
    parser.add_argument(
        "-t",
        "--target",
        dest="target",
        help="Target URL to spider")

    # The wordlist option is disabled, so the namespace has no
    # ``wordlist`` attribute and it must not be validated below.
    # parser.add_argument(
    #     "-w",
    #     "--wordlist",
    #     dest="wordlist",
    #     help="Wordlist to use")

    options = parser.parse_args(argv)

    # Only the target is mandatory.
    if not options.target:
        parser.error("[-] No URL provided. See --help for more info")
    return options
# Parse the command line as soon as the script is loaded; getArgs() calls
# parser.error() when no target URL is given.
options = getArgs()
# Root URL of the crawl; crawl() also uses it as the substring filter that
# decides which links are followed.
target_url = options.target
# Disabled together with the -w/--wordlist option in getArgs():
# wordlist = options.wordlist
# Every link found so far; crawl() appends to this list to avoid repeats.
target_links = []
def extract_links(url):
    """Fetch *url* and return the raw href values found in its body.

    Args:
        url: Address of the page to download.

    Returns:
        A list with the contents of every ``href="..."`` attribute, in
        document order and exactly as written in the page (relative
        links are not resolved here).

    NOTE(review): the pattern is matched against ``res.content``; this
    file targets Python 2 (it imports ``urlparse``), where that is a str.
    """
    # Fetch the page that was asked for, not always the crawl root.
    res = requests.get(url)
    return re.findall('(?:href=")(.*?)"', res.content)


def crawl(url):
    """Recursively spider the site starting at *url*.

    Each new link is appended to the module-level ``target_links`` list,
    printed, and then crawled in turn. Only links that contain
    ``target_url`` as a substring are followed.

    Args:
        url: Page whose links should be extracted and followed.
    """
    for link in extract_links(url):
        # Relative hrefs are relative to the page they were found on.
        link = urlparse.urljoin(url, link)
        # Drop the fragment so "page#a" and "page#b" count as one page.
        link = link.split("#")[0]
        if target_url in link and link not in target_links:
            target_links.append(link)
            print(link)
            crawl(link)
# Start spidering from the root URL supplied on the command line.
crawl(target_url)
# #subdomain list
# subdom_list = []
#
# #check for subdomains
# def subdom_check(url):
# try:
# with open(wordlist, "r") as wordlist:
# for line in wordlist:
# word = line.strip()
# test_subdom = word + "." + target_url
# res = subdom_check(test_subdom)
# if res:
# subdom_dict = {"subdomain": test_subdom, "res": res}
# print("[+] Subdomain --> " + test_subdom)
# subdom_list.append(subdom_dict)
# except requests.exceptions.ConnectionError:
# pass
#
#
# def subdom_results(results_list):
# print("\tSubdomain\t\t\tResponse Code\n-------------------------------------")
#
# for domain in results_list:
# print("\t" + domain["subdomain"] + "\t\t" + domain["res"])