From c3e04024f5beb5132b6a04eb21e70077ea63c54c Mon Sep 17 00:00:00 2001 From: "@_tID" Date: Tue, 18 Sep 2018 14:46:51 +0530 Subject: [PATCH] verbose output + animation fix for python2 --- photon.py | 1224 +++++++++++++++++++++++++++-------------------------- 1 file changed, 624 insertions(+), 600 deletions(-) diff --git a/photon.py b/photon.py index e0af76a..cf36605 100644 --- a/photon.py +++ b/photon.py @@ -1,600 +1,624 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -from __future__ import print_function - -# Let's import what we need -import os -import sys -import tld -import time -import random -import warnings -import argparse -import threading -from math import log -from re import search, findall -from requests import get, post - -try: - import concurrent.futures - from urllib.parse import urlparse # for python3 - python2, python3 = False, True -except ImportError: - from urlparse import urlparse # for python2 - python2, python3 = True, False - -from core.config import intels, badTypes - -try: - input = raw_input -except NameError: - pass - -colors = True # Output should be colored -machine = sys.platform # Detecting the os of current system -if machine.lower().startswith(('os', 'win', 'darwin', 'ios')): - colors = False # Colors shouldn't be displayed in mac & windows -if not colors: - end = red = white = green = yellow = run = bad = good = info = que = '' -else: - end = '\033[1;m' - red = '\033[91m' - white = '\033[1;97m' - green = '\033[1;32m' - yellow = '\033[1;33m' - run = '\033[1;97m[~]\033[1;m' - bad = '\033[1;31m[-]\033[1;m' - good = '\033[1;32m[+]\033[1;m' - info = '\033[1;33m[!]\033[1;m' - que = '\033[1;34m[?]\033[1;m' - -# Just a fancy ass banner -print('''%s ____ __ __ - / %s__%s \/ /_ ____ / /_____ ____ - / %s/_/%s / __ \/ %s__%s \/ __/ %s__%s \/ __ \\ - / ____/ / / / %s/_/%s / /_/ %s/_/%s / / / / - /_/ /_/ /_/\____/\__/\____/_/ /_/ %sv1.1.1%s\n''' % - (red, white, red, white, red, white, red, white, red, white, red, white, red, white, end)) - -warnings.filterwarnings('ignore') # Disable SSL related warnings - -# Processing command line arguments -parser = argparse.ArgumentParser() -# Options -parser.add_argument('-u', '--url', help='root url', dest='root') -parser.add_argument('-c', '--cookie', help='cookie', dest='cook') -parser.add_argument('-r', '--regex', help='regex pattern', dest='regex') -parser.add_argument('-e', '--export', help='export format', dest='export') -parser.add_argument('-o', '--output', help='output directory', dest='output') -parser.add_argument('-l', '--level', help='levels to crawl', dest='level', type=int) -parser.add_argument('-t', '--threads', help='number of threads', dest='threads', type=int) -parser.add_argument('-d', '--delay', help='delay between requests', dest='delay', type=float) -parser.add_argument('-s', '--seeds', help='additional seed urls', dest='seeds', nargs="+", default=[]) - -parser.add_argument('--stdout', help='send variables to stdout', dest='std') -parser.add_argument('--user-agent', help='custom user agent(s)', dest='user_agent') -parser.add_argument('--exclude', help='exclude urls matching this regex', dest='exclude') -parser.add_argument('--timeout', help='http request timeout', dest='timeout', type=float) - -# Switches -parser.add_argument('--dns', help='enumerate subdomains & dns data', dest='dns', action='store_true') -parser.add_argument('--ninja', help='ninja mode', dest='ninja', action='store_true') -parser.add_argument('--keys', help='find secret keys', dest='api', action='store_true') 
-parser.add_argument('--update', help='update photon', dest='update', action='store_true') -parser.add_argument('--only-urls', help='only extract urls', dest='only_urls', action='store_true') -parser.add_argument('--wayback', help='fetch urls from archive.org as seeds', dest='archive', action='store_true') -args = parser.parse_args() - -#### -# This function git clones the latest version and merges it with the current directory -#### - -def update(): - print('%s Checking for updates' % run) - changes = '''bug fixes;minor refactor;added --stdout option''' # Changes must be seperated by ; - latest_commit = get('https://raw.githubusercontent.com/s0md3v/Photon/master/photon.py').text - - if changes not in latest_commit: # just a hack to see if a new version is available - changelog = search(r"changes = '''(.*?)'''", latest_commit) - changelog = changelog.group(1).split(';') # splitting the changes to form a list - print('%s A new version of Photon is available.' % good) - print('%s Changes:' % info) - for change in changelog: # print changes - print('%s>%s %s' % (green, end, change)) - - current_path = os.getcwd().split('/') # if you know it, you know it - folder = current_path[-1] # current directory name - path = '/'.join(current_path) # current directory path - choice = input('%s Would you like to update? [Y/n] ' % que).lower() - - if choice != 'n': - print('%s Updating Photon' % run) - os.system('git clone --quiet https://github.com/s0md3v/Photon %s' % (folder)) - os.system('cp -r %s/%s/* %s && rm -r %s/%s/ 2>/dev/null' % (path, folder, path, path, folder)) - print('%s Update successful!' % good) - else: - print('%s Photon is up to date!' % good) - -if args.update: # if the user has supplied --update argument - update() - quit() # quitting because files have been changed - -if args.root: # if the user has supplied a url - main_inp = args.root - if main_inp.endswith('/'): # if the url ends with '/' - main_inp = main_inp[:-1] # we will remove it as it can cause problems later in the code -else: # if the user hasn't supplied a url - print('\n' + parser.format_help().lower()) - quit() - -delay = args.delay or 0 # Delay between requests -timeout = args.timeout or 6 # HTTP request timeout -cook = args.cook or None # Cookie -api = bool(args.api) # extract high entropy strings i.e. API keys and stuff -ninja = bool(args.ninja) # Ninja mode toggle -crawl_level = args.level or 2 # Crawling level -thread_count = args.threads or 2 # Number of threads -only_urls = bool(args.only_urls) # only urls mode is off by default - -# Variables we are gonna use later to store stuff -keys = set() # high entropy strings, prolly secret keys -files = set() # pdf, css, png etc. -intel = set() # emails, website accounts, aws buckets etc. -robots = set() # entries of robots.txt -custom = set() # string extracted by custom regex pattern -failed = set() # urls that photon failed to crawl -scripts = set() # javascript files -external = set() # urls that don't belong to the target i.e. out-of-scope -fuzzable = set() # urls that have get params in them e.g. example.com/page.php?id=2 -endpoints = set() # urls found from javascript files -processed = set() # urls that have been crawled -internal = set([s for s in args.seeds]) # urls that belong to the target i.e. 
in-scope - -everything = [] -bad_intel = set() # unclean intel urls -bad_scripts = set() # unclean javascript file urls - -# If the user hasn't supplied the root url with http(s), we will handle it -if main_inp.startswith('http'): - main_url = main_inp -else: - try: - get('https://' + main_inp) - main_url = 'https://' + main_inp - except: - main_url = 'http://' + main_inp - -schema = main_url.split('//')[0] # https: or http:? - -internal.add(main_url) # adding the root url to internal for crawling - -host = urlparse(main_url).netloc # Extracts host out of the url - -output_dir = args.output or host - -#### -# This function extracts top level domain from a url -#### - -def topLevel(url): - try: - toplevel = tld.get_fld(host, fix_protocol=True) - except tld.exceptions.TldDomainNotFound: - toplevel = urlparse(main_url).netloc - return toplevel - -domain = topLevel(main_url) - -#### -# This function makes requests to webpage and returns response body -#### - -if args.user_agent: - user_agents = args.user_agent.split(',') -else: - with open(sys.path[0] + '/core/user-agents.txt', 'r') as uas: - user_agents = [agent.strip('\n') for agent in uas] - -def requester(url): - processed.add(url) # mark the url as crawled - time.sleep(delay) # pause/sleep the program for specified time - def normal(url): - headers = { - 'Host' : host, # ummm this is the hostname? - 'User-Agent' : random.choice(user_agents), # selecting a random user-agent - 'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Language' : 'en-US,en;q=0.5', - 'Accept-Encoding' : 'gzip', - 'DNT' : '1', - 'Connection' : 'close'} - # make request and return response - response = get(url, cookies=cook, headers=headers, verify=False, timeout=timeout, stream=True) - if 'text/html' in response.headers['content-type']: - if response.status_code != '404': - return response.text - else: - response.close() - failed.add(url) - return 'dummy' - else: - response.close() - return 'dummy' - - # developer.facebook.com API - def facebook(url): - return get('https://developers.facebook.com/tools/debug/echo/?q=' + url, verify=False).text - - # pixlr.com API - def pixlr(url): - if url == main_url: - url = main_url + '/' # because pixlr throws error if http://example.com is used - # make request and return response - return get('https://pixlr.com/proxy/?url=' + url, headers={'Accept-Encoding' : 'gzip'}, verify=False).text - - # codebeautify.org API - def code_beautify(url): - headers = { - 'User-Agent' : 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0', - 'Accept' : 'text/plain, */*; q=0.01', - 'Accept-Encoding' : 'gzip', - 'Content-Type' : 'application/x-www-form-urlencoded; charset=UTF-8', - 'Origin' : 'https://codebeautify.org', - 'Connection' : 'close' - } - # make request and return response - return post('https://codebeautify.com/URLService', headers=headers, data='path=' + url, verify=False).text - - # www.photopea.com API - def photopea(url): - # make request and return response - return get('https://www.photopea.com/mirror.php?url=' + url, verify=False).text - - if ninja: # if the ninja mode is enabled - # select a random request function i.e. 
random API - response = random.choice([photopea, normal, facebook, pixlr, code_beautify])(url) - return response or 'dummy' - else: - return normal(url) - -#### -# This function extracts links from .xml files -#### - -def xmlParser(response): - return findall(r'(.*?)', response) # regex for extracting urls - -#### -# This function extracts links from robots.txt and sitemap.xml -#### - -def zap(url): - if args.archive: - from plugins.wayback import timeMachine - print ('%s Fetching URLs from archive.org' % run) - if False: - archived_urls = timeMachine(domain, 'domain') - else: - archived_urls = timeMachine(host, 'host') - print ('%s Retrieved %i URLs from archive.org' % (good, len(archived_urls) - 1)) - for url in archived_urls: - internal.add(url) - response = get(url + '/robots.txt', verify=False).text # makes request to robots.txt - if '= 4: - keys.add(url + ': ' + match) - -#### -# This function extracts endpoints from JavaScript Code -#### - -def jscanner(url): - response = requester(url) # make request to the url - matches = findall(r'[\'"](/.*?)[\'"]|[\'"](http.*?)[\'"]', response) # extract urls/endpoints - for match in matches: # iterate over the matches, match is a tuple - match = match[0] + match[1] # combining the items because one of them is always empty - if not search(r'[}{><"\']', match) and not match == '/': # making sure it's not some js code - endpoints.add(match) # add it to the endpoints list - -#### -# This function starts multiple threads for a function -#### - -def threader(function, *urls): - threads = [] # list of threads - urls = urls[0] # because urls is a tuple - for url in urls: # iterating over urls - task = threading.Thread(target=function, args=(url,)) - threads.append(task) - # start threads - for thread in threads: - thread.start() - # wait for all threads to complete their work - for thread in threads: - thread.join() - # delete threads - del threads[:] - -#### -# This function processes the urls and uses a threadpool to execute a function -#### - -def flash(function, links): # This shit is NOT complicated, please enjoy - links = list(links) # convert links (set) to list - if sys.version_info < (3, 2): - for begin in range(0, len(links), thread_count): # range with step - end = begin + thread_count - splitted = links[begin:end] - threader(function, splitted) - progress = end - if progress > len(links): # fix if overflow - progress = len(links) - print('\r%s Progress: %i/%i' % (info, progress, len(links)), end='\r') - else: - threadpool = concurrent.futures.ThreadPoolExecutor(max_workers=thread_count) - futures = (threadpool.submit(function, link) for link in links) - for i, _ in enumerate(concurrent.futures.as_completed(futures)): - if i + 1 == len(links) or (i + 1) % thread_count == 0: - print('%s Progress: %i/%i' % (info, i + 1, len(links)), end='\r') - print('') - -then = time.time() # records the time at which crawling started - -# Step 1. Extract urls from robots.txt & sitemap.xml -zap(main_url) - -# this is so the level 1 emails are parsed as well -internal = set(remove_regex(internal, args.exclude)) - -# Step 2. Crawl recursively to the limit specified in "crawl_level" -for level in range(crawl_level): - links = remove_regex(internal - processed, args.exclude) # links to crawl = (all links - already crawled links) - links not to crawl - if not links: # if links to crawl are 0 i.e. all links have been crawled - break - elif len(internal) <= len(processed): # if crawled links are somehow more than all links. Possible? 
;/ - if len(internal) > 2 + len(args.seeds): # if you know it, you know it - break - print('%s Level %i: %i URLs' % (run, level + 1, len(links))) - try: - flash(extractor, links) - except KeyboardInterrupt: - print('') - break - -if not only_urls: - for match in bad_scripts: - if match.startswith(main_url): - scripts.add(match) - elif match.startswith('/') and not match.startswith('//'): - scripts.add(main_url + match) - elif not match.startswith('http') and not match.startswith('//'): - scripts.add(main_url + '/' + match) - # Step 3. Scan the JavaScript files for enpoints - print('%s Crawling %i JavaScript files' % (run, len(scripts))) - flash(jscanner, scripts) - - for url in internal: - if '=' in url: - fuzzable.add(url) - - for match in bad_intel: - for x in match: # because "match" is a tuple - if x != '': # if the value isn't empty - intel.add(x) - for url in external: - try: - if tld.get_fld(url, fix_protocol=True) in intels: - intel.add(url) - except: - pass - -now = time.time() # records the time at which crawling stopped -diff = (now - then) # finds total time taken - -def timer(diff): - minutes, seconds = divmod(diff, 60) # Changes seconds into minutes and seconds - try: - time_per_request = diff / float(len(processed)) # Finds average time taken by requests - except ZeroDivisionError: - time_per_request = 0 - return minutes, seconds, time_per_request -minutes, seconds, time_per_request = timer(diff) - -# Step 4. Save the results -if not os.path.exists(output_dir): # if the directory doesn't exist - os.mkdir(output_dir) # create a new directory - -datasets = [files, intel, robots, custom, failed, internal, scripts, external, fuzzable, endpoints, keys] -dataset_names = ['files', 'intel', 'robots', 'custom', 'failed', 'internal', 'scripts', 'external', 'fuzzable', 'endpoints', 'keys'] - -def writer(datasets, dataset_names, output_dir): - for dataset, dataset_name in zip(datasets, dataset_names): - if dataset: - filepath = output_dir + '/' + dataset_name + '.txt' - if python3: - with open(filepath, 'w+', encoding='utf8') as f: - f.write(str('\n'.join(dataset))) - f.write('\n') - else: - with open(filepath, 'w+') as f: - joined = '\n'.join(dataset) - f.write(str(joined.encode('utf-8'))) - f.write('\n') - -writer(datasets, dataset_names, output_dir) -# Printing out results -print (('%s-%s' % (red, end)) * 50) -for dataset, dataset_name in zip(datasets, dataset_names): - if dataset: - print ('%s %s: %s' % (good, dataset_name.capitalize(), len(dataset))) -print (('%s-%s' % (red, end)) * 50) - -print('%s Total requests made: %i' % (info, len(processed))) -print('%s Total time taken: %i minutes %i seconds' % (info, minutes, seconds)) -print('%s Requests per second: %i' % (info, int(len(processed)/diff))) - -datasets = { -'files': list(files), 'intel': list(intel), 'robots': list(robots), 'custom': list(custom), 'failed': list(failed), 'internal': list(internal), -'scripts': list(scripts), 'external': list(external), 'fuzzable': list(fuzzable), 'endpoints': list(endpoints), 'keys' : list(keys) -} - -if args.dns: - print ('%s Enumerating subdomains' % run) - from plugins.findSubdomains import findSubdomains - subdomains = findSubdomains(domain) - print ('%s %i subdomains found' % (info, len(subdomains))) - writer([subdomains], ['subdomains'], output_dir) - datasets['subdomains'] = subdomains - from plugins.dnsdumpster import dnsdumpster - print ('%s Generating DNS map' % run) - dnsdumpster(domain, output_dir) - -if args.export: - from plugins.exporter import exporter - # 
exporter(directory, format, datasets) - exporter(output_dir, args.export, datasets) - -print('%s Results saved in %s%s%s directory' % (good, green, output_dir, end)) - -if args.std: - for string in datasets[args.std]: - sys.stdout.write(string + '\n') +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +from __future__ import print_function + +# Let's import what we need +import os +import sys +import tld +import time +import random +import warnings +import argparse +import threading +from math import log +from re import search, findall +from requests import get, post + +try: + import concurrent.futures + from urllib.parse import urlparse # for python3 + python2, python3 = False, True +except ImportError: + from urlparse import urlparse # for python2 + python2, python3 = True, False + +from core.config import intels, badTypes + +try: + input = raw_input +except NameError: + pass + +colors = True # Output should be colored +machine = sys.platform # Detecting the os of current system +if machine.lower().startswith(('os', 'win', 'darwin', 'ios')): + colors = False # Colors shouldn't be displayed in mac & windows +if not colors: + end = red = white = green = yellow = run = bad = good = info = que = '' +else: + end = '\033[1;m' + red = '\033[91m' + white = '\033[1;97m' + green = '\033[1;32m' + yellow = '\033[1;33m' + run = '\033[1;97m[~]\033[1;m' + bad = '\033[1;31m[-]\033[1;m' + good = '\033[1;32m[+]\033[1;m' + info = '\033[1;33m[!]\033[1;m' + que = '\033[1;34m[?]\033[1;m' + +# Just a fancy ass banner +print('''%s ____ __ __ + / %s__%s \/ /_ ____ / /_____ ____ + / %s/_/%s / __ \/ %s__%s \/ __/ %s__%s \/ __ \\ + / ____/ / / / %s/_/%s / /_/ %s/_/%s / / / / + /_/ /_/ /_/\____/\__/\____/_/ /_/ %sv1.1.4%s\n''' % + (red, white, red, white, red, white, red, white, red, white, red, white, red, white, end)) + +warnings.filterwarnings('ignore') # Disable SSL related warnings + +# Processing command line arguments +parser = argparse.ArgumentParser() +# Options +parser.add_argument('-u', '--url', help='root url', dest='root') +parser.add_argument('-c', '--cookie', help='cookie', dest='cook') +parser.add_argument('-r', '--regex', help='regex pattern', dest='regex') +parser.add_argument('-e', '--export', help='export format', dest='export') +parser.add_argument('-o', '--output', help='output directory', dest='output') +parser.add_argument('-l', '--level', help='levels to crawl', dest='level', type=int) +parser.add_argument('-t', '--threads', help='number of threads', dest='threads', type=int) +parser.add_argument('-d', '--delay', help='delay between requests', dest='delay', type=float) +parser.add_argument('-v', '--verbose', help='verbose output', dest='verbose', action='store_true') +parser.add_argument('-s', '--seeds', help='additional seed urls', dest='seeds', nargs="+", default=[]) + +parser.add_argument('--stdout', help='send variables to stdout', dest='std') +parser.add_argument('--user-agent', help='custom user agent(s)', dest='user_agent') +parser.add_argument('--exclude', help='exclude urls matching this regex', dest='exclude') +parser.add_argument('--timeout', help='http request timeout', dest='timeout', type=float) + +# Switches +parser.add_argument('--dns', help='enumerate subdomains & dns data', dest='dns', action='store_true') +parser.add_argument('--ninja', help='ninja mode', dest='ninja', action='store_true') +parser.add_argument('--keys', help='find secret keys', dest='api', action='store_true') +parser.add_argument('--update', help='update photon', dest='update', action='store_true') 
+parser.add_argument('--only-urls', help='only extract urls', dest='only_urls', action='store_true') +parser.add_argument('--wayback', help='fetch urls from archive.org as seeds', dest='archive', action='store_true') +args = parser.parse_args() + +#### +# This function git clones the latest version and merges it with the current directory +#### + +def update(): + print('%s Checking for updates' % run) + changes = '''added -v option;progress animation fix for python2''' # Changes must be seperated by ; + latest_commit = get('https://raw.githubusercontent.com/s0md3v/Photon/master/photon.py').text + + if changes not in latest_commit: # just a hack to see if a new version is available + changelog = search(r"changes = '''(.*?)'''", latest_commit) + changelog = changelog.group(1).split(';') # splitting the changes to form a list + print('%s A new version of Photon is available.' % good) + print('%s Changes:' % info) + for change in changelog: # print changes + print('%s>%s %s' % (green, end, change)) + + current_path = os.getcwd().split('/') # if you know it, you know it + folder = current_path[-1] # current directory name + path = '/'.join(current_path) # current directory path + choice = input('%s Would you like to update? [Y/n] ' % que).lower() + + if choice != 'n': + print('%s Updating Photon' % run) + os.system('git clone --quiet https://github.com/s0md3v/Photon %s' % (folder)) + os.system('cp -r %s/%s/* %s && rm -r %s/%s/ 2>/dev/null' % (path, folder, path, path, folder)) + print('%s Update successful!' % good) + else: + print('%s Photon is up to date!' % good) + +if args.update: # if the user has supplied --update argument + update() + quit() # quitting because files have been changed + +if args.root: # if the user has supplied a url + main_inp = args.root + if main_inp.endswith('/'): # if the url ends with '/' + main_inp = main_inp[:-1] # we will remove it as it can cause problems later in the code +else: # if the user hasn't supplied a url + print('\n' + parser.format_help().lower()) + quit() + +verbose = args.verbose +delay = args.delay or 0 # Delay between requests +timeout = args.timeout or 6 # HTTP request timeout +cook = args.cook or None # Cookie +api = bool(args.api) # extract high entropy strings i.e. API keys and stuff +ninja = bool(args.ninja) # Ninja mode toggle +crawl_level = args.level or 2 # Crawling level +thread_count = args.threads or 2 # Number of threads +only_urls = bool(args.only_urls) # only urls mode is off by default + +# Variables we are gonna use later to store stuff +keys = set() # high entropy strings, prolly secret keys +files = set() # pdf, css, png etc. +intel = set() # emails, website accounts, aws buckets etc. +robots = set() # entries of robots.txt +custom = set() # string extracted by custom regex pattern +failed = set() # urls that photon failed to crawl +scripts = set() # javascript files +external = set() # urls that don't belong to the target i.e. out-of-scope +fuzzable = set() # urls that have get params in them e.g. example.com/page.php?id=2 +endpoints = set() # urls found from javascript files +processed = set() # urls that have been crawled +internal = set([s for s in args.seeds]) # urls that belong to the target i.e. 
in-scope + +everything = [] +bad_intel = set() # unclean intel urls +bad_scripts = set() # unclean javascript file urls + +# If the user hasn't supplied the root url with http(s), we will handle it +if main_inp.startswith('http'): + main_url = main_inp +else: + try: + get('https://' + main_inp) + main_url = 'https://' + main_inp + except: + main_url = 'http://' + main_inp + +schema = main_url.split('//')[0] # https: or http:? + +internal.add(main_url) # adding the root url to internal for crawling + +host = urlparse(main_url).netloc # Extracts host out of the url + +output_dir = args.output or host + +#### +# This function extracts top level domain from a url +#### + +def topLevel(url): + try: + toplevel = tld.get_fld(host, fix_protocol=True) + except tld.exceptions.TldDomainNotFound: + toplevel = urlparse(main_url).netloc + return toplevel + +domain = topLevel(main_url) + +#### +# This function makes requests to webpage and returns response body +#### + +if args.user_agent: + user_agents = args.user_agent.split(',') +else: + with open(sys.path[0] + '/core/user-agents.txt', 'r') as uas: + user_agents = [agent.strip('\n') for agent in uas] + +def requester(url): + processed.add(url) # mark the url as crawled + time.sleep(delay) # pause/sleep the program for specified time + def normal(url): + headers = { + 'Host' : host, # ummm this is the hostname? + 'User-Agent' : random.choice(user_agents), # selecting a random user-agent + 'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language' : 'en-US,en;q=0.5', + 'Accept-Encoding' : 'gzip', + 'DNT' : '1', + 'Connection' : 'close'} + # make request and return response + response = get(url, cookies=cook, headers=headers, verify=False, timeout=timeout, stream=True) + if 'text/html' in response.headers['content-type']: + if response.status_code != '404': + return response.text + else: + response.close() + failed.add(url) + return 'dummy' + else: + response.close() + return 'dummy' + + # developer.facebook.com API + def facebook(url): + return get('https://developers.facebook.com/tools/debug/echo/?q=' + url, verify=False).text + + # pixlr.com API + def pixlr(url): + if url == main_url: + url = main_url + '/' # because pixlr throws error if http://example.com is used + # make request and return response + return get('https://pixlr.com/proxy/?url=' + url, headers={'Accept-Encoding' : 'gzip'}, verify=False).text + + # codebeautify.org API + def code_beautify(url): + headers = { + 'User-Agent' : 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0', + 'Accept' : 'text/plain, */*; q=0.01', + 'Accept-Encoding' : 'gzip', + 'Content-Type' : 'application/x-www-form-urlencoded; charset=UTF-8', + 'Origin' : 'https://codebeautify.org', + 'Connection' : 'close' + } + # make request and return response + return post('https://codebeautify.com/URLService', headers=headers, data='path=' + url, verify=False).text + + # www.photopea.com API + def photopea(url): + # make request and return response + return get('https://www.photopea.com/mirror.php?url=' + url, verify=False).text + + if ninja: # if the ninja mode is enabled + # select a random request function i.e. 
random API + response = random.choice([photopea, normal, facebook, pixlr, code_beautify])(url) + return response or 'dummy' + else: + return normal(url) + +#### +# This function gives verbose output +#### + +def verb(kind, string): + if verbose: + print ('%s %s: %s' % (info, kind, string)) + +#### +# This function extracts links from .xml files +#### + +def xmlParser(response): + return findall(r'(.*?)', response) # regex for extracting urls + +#### +# This function extracts links from robots.txt and sitemap.xml +#### + +def zap(url): + if args.archive: + from plugins.wayback import timeMachine + print ('%s Fetching URLs from archive.org' % run) + if False: + archived_urls = timeMachine(domain, 'domain') + else: + archived_urls = timeMachine(host, 'host') + print ('%s Retrieved %i URLs from archive.org' % (good, len(archived_urls) - 1)) + for url in archived_urls: + verb('Internal page', url) + internal.add(url) + response = get(url + '/robots.txt', verify=False).text # makes request to robots.txt + if '= 4: + verb('Key', match) + keys.add(url + ': ' + match) + +#### +# This function extracts endpoints from JavaScript Code +#### + +def jscanner(url): + response = requester(url) # make request to the url + matches = findall(r'[\'"](/.*?)[\'"]|[\'"](http.*?)[\'"]', response) # extract urls/endpoints + for match in matches: # iterate over the matches, match is a tuple + match = match[0] + match[1] # combining the items because one of them is always empty + if not search(r'[}{><"\']', match) and not match == '/': # making sure it's not some js code + verb('JS enpoint', match) + endpoints.add(match) # add it to the endpoints list + +#### +# This function starts multiple threads for a function +#### + +def threader(function, *urls): + threads = [] # list of threads + urls = urls[0] # because urls is a tuple + for url in urls: # iterating over urls + task = threading.Thread(target=function, args=(url,)) + threads.append(task) + # start threads + for thread in threads: + thread.start() + # wait for all threads to complete their work + for thread in threads: + thread.join() + # delete threads + del threads[:] + +#### +# This function processes the urls and uses a threadpool to execute a function +#### + +def flash(function, links): # This shit is NOT complicated, please enjoy + links = list(links) # convert links (set) to list + if sys.version_info < (3, 2): + for begin in range(0, len(links), thread_count): # range with step + end = begin + thread_count + splitted = links[begin:end] + threader(function, splitted) + progress = end + if progress > len(links): # fix if overflow + progress = len(links) + print('\r%s Progress: %i/%i' % (info, progress, len(links)), end='\r') + sys.stdout.flush() + else: + threadpool = concurrent.futures.ThreadPoolExecutor(max_workers=thread_count) + futures = (threadpool.submit(function, link) for link in links) + for i, _ in enumerate(concurrent.futures.as_completed(futures)): + if i + 1 == len(links) or (i + 1) % thread_count == 0: + print('%s Progress: %i/%i' % (info, i + 1, len(links)), end='\r') + print('') + +then = time.time() # records the time at which crawling started + +# Step 1. Extract urls from robots.txt & sitemap.xml +zap(main_url) + +# this is so the level 1 emails are parsed as well +internal = set(remove_regex(internal, args.exclude)) + +# Step 2. 
Crawl recursively to the limit specified in "crawl_level" +for level in range(crawl_level): + links = remove_regex(internal - processed, args.exclude) # links to crawl = (all links - already crawled links) - links not to crawl + if not links: # if links to crawl are 0 i.e. all links have been crawled + break + elif len(internal) <= len(processed): # if crawled links are somehow more than all links. Possible? ;/ + if len(internal) > 2 + len(args.seeds): # if you know it, you know it + break + print('%s Level %i: %i URLs' % (run, level + 1, len(links))) + try: + flash(extractor, links) + except KeyboardInterrupt: + print('') + break + +if not only_urls: + for match in bad_scripts: + if match.startswith(main_url): + scripts.add(match) + elif match.startswith('/') and not match.startswith('//'): + scripts.add(main_url + match) + elif not match.startswith('http') and not match.startswith('//'): + scripts.add(main_url + '/' + match) + # Step 3. Scan the JavaScript files for enpoints + print('%s Crawling %i JavaScript files' % (run, len(scripts))) + flash(jscanner, scripts) + + for url in internal: + if '=' in url: + fuzzable.add(url) + + for match in bad_intel: + for x in match: # because "match" is a tuple + if x != '': # if the value isn't empty + intel.add(x) + for url in external: + try: + if tld.get_fld(url, fix_protocol=True) in intels: + intel.add(url) + except: + pass + +now = time.time() # records the time at which crawling stopped +diff = (now - then) # finds total time taken + +def timer(diff): + minutes, seconds = divmod(diff, 60) # Changes seconds into minutes and seconds + try: + time_per_request = diff / float(len(processed)) # Finds average time taken by requests + except ZeroDivisionError: + time_per_request = 0 + return minutes, seconds, time_per_request +minutes, seconds, time_per_request = timer(diff) + +# Step 4. 
Save the results +if not os.path.exists(output_dir): # if the directory doesn't exist + os.mkdir(output_dir) # create a new directory + +datasets = [files, intel, robots, custom, failed, internal, scripts, external, fuzzable, endpoints, keys] +dataset_names = ['files', 'intel', 'robots', 'custom', 'failed', 'internal', 'scripts', 'external', 'fuzzable', 'endpoints', 'keys'] + +def writer(datasets, dataset_names, output_dir): + for dataset, dataset_name in zip(datasets, dataset_names): + if dataset: + filepath = output_dir + '/' + dataset_name + '.txt' + if python3: + with open(filepath, 'w+', encoding='utf8') as f: + f.write(str('\n'.join(dataset))) + f.write('\n') + else: + with open(filepath, 'w+') as f: + joined = '\n'.join(dataset) + f.write(str(joined.encode('utf-8'))) + f.write('\n') + +writer(datasets, dataset_names, output_dir) +# Printing out results +print (('%s-%s' % (red, end)) * 50) +for dataset, dataset_name in zip(datasets, dataset_names): + if dataset: + print ('%s %s: %s' % (good, dataset_name.capitalize(), len(dataset))) +print (('%s-%s' % (red, end)) * 50) + +print('%s Total requests made: %i' % (info, len(processed))) +print('%s Total time taken: %i minutes %i seconds' % (info, minutes, seconds)) +print('%s Requests per second: %i' % (info, int(len(processed)/diff))) + +datasets = { +'files': list(files), 'intel': list(intel), 'robots': list(robots), 'custom': list(custom), 'failed': list(failed), 'internal': list(internal), +'scripts': list(scripts), 'external': list(external), 'fuzzable': list(fuzzable), 'endpoints': list(endpoints), 'keys' : list(keys) +} + +if args.dns: + print ('%s Enumerating subdomains' % run) + from plugins.findSubdomains import findSubdomains + subdomains = findSubdomains(domain) + print ('%s %i subdomains found' % (info, len(subdomains))) + writer([subdomains], ['subdomains'], output_dir) + datasets['subdomains'] = subdomains + from plugins.dnsdumpster import dnsdumpster + print ('%s Generating DNS map' % run) + dnsdumpster(domain, output_dir) + +if args.export: + from plugins.exporter import exporter + # exporter(directory, format, datasets) + exporter(output_dir, args.export, datasets) + +print('%s Results saved in %s%s%s directory' % (good, green, output_dir, end)) + +if args.std: + for string in datasets[args.std]: + sys.stdout.write(string + '\n')
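
Note on the verbose output added here: the new -v/--verbose switch is consumed by the small verb() helper this patch introduces, and the crawl routines call it whenever they record something (internal pages, files, keys, JS endpoints). A minimal standalone sketch of that pattern follows; the plain '[!]' tag and the sample values are illustrative stand-ins, not taken from photon.py's coloured prefixes or real findings.

    verbose = True   # photon.py sets this from args.verbose
    info = '[!]'     # stand-in for the coloured [!] prefix built earlier in the script

    def verb(kind, string):
        # print a tagged line only when verbose output was requested
        if verbose:
            print('%s %s: %s' % (info, kind, string))

    verb('Internal page', 'http://example.com/about')   # printed
    verbose = False
    verb('Key', 'dummy-high-entropy-string')             # silently skipped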
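Note on the progress animation fix: the change is the sys.stdout.flush() added to the Python 2 branch of flash(). The progress print ends in '\r' and never emits a newline, and Python 2's line-buffered stdout typically does not flush on '\r' the way Python 3's text I/O does, so without an explicit flush the counter can sit in the buffer and the line appears frozen. A minimal sketch of the pattern, with a placeholder workload in place of the real crawl batches:

    from __future__ import print_function
    import sys
    import time

    links = ['a', 'b', 'c', 'd']   # placeholder workload
    for i, _ in enumerate(links):
        time.sleep(0.2)            # pretend to crawl something
        print('[!] Progress: %i/%i' % (i + 1, len(links)), end='\r')
        sys.stdout.flush()         # the fix: push the '\r' line out immediately on Python 2
    print('')                      # move past the progress line when done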
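For contrast, the Python 3 branch of flash() hands the links to a concurrent.futures thread pool and prints the same progress line as futures complete, so it does not need the explicit flush. A sketch of that branch under assumed placeholder values: worker() and links are stand-ins for extractor()/jscanner() and the crawl queue, and the with-statement is added here for tidy shutdown (the script itself just keeps the executor around).

    import concurrent.futures

    def worker(link):
        pass                        # stand-in for extractor() / jscanner()

    links = ['http://example.com/a', 'http://example.com/b', 'http://example.com/c']
    thread_count = 2

    with concurrent.futures.ThreadPoolExecutor(max_workers=thread_count) as pool:
        futures = (pool.submit(worker, link) for link in links)
        for i, _ in enumerate(concurrent.futures.as_completed(futures)):
            # update the counter on every full batch and on the last item
            if i + 1 == len(links) or (i + 1) % thread_count == 0:
                print('[!] Progress: %i/%i' % (i + 1, len(links)), end='\r')
    print('')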