From c3e04024f5beb5132b6a04eb21e70077ea63c54c Mon Sep 17 00:00:00 2001 From: "@_tID" Date: Tue, 18 Sep 2018 14:46:51 +0530 Subject: [PATCH] verbose output + animation fix for python2 --- photon.py | 1224 +++++++++++++++++++++++++++-------------------------- 1 file changed, 624 insertions(+), 600 deletions(-) diff --git a/photon.py b/photon.py index e0af76a..cf36605 100644 --- a/photon.py +++ b/photon.py @@ -1,600 +1,624 @@ -#!/usr/bin/env python3 -# -*- coding: utf-8 -*- - -from __future__ import print_function - -# Let's import what we need -import os -import sys -import tld -import time -import random -import warnings -import argparse -import threading -from math import log -from re import search, findall -from requests import get, post - -try: - import concurrent.futures - from urllib.parse import urlparse # for python3 - python2, python3 = False, True -except ImportError: - from urlparse import urlparse # for python2 - python2, python3 = True, False - -from core.config import intels, badTypes - -try: - input = raw_input -except NameError: - pass - -colors = True # Output should be colored -machine = sys.platform # Detecting the os of current system -if machine.lower().startswith(('os', 'win', 'darwin', 'ios')): - colors = False # Colors shouldn't be displayed in mac & windows -if not colors: - end = red = white = green = yellow = run = bad = good = info = que = '' -else: - end = '\033[1;m' - red = '\033[91m' - white = '\033[1;97m' - green = '\033[1;32m' - yellow = '\033[1;33m' - run = '\033[1;97m[~]\033[1;m' - bad = '\033[1;31m[-]\033[1;m' - good = '\033[1;32m[+]\033[1;m' - info = '\033[1;33m[!]\033[1;m' - que = '\033[1;34m[?]\033[1;m' - -# Just a fancy ass banner -print('''%s ____ __ __ - / %s__%s \/ /_ ____ / /_____ ____ - / %s/_/%s / __ \/ %s__%s \/ __/ %s__%s \/ __ \\ - / ____/ / / / %s/_/%s / /_/ %s/_/%s / / / / - /_/ /_/ /_/\____/\__/\____/_/ /_/ %sv1.1.1%s\n''' % - (red, white, red, white, red, white, red, white, red, white, red, white, red, white, end)) - -warnings.filterwarnings('ignore') # Disable SSL related warnings - -# Processing command line arguments -parser = argparse.ArgumentParser() -# Options -parser.add_argument('-u', '--url', help='root url', dest='root') -parser.add_argument('-c', '--cookie', help='cookie', dest='cook') -parser.add_argument('-r', '--regex', help='regex pattern', dest='regex') -parser.add_argument('-e', '--export', help='export format', dest='export') -parser.add_argument('-o', '--output', help='output directory', dest='output') -parser.add_argument('-l', '--level', help='levels to crawl', dest='level', type=int) -parser.add_argument('-t', '--threads', help='number of threads', dest='threads', type=int) -parser.add_argument('-d', '--delay', help='delay between requests', dest='delay', type=float) -parser.add_argument('-s', '--seeds', help='additional seed urls', dest='seeds', nargs="+", default=[]) - -parser.add_argument('--stdout', help='send variables to stdout', dest='std') -parser.add_argument('--user-agent', help='custom user agent(s)', dest='user_agent') -parser.add_argument('--exclude', help='exclude urls matching this regex', dest='exclude') -parser.add_argument('--timeout', help='http request timeout', dest='timeout', type=float) - -# Switches -parser.add_argument('--dns', help='enumerate subdomains & dns data', dest='dns', action='store_true') -parser.add_argument('--ninja', help='ninja mode', dest='ninja', action='store_true') -parser.add_argument('--keys', help='find secret keys', dest='api', action='store_true') 
-parser.add_argument('--update', help='update photon', dest='update', action='store_true') -parser.add_argument('--only-urls', help='only extract urls', dest='only_urls', action='store_true') -parser.add_argument('--wayback', help='fetch urls from archive.org as seeds', dest='archive', action='store_true') -args = parser.parse_args() - -#### -# This function git clones the latest version and merges it with the current directory -#### - -def update(): - print('%s Checking for updates' % run) - changes = '''bug fixes;minor refactor;added --stdout option''' # Changes must be seperated by ; - latest_commit = get('https://raw.githubusercontent.com/s0md3v/Photon/master/photon.py').text - - if changes not in latest_commit: # just a hack to see if a new version is available - changelog = search(r"changes = '''(.*?)'''", latest_commit) - changelog = changelog.group(1).split(';') # splitting the changes to form a list - print('%s A new version of Photon is available.' % good) - print('%s Changes:' % info) - for change in changelog: # print changes - print('%s>%s %s' % (green, end, change)) - - current_path = os.getcwd().split('/') # if you know it, you know it - folder = current_path[-1] # current directory name - path = '/'.join(current_path) # current directory path - choice = input('%s Would you like to update? [Y/n] ' % que).lower() - - if choice != 'n': - print('%s Updating Photon' % run) - os.system('git clone --quiet https://github.com/s0md3v/Photon %s' % (folder)) - os.system('cp -r %s/%s/* %s && rm -r %s/%s/ 2>/dev/null' % (path, folder, path, path, folder)) - print('%s Update successful!' % good) - else: - print('%s Photon is up to date!' % good) - -if args.update: # if the user has supplied --update argument - update() - quit() # quitting because files have been changed - -if args.root: # if the user has supplied a url - main_inp = args.root - if main_inp.endswith('/'): # if the url ends with '/' - main_inp = main_inp[:-1] # we will remove it as it can cause problems later in the code -else: # if the user hasn't supplied a url - print('\n' + parser.format_help().lower()) - quit() - -delay = args.delay or 0 # Delay between requests -timeout = args.timeout or 6 # HTTP request timeout -cook = args.cook or None # Cookie -api = bool(args.api) # extract high entropy strings i.e. API keys and stuff -ninja = bool(args.ninja) # Ninja mode toggle -crawl_level = args.level or 2 # Crawling level -thread_count = args.threads or 2 # Number of threads -only_urls = bool(args.only_urls) # only urls mode is off by default - -# Variables we are gonna use later to store stuff -keys = set() # high entropy strings, prolly secret keys -files = set() # pdf, css, png etc. -intel = set() # emails, website accounts, aws buckets etc. -robots = set() # entries of robots.txt -custom = set() # string extracted by custom regex pattern -failed = set() # urls that photon failed to crawl -scripts = set() # javascript files -external = set() # urls that don't belong to the target i.e. out-of-scope -fuzzable = set() # urls that have get params in them e.g. example.com/page.php?id=2 -endpoints = set() # urls found from javascript files -processed = set() # urls that have been crawled -internal = set([s for s in args.seeds]) # urls that belong to the target i.e. 
in-scope - -everything = [] -bad_intel = set() # unclean intel urls -bad_scripts = set() # unclean javascript file urls - -# If the user hasn't supplied the root url with http(s), we will handle it -if main_inp.startswith('http'): - main_url = main_inp -else: - try: - get('https://' + main_inp) - main_url = 'https://' + main_inp - except: - main_url = 'http://' + main_inp - -schema = main_url.split('//')[0] # https: or http:? - -internal.add(main_url) # adding the root url to internal for crawling - -host = urlparse(main_url).netloc # Extracts host out of the url - -output_dir = args.output or host - -#### -# This function extracts top level domain from a url -#### - -def topLevel(url): - try: - toplevel = tld.get_fld(host, fix_protocol=True) - except tld.exceptions.TldDomainNotFound: - toplevel = urlparse(main_url).netloc - return toplevel - -domain = topLevel(main_url) - -#### -# This function makes requests to webpage and returns response body -#### - -if args.user_agent: - user_agents = args.user_agent.split(',') -else: - with open(sys.path[0] + '/core/user-agents.txt', 'r') as uas: - user_agents = [agent.strip('\n') for agent in uas] - -def requester(url): - processed.add(url) # mark the url as crawled - time.sleep(delay) # pause/sleep the program for specified time - def normal(url): - headers = { - 'Host' : host, # ummm this is the hostname? - 'User-Agent' : random.choice(user_agents), # selecting a random user-agent - 'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Language' : 'en-US,en;q=0.5', - 'Accept-Encoding' : 'gzip', - 'DNT' : '1', - 'Connection' : 'close'} - # make request and return response - response = get(url, cookies=cook, headers=headers, verify=False, timeout=timeout, stream=True) - if 'text/html' in response.headers['content-type']: - if response.status_code != '404': - return response.text - else: - response.close() - failed.add(url) - return 'dummy' - else: - response.close() - return 'dummy' - - # developer.facebook.com API - def facebook(url): - return get('https://developers.facebook.com/tools/debug/echo/?q=' + url, verify=False).text - - # pixlr.com API - def pixlr(url): - if url == main_url: - url = main_url + '/' # because pixlr throws error if http://example.com is used - # make request and return response - return get('https://pixlr.com/proxy/?url=' + url, headers={'Accept-Encoding' : 'gzip'}, verify=False).text - - # codebeautify.org API - def code_beautify(url): - headers = { - 'User-Agent' : 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0', - 'Accept' : 'text/plain, */*; q=0.01', - 'Accept-Encoding' : 'gzip', - 'Content-Type' : 'application/x-www-form-urlencoded; charset=UTF-8', - 'Origin' : 'https://codebeautify.org', - 'Connection' : 'close' - } - # make request and return response - return post('https://codebeautify.com/URLService', headers=headers, data='path=' + url, verify=False).text - - # www.photopea.com API - def photopea(url): - # make request and return response - return get('https://www.photopea.com/mirror.php?url=' + url, verify=False).text - - if ninja: # if the ninja mode is enabled - # select a random request function i.e. 
random API - response = random.choice([photopea, normal, facebook, pixlr, code_beautify])(url) - return response or 'dummy' - else: - return normal(url) - -#### -# This function extracts links from .xml files -#### - -def xmlParser(response): - return findall(r'(.*?)', response) # regex for extracting urls - -#### -# This function extracts links from robots.txt and sitemap.xml -#### - -def zap(url): - if args.archive: - from plugins.wayback import timeMachine - print ('%s Fetching URLs from archive.org' % run) - if False: - archived_urls = timeMachine(domain, 'domain') - else: - archived_urls = timeMachine(host, 'host') - print ('%s Retrieved %i URLs from archive.org' % (good, len(archived_urls) - 1)) - for url in archived_urls: - internal.add(url) - response = get(url + '/robots.txt', verify=False).text # makes request to robots.txt - if '= 4: - keys.add(url + ': ' + match) - -#### -# This function extracts endpoints from JavaScript Code -#### - -def jscanner(url): - response = requester(url) # make request to the url - matches = findall(r'[\'"](/.*?)[\'"]|[\'"](http.*?)[\'"]', response) # extract urls/endpoints - for match in matches: # iterate over the matches, match is a tuple - match = match[0] + match[1] # combining the items because one of them is always empty - if not search(r'[}{><"\']', match) and not match == '/': # making sure it's not some js code - endpoints.add(match) # add it to the endpoints list - -#### -# This function starts multiple threads for a function -#### - -def threader(function, *urls): - threads = [] # list of threads - urls = urls[0] # because urls is a tuple - for url in urls: # iterating over urls - task = threading.Thread(target=function, args=(url,)) - threads.append(task) - # start threads - for thread in threads: - thread.start() - # wait for all threads to complete their work - for thread in threads: - thread.join() - # delete threads - del threads[:] - -#### -# This function processes the urls and uses a threadpool to execute a function -#### - -def flash(function, links): # This shit is NOT complicated, please enjoy - links = list(links) # convert links (set) to list - if sys.version_info < (3, 2): - for begin in range(0, len(links), thread_count): # range with step - end = begin + thread_count - splitted = links[begin:end] - threader(function, splitted) - progress = end - if progress > len(links): # fix if overflow - progress = len(links) - print('\r%s Progress: %i/%i' % (info, progress, len(links)), end='\r') - else: - threadpool = concurrent.futures.ThreadPoolExecutor(max_workers=thread_count) - futures = (threadpool.submit(function, link) for link in links) - for i, _ in enumerate(concurrent.futures.as_completed(futures)): - if i + 1 == len(links) or (i + 1) % thread_count == 0: - print('%s Progress: %i/%i' % (info, i + 1, len(links)), end='\r') - print('') - -then = time.time() # records the time at which crawling started - -# Step 1. Extract urls from robots.txt & sitemap.xml -zap(main_url) - -# this is so the level 1 emails are parsed as well -internal = set(remove_regex(internal, args.exclude)) - -# Step 2. Crawl recursively to the limit specified in "crawl_level" -for level in range(crawl_level): - links = remove_regex(internal - processed, args.exclude) # links to crawl = (all links - already crawled links) - links not to crawl - if not links: # if links to crawl are 0 i.e. all links have been crawled - break - elif len(internal) <= len(processed): # if crawled links are somehow more than all links. Possible? 
;/ - if len(internal) > 2 + len(args.seeds): # if you know it, you know it - break - print('%s Level %i: %i URLs' % (run, level + 1, len(links))) - try: - flash(extractor, links) - except KeyboardInterrupt: - print('') - break - -if not only_urls: - for match in bad_scripts: - if match.startswith(main_url): - scripts.add(match) - elif match.startswith('/') and not match.startswith('//'): - scripts.add(main_url + match) - elif not match.startswith('http') and not match.startswith('//'): - scripts.add(main_url + '/' + match) - # Step 3. Scan the JavaScript files for enpoints - print('%s Crawling %i JavaScript files' % (run, len(scripts))) - flash(jscanner, scripts) - - for url in internal: - if '=' in url: - fuzzable.add(url) - - for match in bad_intel: - for x in match: # because "match" is a tuple - if x != '': # if the value isn't empty - intel.add(x) - for url in external: - try: - if tld.get_fld(url, fix_protocol=True) in intels: - intel.add(url) - except: - pass - -now = time.time() # records the time at which crawling stopped -diff = (now - then) # finds total time taken - -def timer(diff): - minutes, seconds = divmod(diff, 60) # Changes seconds into minutes and seconds - try: - time_per_request = diff / float(len(processed)) # Finds average time taken by requests - except ZeroDivisionError: - time_per_request = 0 - return minutes, seconds, time_per_request -minutes, seconds, time_per_request = timer(diff) - -# Step 4. Save the results -if not os.path.exists(output_dir): # if the directory doesn't exist - os.mkdir(output_dir) # create a new directory - -datasets = [files, intel, robots, custom, failed, internal, scripts, external, fuzzable, endpoints, keys] -dataset_names = ['files', 'intel', 'robots', 'custom', 'failed', 'internal', 'scripts', 'external', 'fuzzable', 'endpoints', 'keys'] - -def writer(datasets, dataset_names, output_dir): - for dataset, dataset_name in zip(datasets, dataset_names): - if dataset: - filepath = output_dir + '/' + dataset_name + '.txt' - if python3: - with open(filepath, 'w+', encoding='utf8') as f: - f.write(str('\n'.join(dataset))) - f.write('\n') - else: - with open(filepath, 'w+') as f: - joined = '\n'.join(dataset) - f.write(str(joined.encode('utf-8'))) - f.write('\n') - -writer(datasets, dataset_names, output_dir) -# Printing out results -print (('%s-%s' % (red, end)) * 50) -for dataset, dataset_name in zip(datasets, dataset_names): - if dataset: - print ('%s %s: %s' % (good, dataset_name.capitalize(), len(dataset))) -print (('%s-%s' % (red, end)) * 50) - -print('%s Total requests made: %i' % (info, len(processed))) -print('%s Total time taken: %i minutes %i seconds' % (info, minutes, seconds)) -print('%s Requests per second: %i' % (info, int(len(processed)/diff))) - -datasets = { -'files': list(files), 'intel': list(intel), 'robots': list(robots), 'custom': list(custom), 'failed': list(failed), 'internal': list(internal), -'scripts': list(scripts), 'external': list(external), 'fuzzable': list(fuzzable), 'endpoints': list(endpoints), 'keys' : list(keys) -} - -if args.dns: - print ('%s Enumerating subdomains' % run) - from plugins.findSubdomains import findSubdomains - subdomains = findSubdomains(domain) - print ('%s %i subdomains found' % (info, len(subdomains))) - writer([subdomains], ['subdomains'], output_dir) - datasets['subdomains'] = subdomains - from plugins.dnsdumpster import dnsdumpster - print ('%s Generating DNS map' % run) - dnsdumpster(domain, output_dir) - -if args.export: - from plugins.exporter import exporter - # 
exporter(directory, format, datasets) - exporter(output_dir, args.export, datasets) - -print('%s Results saved in %s%s%s directory' % (good, green, output_dir, end)) - -if args.std: - for string in datasets[args.std]: - sys.stdout.write(string + '\n') +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +from __future__ import print_function + +# Let's import what we need +import os +import sys +import tld +import time +import random +import warnings +import argparse +import threading +from math import log +from re import search, findall +from requests import get, post + +try: + import concurrent.futures + from urllib.parse import urlparse # for python3 + python2, python3 = False, True +except ImportError: + from urlparse import urlparse # for python2 + python2, python3 = True, False + +from core.config import intels, badTypes + +try: + input = raw_input +except NameError: + pass + +colors = True # Output should be colored +machine = sys.platform # Detecting the os of current system +if machine.lower().startswith(('os', 'win', 'darwin', 'ios')): + colors = False # Colors shouldn't be displayed in mac & windows +if not colors: + end = red = white = green = yellow = run = bad = good = info = que = '' +else: + end = '\033[1;m' + red = '\033[91m' + white = '\033[1;97m' + green = '\033[1;32m' + yellow = '\033[1;33m' + run = '\033[1;97m[~]\033[1;m' + bad = '\033[1;31m[-]\033[1;m' + good = '\033[1;32m[+]\033[1;m' + info = '\033[1;33m[!]\033[1;m' + que = '\033[1;34m[?]\033[1;m' + +# Just a fancy ass banner +print('''%s ____ __ __ + / %s__%s \/ /_ ____ / /_____ ____ + / %s/_/%s / __ \/ %s__%s \/ __/ %s__%s \/ __ \\ + / ____/ / / / %s/_/%s / /_/ %s/_/%s / / / / + /_/ /_/ /_/\____/\__/\____/_/ /_/ %sv1.1.4%s\n''' % + (red, white, red, white, red, white, red, white, red, white, red, white, red, white, end)) + +warnings.filterwarnings('ignore') # Disable SSL related warnings + +# Processing command line arguments +parser = argparse.ArgumentParser() +# Options +parser.add_argument('-u', '--url', help='root url', dest='root') +parser.add_argument('-c', '--cookie', help='cookie', dest='cook') +parser.add_argument('-r', '--regex', help='regex pattern', dest='regex') +parser.add_argument('-e', '--export', help='export format', dest='export') +parser.add_argument('-o', '--output', help='output directory', dest='output') +parser.add_argument('-l', '--level', help='levels to crawl', dest='level', type=int) +parser.add_argument('-t', '--threads', help='number of threads', dest='threads', type=int) +parser.add_argument('-d', '--delay', help='delay between requests', dest='delay', type=float) +parser.add_argument('-v', '--verbose', help='verbose output', dest='verbose', action='store_true') +parser.add_argument('-s', '--seeds', help='additional seed urls', dest='seeds', nargs="+", default=[]) + +parser.add_argument('--stdout', help='send variables to stdout', dest='std') +parser.add_argument('--user-agent', help='custom user agent(s)', dest='user_agent') +parser.add_argument('--exclude', help='exclude urls matching this regex', dest='exclude') +parser.add_argument('--timeout', help='http request timeout', dest='timeout', type=float) + +# Switches +parser.add_argument('--dns', help='enumerate subdomains & dns data', dest='dns', action='store_true') +parser.add_argument('--ninja', help='ninja mode', dest='ninja', action='store_true') +parser.add_argument('--keys', help='find secret keys', dest='api', action='store_true') +parser.add_argument('--update', help='update photon', dest='update', action='store_true') 
+parser.add_argument('--only-urls', help='only extract urls', dest='only_urls', action='store_true') +parser.add_argument('--wayback', help='fetch urls from archive.org as seeds', dest='archive', action='store_true') +args = parser.parse_args() + +#### +# This function git clones the latest version and merges it with the current directory +#### + +def update(): + print('%s Checking for updates' % run) + changes = '''added -v option;progress animation fix for python2''' # Changes must be seperated by ; + latest_commit = get('https://raw.githubusercontent.com/s0md3v/Photon/master/photon.py').text + + if changes not in latest_commit: # just a hack to see if a new version is available + changelog = search(r"changes = '''(.*?)'''", latest_commit) + changelog = changelog.group(1).split(';') # splitting the changes to form a list + print('%s A new version of Photon is available.' % good) + print('%s Changes:' % info) + for change in changelog: # print changes + print('%s>%s %s' % (green, end, change)) + + current_path = os.getcwd().split('/') # if you know it, you know it + folder = current_path[-1] # current directory name + path = '/'.join(current_path) # current directory path + choice = input('%s Would you like to update? [Y/n] ' % que).lower() + + if choice != 'n': + print('%s Updating Photon' % run) + os.system('git clone --quiet https://github.com/s0md3v/Photon %s' % (folder)) + os.system('cp -r %s/%s/* %s && rm -r %s/%s/ 2>/dev/null' % (path, folder, path, path, folder)) + print('%s Update successful!' % good) + else: + print('%s Photon is up to date!' % good) + +if args.update: # if the user has supplied --update argument + update() + quit() # quitting because files have been changed + +if args.root: # if the user has supplied a url + main_inp = args.root + if main_inp.endswith('/'): # if the url ends with '/' + main_inp = main_inp[:-1] # we will remove it as it can cause problems later in the code +else: # if the user hasn't supplied a url + print('\n' + parser.format_help().lower()) + quit() + +verbose = args.verbose +delay = args.delay or 0 # Delay between requests +timeout = args.timeout or 6 # HTTP request timeout +cook = args.cook or None # Cookie +api = bool(args.api) # extract high entropy strings i.e. API keys and stuff +ninja = bool(args.ninja) # Ninja mode toggle +crawl_level = args.level or 2 # Crawling level +thread_count = args.threads or 2 # Number of threads +only_urls = bool(args.only_urls) # only urls mode is off by default + +# Variables we are gonna use later to store stuff +keys = set() # high entropy strings, prolly secret keys +files = set() # pdf, css, png etc. +intel = set() # emails, website accounts, aws buckets etc. +robots = set() # entries of robots.txt +custom = set() # string extracted by custom regex pattern +failed = set() # urls that photon failed to crawl +scripts = set() # javascript files +external = set() # urls that don't belong to the target i.e. out-of-scope +fuzzable = set() # urls that have get params in them e.g. example.com/page.php?id=2 +endpoints = set() # urls found from javascript files +processed = set() # urls that have been crawled +internal = set([s for s in args.seeds]) # urls that belong to the target i.e. 
in-scope + +everything = [] +bad_intel = set() # unclean intel urls +bad_scripts = set() # unclean javascript file urls + +# If the user hasn't supplied the root url with http(s), we will handle it +if main_inp.startswith('http'): + main_url = main_inp +else: + try: + get('https://' + main_inp) + main_url = 'https://' + main_inp + except: + main_url = 'http://' + main_inp + +schema = main_url.split('//')[0] # https: or http:? + +internal.add(main_url) # adding the root url to internal for crawling + +host = urlparse(main_url).netloc # Extracts host out of the url + +output_dir = args.output or host + +#### +# This function extracts top level domain from a url +#### + +def topLevel(url): + try: + toplevel = tld.get_fld(host, fix_protocol=True) + except tld.exceptions.TldDomainNotFound: + toplevel = urlparse(main_url).netloc + return toplevel + +domain = topLevel(main_url) + +#### +# This function makes requests to webpage and returns response body +#### + +if args.user_agent: + user_agents = args.user_agent.split(',') +else: + with open(sys.path[0] + '/core/user-agents.txt', 'r') as uas: + user_agents = [agent.strip('\n') for agent in uas] + +def requester(url): + processed.add(url) # mark the url as crawled + time.sleep(delay) # pause/sleep the program for specified time + def normal(url): + headers = { + 'Host' : host, # ummm this is the hostname? + 'User-Agent' : random.choice(user_agents), # selecting a random user-agent + 'Accept' : 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language' : 'en-US,en;q=0.5', + 'Accept-Encoding' : 'gzip', + 'DNT' : '1', + 'Connection' : 'close'} + # make request and return response + response = get(url, cookies=cook, headers=headers, verify=False, timeout=timeout, stream=True) + if 'text/html' in response.headers['content-type']: + if response.status_code != '404': + return response.text + else: + response.close() + failed.add(url) + return 'dummy' + else: + response.close() + return 'dummy' + + # developer.facebook.com API + def facebook(url): + return get('https://developers.facebook.com/tools/debug/echo/?q=' + url, verify=False).text + + # pixlr.com API + def pixlr(url): + if url == main_url: + url = main_url + '/' # because pixlr throws error if http://example.com is used + # make request and return response + return get('https://pixlr.com/proxy/?url=' + url, headers={'Accept-Encoding' : 'gzip'}, verify=False).text + + # codebeautify.org API + def code_beautify(url): + headers = { + 'User-Agent' : 'Mozilla/5.0 (X11; Linux x86_64; rv:61.0) Gecko/20100101 Firefox/61.0', + 'Accept' : 'text/plain, */*; q=0.01', + 'Accept-Encoding' : 'gzip', + 'Content-Type' : 'application/x-www-form-urlencoded; charset=UTF-8', + 'Origin' : 'https://codebeautify.org', + 'Connection' : 'close' + } + # make request and return response + return post('https://codebeautify.com/URLService', headers=headers, data='path=' + url, verify=False).text + + # www.photopea.com API + def photopea(url): + # make request and return response + return get('https://www.photopea.com/mirror.php?url=' + url, verify=False).text + + if ninja: # if the ninja mode is enabled + # select a random request function i.e. 
random API + response = random.choice([photopea, normal, facebook, pixlr, code_beautify])(url) + return response or 'dummy' + else: + return normal(url) + +#### +# This function gives verbose output +#### + +def verb(kind, string): + if verbose: + print ('%s %s: %s' % (info, kind, string)) + +#### +# This function extracts links from .xml files +#### + +def xmlParser(response): + return findall(r'(.*?)', response) # regex for extracting urls + +#### +# This function extracts links from robots.txt and sitemap.xml +#### + +def zap(url): + if args.archive: + from plugins.wayback import timeMachine + print ('%s Fetching URLs from archive.org' % run) + if False: + archived_urls = timeMachine(domain, 'domain') + else: + archived_urls = timeMachine(host, 'host') + print ('%s Retrieved %i URLs from archive.org' % (good, len(archived_urls) - 1)) + for url in archived_urls: + verb('Internal page', url) + internal.add(url) + response = get(url + '/robots.txt', verify=False).text # makes request to robots.txt + if '= 4: + verb('Key', match) + keys.add(url + ': ' + match) + +#### +# This function extracts endpoints from JavaScript Code +#### + +def jscanner(url): + response = requester(url) # make request to the url + matches = findall(r'[\'"](/.*?)[\'"]|[\'"](http.*?)[\'"]', response) # extract urls/endpoints + for match in matches: # iterate over the matches, match is a tuple + match = match[0] + match[1] # combining the items because one of them is always empty + if not search(r'[}{><"\']', match) and not match == '/': # making sure it's not some js code + verb('JS enpoint', match) + endpoints.add(match) # add it to the endpoints list + +#### +# This function starts multiple threads for a function +#### + +def threader(function, *urls): + threads = [] # list of threads + urls = urls[0] # because urls is a tuple + for url in urls: # iterating over urls + task = threading.Thread(target=function, args=(url,)) + threads.append(task) + # start threads + for thread in threads: + thread.start() + # wait for all threads to complete their work + for thread in threads: + thread.join() + # delete threads + del threads[:] + +#### +# This function processes the urls and uses a threadpool to execute a function +#### + +def flash(function, links): # This shit is NOT complicated, please enjoy + links = list(links) # convert links (set) to list + if sys.version_info < (3, 2): + for begin in range(0, len(links), thread_count): # range with step + end = begin + thread_count + splitted = links[begin:end] + threader(function, splitted) + progress = end + if progress > len(links): # fix if overflow + progress = len(links) + print('\r%s Progress: %i/%i' % (info, progress, len(links)), end='\r') + sys.stdout.flush() + else: + threadpool = concurrent.futures.ThreadPoolExecutor(max_workers=thread_count) + futures = (threadpool.submit(function, link) for link in links) + for i, _ in enumerate(concurrent.futures.as_completed(futures)): + if i + 1 == len(links) or (i + 1) % thread_count == 0: + print('%s Progress: %i/%i' % (info, i + 1, len(links)), end='\r') + print('') + +then = time.time() # records the time at which crawling started + +# Step 1. Extract urls from robots.txt & sitemap.xml +zap(main_url) + +# this is so the level 1 emails are parsed as well +internal = set(remove_regex(internal, args.exclude)) + +# Step 2. 
Crawl recursively to the limit specified in "crawl_level" +for level in range(crawl_level): + links = remove_regex(internal - processed, args.exclude) # links to crawl = (all links - already crawled links) - links not to crawl + if not links: # if links to crawl are 0 i.e. all links have been crawled + break + elif len(internal) <= len(processed): # if crawled links are somehow more than all links. Possible? ;/ + if len(internal) > 2 + len(args.seeds): # if you know it, you know it + break + print('%s Level %i: %i URLs' % (run, level + 1, len(links))) + try: + flash(extractor, links) + except KeyboardInterrupt: + print('') + break + +if not only_urls: + for match in bad_scripts: + if match.startswith(main_url): + scripts.add(match) + elif match.startswith('/') and not match.startswith('//'): + scripts.add(main_url + match) + elif not match.startswith('http') and not match.startswith('//'): + scripts.add(main_url + '/' + match) + # Step 3. Scan the JavaScript files for enpoints + print('%s Crawling %i JavaScript files' % (run, len(scripts))) + flash(jscanner, scripts) + + for url in internal: + if '=' in url: + fuzzable.add(url) + + for match in bad_intel: + for x in match: # because "match" is a tuple + if x != '': # if the value isn't empty + intel.add(x) + for url in external: + try: + if tld.get_fld(url, fix_protocol=True) in intels: + intel.add(url) + except: + pass + +now = time.time() # records the time at which crawling stopped +diff = (now - then) # finds total time taken + +def timer(diff): + minutes, seconds = divmod(diff, 60) # Changes seconds into minutes and seconds + try: + time_per_request = diff / float(len(processed)) # Finds average time taken by requests + except ZeroDivisionError: + time_per_request = 0 + return minutes, seconds, time_per_request +minutes, seconds, time_per_request = timer(diff) + +# Step 4. 
Save the results +if not os.path.exists(output_dir): # if the directory doesn't exist + os.mkdir(output_dir) # create a new directory + +datasets = [files, intel, robots, custom, failed, internal, scripts, external, fuzzable, endpoints, keys] +dataset_names = ['files', 'intel', 'robots', 'custom', 'failed', 'internal', 'scripts', 'external', 'fuzzable', 'endpoints', 'keys'] + +def writer(datasets, dataset_names, output_dir): + for dataset, dataset_name in zip(datasets, dataset_names): + if dataset: + filepath = output_dir + '/' + dataset_name + '.txt' + if python3: + with open(filepath, 'w+', encoding='utf8') as f: + f.write(str('\n'.join(dataset))) + f.write('\n') + else: + with open(filepath, 'w+') as f: + joined = '\n'.join(dataset) + f.write(str(joined.encode('utf-8'))) + f.write('\n') + +writer(datasets, dataset_names, output_dir) +# Printing out results +print (('%s-%s' % (red, end)) * 50) +for dataset, dataset_name in zip(datasets, dataset_names): + if dataset: + print ('%s %s: %s' % (good, dataset_name.capitalize(), len(dataset))) +print (('%s-%s' % (red, end)) * 50) + +print('%s Total requests made: %i' % (info, len(processed))) +print('%s Total time taken: %i minutes %i seconds' % (info, minutes, seconds)) +print('%s Requests per second: %i' % (info, int(len(processed)/diff))) + +datasets = { +'files': list(files), 'intel': list(intel), 'robots': list(robots), 'custom': list(custom), 'failed': list(failed), 'internal': list(internal), +'scripts': list(scripts), 'external': list(external), 'fuzzable': list(fuzzable), 'endpoints': list(endpoints), 'keys' : list(keys) +} + +if args.dns: + print ('%s Enumerating subdomains' % run) + from plugins.findSubdomains import findSubdomains + subdomains = findSubdomains(domain) + print ('%s %i subdomains found' % (info, len(subdomains))) + writer([subdomains], ['subdomains'], output_dir) + datasets['subdomains'] = subdomains + from plugins.dnsdumpster import dnsdumpster + print ('%s Generating DNS map' % run) + dnsdumpster(domain, output_dir) + +if args.export: + from plugins.exporter import exporter + # exporter(directory, format, datasets) + exporter(output_dir, args.export, datasets) + +print('%s Results saved in %s%s%s directory' % (good, green, output_dir, end)) + +if args.std: + for string in datasets[args.std]: + sys.stdout.write(string + '\n')
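
Note on the verbose output added here: the new -v/--verbose switch is consumed by the small verb() helper this patch introduces, and the crawl routines call it whenever they record something (internal pages, files, keys, JS endpoints). A minimal standalone sketch of that pattern follows; the plain '[!]' tag and the sample values are illustrative stand-ins, not taken from photon.py's coloured prefixes or real findings.

    verbose = True   # photon.py sets this from args.verbose
    info = '[!]'     # stand-in for the coloured [!] prefix built earlier in the script

    def verb(kind, string):
        # print a tagged line only when verbose output was requested
        if verbose:
            print('%s %s: %s' % (info, kind, string))

    verb('Internal page', 'http://example.com/about')   # printed
    verbose = False
    verb('Key', 'dummy-high-entropy-string')             # silently skipped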
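Note on the progress animation fix: the change is the sys.stdout.flush() added to the Python 2 branch of flash(). The progress print ends in '\r' and never emits a newline, and Python 2's line-buffered stdout typically does not flush on '\r' the way Python 3's text I/O does, so without an explicit flush the counter can sit in the buffer and the line appears frozen. A minimal sketch of the pattern, with a placeholder workload in place of the real crawl batches:

    from __future__ import print_function
    import sys
    import time

    links = ['a', 'b', 'c', 'd']   # placeholder workload
    for i, _ in enumerate(links):
        time.sleep(0.2)            # pretend to crawl something
        print('[!] Progress: %i/%i' % (i + 1, len(links)), end='\r')
        sys.stdout.flush()         # the fix: push the '\r' line out immediately on Python 2
    print('')                      # move past the progress line when done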
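For contrast, the Python 3 branch of flash() hands the links to a concurrent.futures thread pool and prints the same progress line as futures complete, so it does not need the explicit flush. A sketch of that branch under assumed placeholder values: worker() and links are stand-ins for extractor()/jscanner() and the crawl queue, and the with-statement is added here for tidy shutdown (the script itself just keeps the executor around).

    import concurrent.futures

    def worker(link):
        pass                        # stand-in for extractor() / jscanner()

    links = ['http://example.com/a', 'http://example.com/b', 'http://example.com/c']
    thread_count = 2

    with concurrent.futures.ThreadPoolExecutor(max_workers=thread_count) as pool:
        futures = (pool.submit(worker, link) for link in links)
        for i, _ in enumerate(concurrent.futures.as_completed(futures)):
            # update the counter on every full batch and on the last item
            if i + 1 == len(links) or (i + 1) % thread_count == 0:
                print('[!] Progress: %i/%i' % (i + 1, len(links)), end='\r')
    print('')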