diff --git a/blc/__main__.py b/blc/__main__.py
index f7a92f8..98bd415 100644
--- a/blc/__main__.py
+++ b/blc/__main__.py
@@ -40,6 +40,7 @@ def main(args):
         "password": None,
         "smtp_server": None,
         "recipient": None,
+        "browser_sleep": None,
     }
 
     if not config_args.debug:
@@ -81,6 +82,9 @@
                         help='It represent the email where send the report')
     parser.add_argument('-n', '--deep-scan', action='store_true',
                         help='Enable the deep scan')
+    parser.add_argument('-b', '--browser_sleep', type=float,
+                        help='Enable browser-based rendering and use '
+                             'this value as its sleep time')
 
     args = parser.parse_args()
     # We verify the dependency
@@ -103,6 +107,7 @@
         target,
         delay=args.delay if args.delay is not None else 1.0,
         deep_scan=args.deep_scan,
+        browser_sleep=args.browser_sleep,
     )
     if conn:
         checker.conn = conn
diff --git a/blc/checker.py b/blc/checker.py
index a4184fb..3987d36 100644
--- a/blc/checker.py
+++ b/blc/checker.py
@@ -25,7 +25,8 @@ class Checker:
     just verify the availability of these URL
     """
 
-    def __init__(self, host: str, delay: int = 1, deep_scan: bool = False):
+    def __init__(self, host: str, delay: int = 1, deep_scan: bool = False,
+                 browser_sleep: float = None):
         """Init the checker."""
         # We config the logger
         self.logging = logging.getLogger(f'checker({host})')
@@ -37,6 +38,9 @@ def __init__(self, host: str, delay: int = 1, deep_scan: bool = False):
         self.conn.headers.update({
             "User-Agent": "BrokenLinkChecker/1.0",
         })
+        self.timeout = 2
+        self.browser_sleep = browser_sleep
+        self.max_download_size = 1048576  # 1 MiB
 
         self.host = host
 
@@ -120,9 +124,10 @@
         # We make a connection
         try:
             if self.is_same_host(url):
-                response = self.conn.get(url, timeout=2, stream=True)
+                response = self.conn.get(url, timeout=self.timeout,
+                                         stream=True)
             else:
-                response = self.conn.head(url, timeout=2)
+                response = self.conn.head(url, timeout=self.timeout)
         except requests.exceptions.ReadTimeout:
             self.urls[url]['result'] = False, None, "Timeout!"
         except requests.exceptions.ConnectionError:
@@ -156,18 +161,25 @@ def update_list(self, response: requests_html.HTMLResponse) -> None:
         # We verify if the content is a webpage
         if self.REGEX_CONTENT_TYPE.match(response.headers['Content-Type']):
             self.logging.debug('Getting of the webpage...')
+
             # We execute the js script
-            try:
-                # We wait 10s to load the js and in case of connection latency
-                response.html.render(timeout=10, sleep=10)
-            except (AttributeError, requests_html.etree.ParserError):
-                # we read max 2**20 bytes by precaution
+            data = None
+            if self.browser_sleep is not None:
+                try:
+                    # Wait for the js to run, allowing for connection latency
+                    response.html.render(
+                        timeout=self.timeout,
+                        sleep=self.browser_sleep)
+                    data = response.html.html
+                except (AttributeError, requests_html.etree.ParserError):
+                    pass
+
+            if data is None:
+                # read at most max_download_size bytes as a precaution
                 response.raw.decode_content = True
-                data = response.raw.read(1048576)
+                data = response.raw.read(self.max_download_size)
                 self.logging.debug('Decoding of data...')
                 data = data.decode()
-            else:
-                data = response.html.html
 
             # We verify if we are not already got this content
             # in the previous request
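
Usage note: with this patch, JavaScript rendering becomes opt-in. Assuming the
package is invoked as a module and takes the target URL positionally (both
inferred from `__main__.py`, which is only partially shown here), enabling
rendering with a 5-second post-load sleep would look like:

    python -m blc https://example.com -b 5

When `-b`/`--browser_sleep` is omitted, `args.browser_sleep` stays `None` and
`Checker` skips rendering entirely, reading at most `max_download_size` bytes
of the raw body instead.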
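For reviewers unfamiliar with requests_html, here is a minimal standalone
sketch of the render call this patch gates behind `browser_sleep`. The URL and
the sleep value are illustrative, not part of the patch; note that the first
render() call downloads a Chromium build via pyppeteer:

    from requests_html import HTMLSession

    session = HTMLSession()
    # Hypothetical JS-heavy page; any URL works for the demonstration.
    response = session.get('https://example.com', timeout=2)

    # render() loads the page in headless Chromium; `timeout` bounds the
    # page load, and `sleep` pauses after loading so late JS can finish.
    response.html.render(timeout=2, sleep=5.0)

    # After rendering, .html.html holds the post-JS markup.
    print(response.html.html[:200])

In the patch, a failed render leaves `data` as `None`, which triggers the
bounded raw read instead of aborting the check.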