refactor: switching to requests_html (#67)
## refactor: switching to requests_html
- Added support for dynamic webpages (see the sketch after this message)
- Removed multi-threaded checking (checkers now run serially)

## fix: problem of URLs with HTML code inside
- Added URL cleaning to strip the unwanted trailing parts

## feat: enable/disable browser
- Updated the argument parser
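
For context, the refactor swaps `requests.session()` for `requests_html.HTMLSession`, whose `render()` executes a page's JavaScript in headless Chromium (pulled in via pyppeteer) before links are extracted. A minimal sketch of that call outside the project, with a placeholder URL and sleep value:

```python
import requests_html

session = requests_html.HTMLSession()
r = session.get("https://example.com")  # placeholder URL

# render() starts headless Chromium (pyppeteer downloads it on first use),
# runs the page's JavaScript, then exposes the rendered markup.
r.html.render(sleep=1.0, timeout=2)
data = r.html.html  # HTML after the scripts have run
```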
pythonbrad authored Sep 21, 2022
1 parent da0289d commit 1ab3d8a
Showing 4 changed files with 99 additions and 32 deletions.
30 changes: 21 additions & 9 deletions blc/__main__.py
@@ -9,7 +9,7 @@
from configparser import ConfigParser
import sys
import logging
import threading
# import threading
import coloredlogs


@@ -40,6 +40,7 @@ def main(args):
"password": None,
"smtp_server": None,
"recipient": None,
"browser_sleep": None,
}

if not config_args.debug:
@@ -81,6 +82,9 @@
help='It represent the email where send the report')
parser.add_argument('-n', '--deep-scan', action='store_true',
help='Enable the deep scan')
parser.add_argument('-b', '--browser_sleep', type=float,
help='Enable browser extension '
'(if params used) and set his sleep time')
args = parser.parse_args()

# We verify the dependency
@@ -93,22 +97,30 @@
else:
pass

checker_threads = []
report = {}
# checker_threads = []
conn = None

for target in args.host.split(','):
# We initialize the checker
checker = Checker(
target,
delay=args.delay if args.delay is not None else 1.0,
deep_scan=args.deep_scan,
browser_sleep=args.browser_sleep,
)
if conn:
checker.conn = conn
else:
conn = checker.conn
# We config the shared dict
report[target] = checker.urls

t = threading.Thread(target=checker.run)
checker_threads.append(t)
t.daemon = True
# t = threading.Thread(target=checker.run)
# checker_threads.append(t)
# t.daemon = True

checker.run()

# We initialize the notifier
notifier = Notifier(
@@ -118,12 +130,12 @@
)

# We start the checkers
for thread in checker_threads:
logging.info('Checking of %s' % args.host)
thread.start()
# for thread in checker_threads:
# logging.info('Checking of %s' % args.host)
# thread.start()

# We wait for the completion
[thread.join() for thread in checker_threads]
# [thread.join() for thread in checker_threads]

# We build the report
msg = 'Hello, the report of the broken link checker is ready.\n'
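With the checker threads commented out above, the targets are now processed one after another over a shared session. A minimal sketch of driving the refactored `Checker` directly, without the CLI (the import path and host list are assumptions):

```python
from blc.checker import Checker  # assumed import path for the package

conn = None
report = {}
for target in ["https://example.com"]:  # placeholder host list
    checker = Checker(target, delay=1.0, deep_scan=False, browser_sleep=1.0)
    if conn:
        checker.conn = conn           # reuse the first checker's HTTP session
    else:
        conn = checker.conn
    report[target] = checker.urls     # shared dict that run() fills in
    checker.run()                     # runs serially, no thread involved
```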
59 changes: 46 additions & 13 deletions blc/checker.py
@@ -3,14 +3,16 @@
"""Checker module."""

import requests
import requests_html
from urllib.parse import urljoin
import time
import logging
import re
import difflib
import html

# We change the log level for requests’s logger
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("requests_html").setLevel(logging.WARNING)


class Checker:
@@ -23,18 +25,22 @@ class Checker:
just verify the availability of these URL
"""

def __init__(self, host: str, delay: int = 1, deep_scan: bool = False):
def __init__(self, host: str, delay: int = 1, deep_scan: bool = False,
browser_sleep: float = None):
"""Init the checker."""
# We config the logger
self.logging = logging.getLogger(f'checker({host})')
self.logging.setLevel(logging.DEBUG)
self.logging.debug('We initialize the checker for %s' % host)

# We config the connection
self.conn = requests.session()
self.conn = requests_html.HTMLSession()
self.conn.headers.update({
"User-Agent": "BrokenLinkChecker/1.0",
})
self.timeout = 2
self.browser_sleep = browser_sleep
self.max_download_size = 1048576 # 1MB

self.host = host

@@ -72,6 +78,11 @@ def __init__(self, host: str, delay: int = 1, deep_scan: bool = False):
re.IGNORECASE
)

self.REGEX_CLEAN_URL = re.compile(
r"[-A-Z0-9+&@#/%?=~_|!:,.;]*",
re.IGNORECASE
)

# Regex to verify the content type
self.REGEX_CONTENT_TYPE = re.compile(
r"text/(xml|html)"
@@ -98,7 +109,7 @@ def is_same_host(self, url):
else:
return False

def check(self, url: str) -> requests.Response:
def check(self, url: str) -> requests_html.HTMLResponse:
"""
Verify if a link is broken or not.
@@ -113,9 +124,10 @@ def check(self, url: str) -> requests.Response:
# We make a connection
try:
if self.is_same_host(url):
response = self.conn.get(url, timeout=2, stream=True)
response = self.conn.get(url, timeout=self.timeout,
stream=True)
else:
response = self.conn.head(url, timeout=2)
response = self.conn.head(url, timeout=self.timeout)
except requests.exceptions.ReadTimeout:
self.urls[url]['result'] = False, None, "Timeout!"
except requests.exceptions.ConnectionError:
@@ -139,7 +151,7 @@ def check(self, url: str) -> requests.Response:
)
return None

def update_list(self, response: requests.Response) -> None:
def update_list(self, response: requests_html.HTMLResponse) -> None:
"""
Update the list of URLs to check
based on the URLs found in a webpage.
@@ -149,11 +161,27 @@ def update_list(self, response: requests.Response) -> None:
# We verify if the content is a webpage
if self.REGEX_CONTENT_TYPE.match(response.headers['Content-Type']):
self.logging.debug('Getting of the webpage...')
# we read max 2**20 bytes by precaution
response.raw.decode_content = True
data = response.raw.read(1048576)
self.logging.debug('Decoding of data...')
data = data.decode()
data = None

# We execute the js script
if self.browser_sleep is not None:
try:
# We wait to load the js and in case of connection latency
response.html.render(
timeout=self.timeout,
sleep=self.browser_sleep)
data = response.html.html
except (AttributeError, requests_html.etree.ParserError):
pass
except requests_html.pyppeteer.errors.TimeoutError:
pass

if data is None:
# we read fixed bytes by precaution
response.raw.decode_content = True
data = response.raw.read(self.max_download_size)
self.logging.debug('Decoding of data...')
data = data.decode()

# We verify if we are not already got this content
# in the previous request
@@ -170,8 +198,13 @@ def update_list(self, response: requests.Response) -> None:

self.logging.debug('Getting of the URLs...')

# Some url can be escape by the browser
data = html.unescape(data)

# We build a list of cleaned links
urls = [
ii for i in self.REGEX_TEXT_URL.findall(data)
self.REGEX_CLEAN_URL.findall(ii)[0]
for i in self.REGEX_TEXT_URL.findall(data)
if i for ii in i if ii
]

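The URL cleaning added in `update_list` amounts to unescaping HTML entities and then keeping only the leading run of URL-safe characters, so markup that clings to an extracted link gets dropped. A standalone sketch of that step with a made-up link (the regex is copied from the diff above):

```python
import html
import re

# Same character class as REGEX_CLEAN_URL in the diff above.
REGEX_CLEAN_URL = re.compile(r"[-A-Z0-9+&@#/%?=~_|!:,.;]*", re.IGNORECASE)

raw = "https://example.com/page?a=1&amp;b=2&quot;&gt;"   # made-up extracted link
unescaped = html.unescape(raw)        # 'https://example.com/page?a=1&b=2">'
cleaned = REGEX_CLEAN_URL.findall(unescaped)[0]
print(cleaned)                        # https://example.com/page?a=1&b=2
```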
40 changes: 31 additions & 9 deletions requirements.txt
@@ -1,11 +1,33 @@
build==0.7.0
flake8==4.0.1
mccabe==0.6.1
appdirs==1.4.4
beautifulsoup4==4.11.1
bs4==0.0.1
build==0.8.0
certifi==2022.6.15.1
charset-normalizer==2.1.1
coloredlogs==15.0.1
cssselect==1.1.0
fake-useragent==0.1.11
flake8==5.0.4
humanfriendly==10.0
idna==3.3
importlib-metadata==4.12.0
lxml==4.9.1
mccabe==0.7.0
packaging==21.3
pep517==0.12.0
pycodestyle==2.8.0
pyflakes==2.4.0
pyparsing==3.0.8
parse==1.19.0
pep517==0.13.0
pycodestyle==2.9.1
pyee==8.2.2
pyflakes==2.5.0
pyparsing==3.0.9
pyppeteer==1.0.2
pyquery==1.4.3
requests==2.28.1
requests-html==0.10.0
soupsieve==2.3.2.post1
tomli==2.0.1
requests
coloredlogs
tqdm==4.64.1
urllib3==1.26.12
w3lib==2.0.1
websockets==10.3
zipp==3.8.1
2 changes: 1 addition & 1 deletion setup.py
@@ -17,7 +17,7 @@
long_description=LONG_DESCRIPTION,
long_description_content_type="text/markdown",
packages=find_packages(where="src"),
install_requires=["requests"],
install_requires=["requests-html", "coloredlogs"],
keywords=["link", "url", "broken", "check"],
classifiers=[
"Topic :: Internet :: WWW/HTTP",
