refactor: switching to requests_html (#67)
## refactor: switching to requests_html
- Added support for dynamic webpages (see the sketch after this message)
- Removed multi-threaded checking (checkers now run serially)

## fix: problem of URLs with HTML code inside
- Added URL cleaning to strip the unwanted trailing parts

## feat: enable/disable browser
- Updated the argument parser
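
For context, the refactor swaps `requests.session()` for `requests_html.HTMLSession`, whose `render()` executes a page's JavaScript in headless Chromium (pulled in via pyppeteer) before links are extracted. A minimal sketch of that call outside the project, with a placeholder URL and sleep value:

```python
import requests_html

session = requests_html.HTMLSession()
r = session.get("https://example.com")  # placeholder URL

# render() starts headless Chromium (pyppeteer downloads it on first use),
# runs the page's JavaScript, then exposes the rendered markup.
r.html.render(sleep=1.0, timeout=2)
data = r.html.html  # HTML after the scripts have run
```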
pythonbrad authored Sep 21, 2022
1 parent da0289d commit 1ab3d8a
Showing 4 changed files with 99 additions and 32 deletions.
30 changes: 21 additions & 9 deletions blc/__main__.py
@@ -9,7 +9,7 @@
from configparser import ConfigParser
import sys
import logging
import threading
# import threading
import coloredlogs


@@ -40,6 +40,7 @@ def main(args):
"password": None,
"smtp_server": None,
"recipient": None,
"browser_sleep": None,
}

if not config_args.debug:
@@ -81,6 +82,9 @@
help='It represent the email where send the report')
parser.add_argument('-n', '--deep-scan', action='store_true',
help='Enable the deep scan')
parser.add_argument('-b', '--browser_sleep', type=float,
help='Enable browser extension '
'(if params used) and set his sleep time')
args = parser.parse_args()

# We verify the dependency
@@ -93,22 +97,30 @@
else:
pass

checker_threads = []
report = {}
# checker_threads = []
conn = None

for target in args.host.split(','):
# We initialize the checker
checker = Checker(
target,
delay=args.delay if args.delay is not None else 1.0,
deep_scan=args.deep_scan,
browser_sleep=args.browser_sleep,
)
if conn:
checker.conn = conn
else:
conn = checker.conn
# We config the shared dict
report[target] = checker.urls

t = threading.Thread(target=checker.run)
checker_threads.append(t)
t.daemon = True
# t = threading.Thread(target=checker.run)
# checker_threads.append(t)
# t.daemon = True

checker.run()

# We initialize the notifier
notifier = Notifier(
@@ -118,12 +130,12 @@
)

# We start the checkers
for thread in checker_threads:
logging.info('Checking of %s' % args.host)
thread.start()
# for thread in checker_threads:
# logging.info('Checking of %s' % args.host)
# thread.start()

# We wait for the completion
[thread.join() for thread in checker_threads]
# [thread.join() for thread in checker_threads]

# We build the report
msg = 'Hello, the report of the broken link checker is ready.\n'
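With the checker threads commented out above, the targets are now processed one after another over a shared session. A minimal sketch of driving the refactored `Checker` directly, without the CLI (the import path and host list are assumptions):

```python
from blc.checker import Checker  # assumed import path for the package

conn = None
report = {}
for target in ["https://example.com"]:  # placeholder host list
    checker = Checker(target, delay=1.0, deep_scan=False, browser_sleep=1.0)
    if conn:
        checker.conn = conn           # reuse the first checker's HTTP session
    else:
        conn = checker.conn
    report[target] = checker.urls     # shared dict that run() fills in
    checker.run()                     # runs serially, no thread involved
```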
59 changes: 46 additions & 13 deletions blc/checker.py
@@ -3,14 +3,16 @@
"""Checker module."""

import requests
import requests_html
from urllib.parse import urljoin
import time
import logging
import re
import difflib
import html

# We change the log level for requests’s logger
logging.getLogger("requests").setLevel(logging.WARNING)
logging.getLogger("requests_html").setLevel(logging.WARNING)


class Checker:
@@ -23,18 +25,22 @@ class Checker:
just verify the availability of these URL
"""

def __init__(self, host: str, delay: int = 1, deep_scan: bool = False):
def __init__(self, host: str, delay: int = 1, deep_scan: bool = False,
browser_sleep: float = None):
"""Init the checker."""
# We config the logger
self.logging = logging.getLogger(f'checker({host})')
self.logging.setLevel(logging.DEBUG)
self.logging.debug('We initialize the checker for %s' % host)

# We config the connection
self.conn = requests.session()
self.conn = requests_html.HTMLSession()
self.conn.headers.update({
"User-Agent": "BrokenLinkChecker/1.0",
})
self.timeout = 2
self.browser_sleep = browser_sleep
self.max_download_size = 1048576 # 1MB

self.host = host

@@ -72,6 +78,11 @@ def __init__(self, host: str, delay: int = 1, deep_scan: bool = False):
re.IGNORECASE
)

self.REGEX_CLEAN_URL = re.compile(
r"[-A-Z0-9+&@#/%?=~_|!:,.;]*",
re.IGNORECASE
)

# Regex to verify the content type
self.REGEX_CONTENT_TYPE = re.compile(
r"text/(xml|html)"
@@ -98,7 +109,7 @@ def is_same_host(self, url):
else:
return False

def check(self, url: str) -> requests.Response:
def check(self, url: str) -> requests_html.HTMLResponse:
"""
Verify if a link is broken or not.
@@ -113,9 +124,10 @@ def check(self, url: str) -> requests.Response:
# We make a connection
try:
if self.is_same_host(url):
response = self.conn.get(url, timeout=2, stream=True)
response = self.conn.get(url, timeout=self.timeout,
stream=True)
else:
response = self.conn.head(url, timeout=2)
response = self.conn.head(url, timeout=self.timeout)
except requests.exceptions.ReadTimeout:
self.urls[url]['result'] = False, None, "Timeout!"
except requests.exceptions.ConnectionError:
@@ -139,7 +151,7 @@ def check(self, url: str) -> requests.Response:
)
return None

def update_list(self, response: requests.Response) -> None:
def update_list(self, response: requests_html.HTMLResponse) -> None:
"""
Update the list of URLs to check
based on the URLs found in a webpage.
@@ -149,11 +161,27 @@ def update_list(self, response: requests.Response) -> None:
# We verify if the content is a webpage
if self.REGEX_CONTENT_TYPE.match(response.headers['Content-Type']):
self.logging.debug('Getting of the webpage...')
# we read max 2**20 bytes by precaution
response.raw.decode_content = True
data = response.raw.read(1048576)
self.logging.debug('Decoding of data...')
data = data.decode()
data = None

# We execute the js script
if self.browser_sleep is not None:
try:
# We wait to load the js and in case of connection latency
response.html.render(
timeout=self.timeout,
sleep=self.browser_sleep)
data = response.html.html
except (AttributeError, requests_html.etree.ParserError):
pass
except requests_html.pyppeteer.errors.TimeoutError:
pass

if data is None:
# we read fixed bytes by precaution
response.raw.decode_content = True
data = response.raw.read(self.max_download_size)
self.logging.debug('Decoding of data...')
data = data.decode()

# We verify if we are not already got this content
# in the previous request
@@ -170,8 +198,13 @@ def update_list(self, response: requests.Response) -> None:

self.logging.debug('Getting of the URLs...')

# Some url can be escape by the browser
data = html.unescape(data)

# We build a list of cleaned links
urls = [
ii for i in self.REGEX_TEXT_URL.findall(data)
self.REGEX_CLEAN_URL.findall(ii)[0]
for i in self.REGEX_TEXT_URL.findall(data)
if i for ii in i if ii
]

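The URL cleaning added in `update_list` amounts to unescaping HTML entities and then keeping only the leading run of URL-safe characters, so markup that clings to an extracted link gets dropped. A standalone sketch of that step with a made-up link (the regex is copied from the diff above):

```python
import html
import re

# Same character class as REGEX_CLEAN_URL in the diff above.
REGEX_CLEAN_URL = re.compile(r"[-A-Z0-9+&@#/%?=~_|!:,.;]*", re.IGNORECASE)

raw = "https://example.com/page?a=1&amp;b=2&quot;&gt;"   # made-up extracted link
unescaped = html.unescape(raw)        # 'https://example.com/page?a=1&b=2">'
cleaned = REGEX_CLEAN_URL.findall(unescaped)[0]
print(cleaned)                        # https://example.com/page?a=1&b=2
```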
40 changes: 31 additions & 9 deletions requirements.txt
@@ -1,11 +1,33 @@
build==0.7.0
flake8==4.0.1
mccabe==0.6.1
appdirs==1.4.4
beautifulsoup4==4.11.1
bs4==0.0.1
build==0.8.0
certifi==2022.6.15.1
charset-normalizer==2.1.1
coloredlogs==15.0.1
cssselect==1.1.0
fake-useragent==0.1.11
flake8==5.0.4
humanfriendly==10.0
idna==3.3
importlib-metadata==4.12.0
lxml==4.9.1
mccabe==0.7.0
packaging==21.3
pep517==0.12.0
pycodestyle==2.8.0
pyflakes==2.4.0
pyparsing==3.0.8
parse==1.19.0
pep517==0.13.0
pycodestyle==2.9.1
pyee==8.2.2
pyflakes==2.5.0
pyparsing==3.0.9
pyppeteer==1.0.2
pyquery==1.4.3
requests==2.28.1
requests-html==0.10.0
soupsieve==2.3.2.post1
tomli==2.0.1
requests
coloredlogs
tqdm==4.64.1
urllib3==1.26.12
w3lib==2.0.1
websockets==10.3
zipp==3.8.1
2 changes: 1 addition & 1 deletion setup.py
@@ -17,7 +17,7 @@
long_description=LONG_DESCRIPTION,
long_description_content_type="text/markdown",
packages=find_packages(where="src"),
install_requires=["requests"],
install_requires=["requests-html", "coloredlogs"],
keywords=["link", "url", "broken", "check"],
classifiers=[
"Topic :: Internet :: WWW/HTTP",
