feat: enable/disable browser
pythonbrad committed Sep 21, 2022
1 parent 747dff5 commit 5a2bd1b
Showing 2 changed files with 28 additions and 11 deletions.
5 changes: 5 additions & 0 deletions blc/__main__.py
@@ -40,6 +40,7 @@ def main(args):
"password": None,
"smtp_server": None,
"recipient": None,
"browser_sleep": None,
}

if not config_args.debug:
@@ -81,6 +82,9 @@ def main(args):
help='The email address the report is sent to')
parser.add_argument('-n', '--deep-scan', action='store_true',
help='Enable the deep scan')
parser.add_argument('-b', '--browser_sleep', type=float,
help='Enable the browser rendering '
'(when this option is set) and define its sleep time')
args = parser.parse_args()

# We verify the dependency
@@ -103,6 +107,7 @@ def main(args):
target,
delay=args.delay if args.delay is not None else 1.0,
deep_scan=args.deep_scan,
browser_sleep=args.browser_sleep,
)
if conn:
checker.conn = conn
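
For context, here is a minimal sketch of how the new option reaches the checker, assuming Checker is importable from blc.checker as the module layout suggests (the URL and the 2.0-second sleep are arbitrary illustrations; omitting browser_sleep leaves browser rendering disabled):

from blc.checker import Checker

# Browser rendering disabled (the default): pages are read as raw bytes.
plain_checker = Checker("https://example.com", delay=1.0, deep_scan=False)

# Browser rendering enabled: JavaScript is executed and the renderer
# sleeps 2.0 seconds after each render to absorb loading latency.
js_checker = Checker("https://example.com", delay=1.0, deep_scan=False,
                     browser_sleep=2.0)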
34 changes: 23 additions & 11 deletions blc/checker.py
@@ -25,7 +25,8 @@ class Checker:
just verify the availability of these URLs
"""

def __init__(self, host: str, delay: int = 1, deep_scan: bool = False):
def __init__(self, host: str, delay: int = 1, deep_scan: bool = False,
browser_sleep: float = None):
"""Init the checker."""
# We config the logger
self.logging = logging.getLogger(f'checker({host})')
@@ -37,6 +38,9 @@ def __init__(self, host: str, delay: int = 1, deep_scan: bool = False):
self.conn.headers.update({
"User-Agent": "BrokenLinkChecker/1.0",
})
self.timeout = 2
self.browser_sleep = browser_sleep
self.max_download_size = 1048576 # 1MB

self.host = host

@@ -120,9 +124,10 @@ def check(self, url: str) -> requests_html.HTMLResponse:
# We make a connection
try:
if self.is_same_host(url):
response = self.conn.get(url, timeout=2, stream=True)
response = self.conn.get(url, timeout=self.timeout,
stream=True)
else:
response = self.conn.head(url, timeout=2)
response = self.conn.head(url, timeout=self.timeout)
except requests.exceptions.ReadTimeout:
self.urls[url]['result'] = False, None, "Timeout!"
except requests.exceptions.ConnectionError:
@@ -156,18 +161,25 @@ def update_list(self, response: requests_html.HTMLResponse) -> None:
# We verify if the content is a webpage
if self.REGEX_CONTENT_TYPE.match(response.headers['Content-Type']):
self.logging.debug('Getting of the webpage...')

# We execute the js script
try:
# We wait 10s to load the js and in case of connection latency
response.html.render(timeout=10, sleep=10)
except (AttributeError, requests_html.etree.ParserError):
# we read max 2**20 bytes by precaution
if self.browser_sleep is not None:
data = None
try:
# We wait for the js to load and to absorb connection latency
response.html.render(
timeout=self.timeout,
sleep=self.browser_sleep)
data = response.html.html
except (AttributeError, requests_html.etree.ParserError):
pass

if data is None:
# we read at most self.max_download_size bytes as a precaution
response.raw.decode_content = True
data = response.raw.read(1048576)
data = response.raw.read(self.max_download_size)
self.logging.debug('Decoding of data...')
data = data.decode()
else:
data = response.html.html

# We verify if we are not already got this content
# in the previous request
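
The new update_list logic boils down to: render with the headless browser only when a sleep value was given, and fall back to a bounded raw read whenever rendering is disabled or fails. A standalone sketch of that pattern, using made-up names (fetch_page, MAX_DOWNLOAD) and mirroring the requests_html calls from the diff:

import requests_html

MAX_DOWNLOAD = 1048576  # 1 MiB cap on raw reads, as in the diff

def fetch_page(session, url, browser_sleep=None, timeout=2):
    """Return the page HTML, rendering JavaScript only when browser_sleep is set."""
    response = session.get(url, timeout=timeout, stream=True)
    data = None
    if browser_sleep is not None:
        try:
            # Execute the page's JavaScript, then sleep to absorb latency.
            response.html.render(timeout=timeout, sleep=browser_sleep)
            data = response.html.html
        except (AttributeError, requests_html.etree.ParserError):
            pass
    if data is None:
        # Rendering disabled or failed: read a bounded amount of raw content.
        response.raw.decode_content = True
        data = response.raw.read(MAX_DOWNLOAD).decode()
    return data

A quick usage example, assuming the session is the same requests_html.HTMLSession type the checker appears to hold in self.conn:

session = requests_html.HTMLSession()
html = fetch_page(session, "https://example.com", browser_sleep=2.0)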
