From 0bc02f3a3f6a63c415e80cc625bef123d2155c5a Mon Sep 17 00:00:00 2001 From: "Artiom N." Date: Sat, 27 Apr 2024 17:36:35 +0300 Subject: [PATCH] Bug #23 fixed --- markdown_toolset/image_downloader.py | 4 ++++ markdown_toolset/www_tools.py | 14 ++++++++++---- tests/data/image_mime_incorrect.md | 3 +++ tests/test_www_tools.py | 23 ++++++++++++++++++++++- 4 files changed, 39 insertions(+), 5 deletions(-) create mode 100644 tests/data/image_mime_incorrect.md diff --git a/markdown_toolset/image_downloader.py b/markdown_toolset/image_downloader.py index 97608c0..5c6f843 100644 --- a/markdown_toolset/image_downloader.py +++ b/markdown_toolset/image_downloader.py @@ -137,12 +137,16 @@ def download_images(self, images: List[Union[str, ImageLink]]) -> dict: ) continue + logging.debug('Image is URL: %s', is_url(image_download_url)) + image_filename, image_content = ( self._get_remote_image(image_download_url, image_num, images_count) if is_url(image_download_url) else ImageDownloader._get_local_image(Path(image_download_url)) ) + logging.debug('Guessed image filename: %s', image_filename) + if image_filename is None: logging.warning( 'Empty image filename, probably this is incorrect link: "%s".', image_download_url diff --git a/markdown_toolset/www_tools.py b/markdown_toolset/www_tools.py index 92168f1..2a2c082 100644 --- a/markdown_toolset/www_tools.py +++ b/markdown_toolset/www_tools.py @@ -7,6 +7,7 @@ from mimetypes import guess_extension import os import re +from urllib.parse import urlparse, urlunparse import requests from .string_tools import slugify @@ -15,8 +16,7 @@ NECESSARY_HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:96.0) Gecko/20100101 Firefox/96.0'} __protocol_prefix_replace_regex = re.compile(r'^\s*(:?(?:(?:http|ftp)+s?|file)://)', re.IGNORECASE) - -# TODO: Use urllib!!! +__protocol_prefix_slashes_replace_regex = re.compile(r'^\s*:?//', re.IGNORECASE) def is_url(url: str, allowed_url_prefixes=('http', 'ftp', 'https', 'ftps')) -> bool: @@ -37,7 +37,7 @@ def remove_protocol_prefix(url: str) -> str: Remove prefixes like http, ftp, HTTPS, and other from the URL. """ - return __protocol_prefix_replace_regex.sub('', url) + return __protocol_prefix_slashes_replace_regex.sub('', str(urlunparse(urlparse(url)._replace(scheme='')))) def download_from_url(url: str, timeout: float = None): @@ -48,6 +48,7 @@ def download_from_url(url: str, timeout: float = None): :raise OSError: when HTTP status is not 200. """ + # todo: Add urlparse()? url = url.split()[0] try: @@ -70,8 +71,11 @@ def get_filename_from_url(req: requests.Response) -> Optional[str]: Get filename from url and, if not found, try to get from content-disposition. """ + logging.debug('URL from request: %s', req.url) + if req and req.url.find('/'): - result = req.url.rsplit('/', 1)[1] + result = urlparse(req.url).path + logging.debug('Filename from URL: %s', result) else: cd = req.headers.get('content-disposition') @@ -80,6 +84,8 @@ def get_filename_from_url(req: requests.Response) -> Optional[str]: file_name = re.findall('filename=(.+)', cd) + logging.debug('Filename from "filename=" part: %s', file_name) + if len(file_name) == 0: return None diff --git a/tests/data/image_mime_incorrect.md b/tests/data/image_mime_incorrect.md new file mode 100644 index 0000000..dbc0dab --- /dev/null +++ b/tests/data/image_mime_incorrect.md @@ -0,0 +1,3 @@ +![](https://cubox.pro/c/filters:no_upscale()?valid=false&imageUrl=https%3A%2F%2Fpicx.zhimg.com%2F50%2Fv2-53de590b6bb3f42d1a06d28c806c698d_720w.jpg%3Fsource%3D1940ef5c) +![](https://cubox.pro/c/filters:no_upscale()?valid=false&imageUrl=https%3A%2F%2Fpica.zhimg.com%2F50%2Fv2-872d10f75dfa52172835fe6fbf22c5fe_720w.jpg%3Fsource%3D1940ef5c) +![](https://cubox.pro/c/filters:no_upscale()?valid=false&imageUrl=https%3A%2F%2Fpic1.zhimg.com%2F50%2Fv2-c4b89a30d2a3fe1897cfe24388ec935e_720w.jpg%3Fsource%3D1940ef5c) diff --git a/tests/test_www_tools.py b/tests/test_www_tools.py index 1e1c102..fec1f40 100644 --- a/tests/test_www_tools.py +++ b/tests/test_www_tools.py @@ -1,4 +1,6 @@ -from markdown_toolset.www_tools import remove_protocol_prefix, is_url +import requests + +from markdown_toolset.www_tools import remove_protocol_prefix, is_url, get_filename_from_url, download_from_url class TestProtocolPrefixesFunctions: @@ -18,3 +20,22 @@ def test_url_checker(self): assert is_url('Https://test') == True # noqa assert is_url('FTPS://test') == True # noqa assert is_url('file://test') == False # noqa + + def test_get_filename_from_url(self): + # Mock response. + req = requests.Response() + req.status_code = 200 + req.headers['content-type'] = 'image/jpg' + + req.url = 'https://image.cubox.pro/cardImg/26p25dhia8yismewd0i3zptqzluz1ydufavhzlog6yjr6b6yle.jpg?imageMogr2/quality/90/ignore-error/1' + assert get_filename_from_url(req) == 'cardimg26p25dhia8yismewd0i3zptqzluz1ydufavhzlog6yjr6b6yle.jpg' + + req.url = 'https://image.cubox.pro/cardImg/53fjbjlzb8a72slatcat03qmae7rw44qh3rvyck9548bqg06a2.jpg?imageMogr2/quality/90/ignore-error/1' + assert get_filename_from_url(req) == 'cardimg53fjbjlzb8a72slatcat03qmae7rw44qh3rvyck9548bqg06a2.jpg' + + url = ( + 'https://cubox.pro/c/filters:no_upscale()?valid=false&imageUrl=https%3A%2F%2Fpic1.zhimg.com' + '%2F50%2Fv2-c4b89a30d2a3fe1897cfe24388ec935e_720w.jpg%3Fsource%3D1940ef5c' + ) + req = download_from_url(url) + assert get_filename_from_url(req) == 'cardimgo2sqp98phc0gflafoxr829sjojo4vouo8twjaqycdtakasiqc.jpg'