From fd1211b1a91143468488f5ef90c152e9b79df20f Mon Sep 17 00:00:00 2001 From: benoit74 Date: Tue, 7 Jan 2025 10:40:46 +0000 Subject: [PATCH 1/4] Use scraperlib 5.0.0rc4 --- scraper/pyproject.toml | 6 +----- scraper/src/mindtouch2zim/asset.py | 9 +-------- 2 files changed, 2 insertions(+), 13 deletions(-) diff --git a/scraper/pyproject.toml b/scraper/pyproject.toml index e90136b..0ffaaf0 100644 --- a/scraper/pyproject.toml +++ b/scraper/pyproject.toml @@ -8,8 +8,7 @@ requires-python = ">=3.12,<3.13" description = "Make ZIM file from Mindtouch / Nice CXone Expert libraries" readme = "../README.md" dependencies = [ - # use zimscraperlib pinned version once content rewriting functions have been released - "zimscraperlib @ git+https://github.com/openzim/python-scraperlib@main", + "zimscraperlib==5.0.0rc4", "requests==2.32.3", "types-requests==2.32.0.20241016", "kiwixstorage==0.9.0", @@ -26,9 +25,6 @@ dependencies = [ ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] -[tool.hatch.metadata] -allow-direct-references = true - [tool.hatch.metadata.hooks.openzim-metadata] kind = "scraper" additional-keywords = ["mindouch", "nice", "cxone", "expert"] diff --git a/scraper/src/mindtouch2zim/asset.py b/scraper/src/mindtouch2zim/asset.py index 6c85291..27b6b80 100644 --- a/scraper/src/mindtouch2zim/asset.py +++ b/scraper/src/mindtouch2zim/asset.py @@ -242,14 +242,7 @@ def _get_image_content( ).save(optimized, format="WEBP") del unoptimized - optimize_webp( - src=optimized, - quality=WEBP_OPTIONS.get("quality"), # pyright: ignore[reportArgumentType] - method=WEBP_OPTIONS.get("method"), # pyright: ignore[reportArgumentType] - lossless=WEBP_OPTIONS.get( - "lossless" - ), # pyright: ignore[reportArgumentType] - ) + optimize_webp(src=optimized, options=WEBP_OPTIONS) if context.s3_url_with_credentials: # upload optimized to S3 From 01354fd9fb915e908beb5ef27d0d61e898a4c057 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 9 Jan 2025 10:01:18 +0000 Subject: [PATCH 2/4] Cleanup unused classes --- .../libretexts/detailed_licensing.py | 22 ------------------- 1 file changed, 22 deletions(-) diff --git a/scraper/src/mindtouch2zim/libretexts/detailed_licensing.py b/scraper/src/mindtouch2zim/libretexts/detailed_licensing.py index ab4bf77..93923a7 100644 --- a/scraper/src/mindtouch2zim/libretexts/detailed_licensing.py +++ b/scraper/src/mindtouch2zim/libretexts/detailed_licensing.py @@ -1,7 +1,6 @@ from typing import Any from jinja2 import Template -from pydantic import BaseModel from zimscraperlib.rewriting.html import HtmlRewriter from mindtouch2zim.client import LibraryPage, MindtouchClient @@ -12,27 +11,6 @@ logger = context.logger -class LicenseStatistic(BaseModel): - label: str - version: str | None - percent: float - count: int - link: str - - -class LicenseInfo(BaseModel): - statistics: list[LicenseStatistic] - details: list - - -class PageInfo(BaseModel): - license_label: str - license_version: str - url: str - title: str - children: list["PageInfo"] - - def _get_licensing_report_data(cover_url: str) -> Any: """ Get licensing report from libretexts.org From 1e042ce5d1699f66b081e4a088ec494c52db229b Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 9 Jan 2025 10:01:43 +0000 Subject: [PATCH 3/4] Activate strict type checking mode --- scraper/pyproject.toml | 2 +- scraper/src/mindtouch2zim/asset.py | 13 +++++++++---- scraper/src/mindtouch2zim/client.py | 5 ++++- scraper/src/mindtouch2zim/context.py | 9 +++++---- scraper/src/mindtouch2zim/processor.py | 8 ++++++-- scraper/tests/__init__.py | 3 ++- scraper/tests/test_asset.py | 2 +- scraper/tests/test_context.py | 4 +++- scraper/tests/test_entrypoint.py | 2 +- scraper/tests/test_processor.py | 2 +- 10 files changed, 33 insertions(+), 17 deletions(-) diff --git a/scraper/pyproject.toml b/scraper/pyproject.toml index 0ffaaf0..052da8d 100644 --- a/scraper/pyproject.toml +++ b/scraper/pyproject.toml @@ -224,5 +224,5 @@ include = ["src", "tests", "tasks.py"] exclude = [".env/**", ".venv/**", "src/mindtouch2zim/templates", ".hatch"] extraPaths = ["src"] pythonVersion = "3.12" -typeCheckingMode = "basic" +typeCheckingMode = "strict" disableBytesTypePromotions = true diff --git a/scraper/src/mindtouch2zim/asset.py b/scraper/src/mindtouch2zim/asset.py index 27b6b80..7b6c482 100644 --- a/scraper/src/mindtouch2zim/asset.py +++ b/scraper/src/mindtouch2zim/asset.py @@ -7,11 +7,16 @@ from urllib.parse import urlsplit import backoff -from kiwixstorage import KiwixStorage, NotFoundError -from pif import get_public_ip +from kiwixstorage import ( # pyright: ignore[reportMissingTypeStubs] + KiwixStorage, + NotFoundError, +) +from pif import ( # pyright: ignore[reportMissingTypeStubs] + get_public_ip, # pyright: ignore[reportUnknownVariableType] +) from PIL import Image from requests.exceptions import RequestException -from resizeimage import resizeimage +from resizeimage import resizeimage # pyright: ignore[reportMissingTypeStubs] from zimscraperlib.image.optimization import optimize_webp from zimscraperlib.image.presets import WebpMedium from zimscraperlib.rewriting.url_rewriting import HttpUrl, ZimPath @@ -232,7 +237,7 @@ def _get_image_content( if image.width * image.height <= context.maximum_image_pixels: image.save(optimized, format="WEBP") else: - resizeimage.resize_width( + resizeimage.resize_width( # pyright: ignore[reportUnknownMemberType] image, int( math.sqrt( diff --git a/scraper/src/mindtouch2zim/client.py b/scraper/src/mindtouch2zim/client.py index fa6d227..1092218 100644 --- a/scraper/src/mindtouch2zim/client.py +++ b/scraper/src/mindtouch2zim/client.py @@ -327,7 +327,10 @@ def get_page_definition(self, page: LibraryPage | str) -> LibraryPageDefinition: if raw_tag is None: raise MindtouchParsingError(f"No tag property for page {page_id}") if isinstance(raw_tag, list): - tags = [item.get("@value") for item in raw_tag] + tags: list[Any] = [ + item.get("@value") # pyright: ignore[reportUnknownMemberType] + for item in raw_tag # pyright: ignore[reportUnknownVariableType] + ] else: tags = [raw_tag.get("@value")] diff --git a/scraper/src/mindtouch2zim/context.py b/scraper/src/mindtouch2zim/context.py index 6b7fbae..68f8c83 100644 --- a/scraper/src/mindtouch2zim/context.py +++ b/scraper/src/mindtouch2zim/context.py @@ -4,6 +4,7 @@ import re import threading from pathlib import Path +from typing import Any import requests from zimscraperlib.constants import NAME as SCRAPERLIB_NAME @@ -64,7 +65,7 @@ class Context: assets_workers: int = 10 # known bad assets - bad_assets_regex: re.Pattern = re.compile(STANDARD_KNOWN_BAD_ASSETS_REGEX) + bad_assets_regex: re.Pattern[str] = re.compile(STANDARD_KNOWN_BAD_ASSETS_REGEX) # maximum amount of bad assets bad_assets_threshold: int = 10 @@ -99,9 +100,9 @@ class Context: secondary_color: str = "#FFFFFF" # Content filters - page_title_include: re.Pattern | None = None + page_title_include: re.Pattern[str] | None = None page_id_include: list[str] | None = None - page_title_exclude: re.Pattern | None = None + page_title_exclude: re.Pattern[str] | None = None root_page_id: str | None = None # Maximum number of pixels of images that will be pushed to the ZIM @@ -114,7 +115,7 @@ class Context: ) @classmethod - def setup(cls, **kwargs): + def setup(cls, **kwargs: Any): new_instance = cls(**kwargs) if cls._instance: # replace values 'in-place' so that we do not change the Context object diff --git a/scraper/src/mindtouch2zim/processor.py b/scraper/src/mindtouch2zim/processor.py index 8edacd4..10313dd 100644 --- a/scraper/src/mindtouch2zim/processor.py +++ b/scraper/src/mindtouch2zim/processor.py @@ -5,10 +5,14 @@ from http import HTTPStatus from io import BytesIO from pathlib import Path +from typing import Any import backoff from jinja2 import Environment, FileSystemLoader, select_autoescape -from joblib import Parallel, delayed +from joblib import ( # pyright: ignore[reportMissingTypeStubs] + Parallel, + delayed, # pyright: ignore[reportUnknownVariableType] +) from pydantic import BaseModel from requests import RequestException from requests.exceptions import HTTPError @@ -462,7 +466,7 @@ def run_with_creator(self, creator: Creator): context.current_thread_workitem = "assets" self.stats_items_total += len(self.asset_manager.assets) - res = self.asset_executor( + res: Any = self.asset_executor( delayed(self.asset_processor.process_asset)( asset_path, asset_details, creator ) diff --git a/scraper/tests/__init__.py b/scraper/tests/__init__.py index 0b79efa..d3b4f9e 100644 --- a/scraper/tests/__init__.py +++ b/scraper/tests/__init__.py @@ -1,10 +1,11 @@ import threading +from typing import Any from zimscraperlib.download import get_session from mindtouch2zim.context import Context -CONTEXT_DEFAULTS = { +CONTEXT_DEFAULTS: dict[str, Any] = { "web_session": get_session(), "tmp_folder": None, "cache_folder": None, diff --git a/scraper/tests/test_asset.py b/scraper/tests/test_asset.py index 8f21d62..e10423f 100644 --- a/scraper/tests/test_asset.py +++ b/scraper/tests/test_asset.py @@ -178,7 +178,7 @@ def test_get_mime_type( ): assert ( - processor._get_mime_type( + processor._get_mime_type( # pyright: ignore[reportPrivateUsage] header_data=HeaderData(ident="foo", content_type=header_content_type), asset_url=HttpUrl( "https://www.acme.com/xenolith-of-diorite.jpg?revision=1" diff --git a/scraper/tests/test_context.py b/scraper/tests/test_context.py index 244dffe..4fc5384 100644 --- a/scraper/tests/test_context.py +++ b/scraper/tests/test_context.py @@ -1,3 +1,5 @@ +from typing import Any + import pytest from mindtouch2zim.context import Context @@ -28,7 +30,7 @@ def test_context_defaults(): assert context.current_thread_workitem == "context 123" -def test_context_setup_again(context_defaults): +def test_context_setup_again(context_defaults: dict[str, Any]): settings = context_defaults.copy() settings["title"] = "A title" Context.setup(**settings) diff --git a/scraper/tests/test_entrypoint.py b/scraper/tests/test_entrypoint.py index 68f8fd9..b3f72cf 100644 --- a/scraper/tests/test_entrypoint.py +++ b/scraper/tests/test_entrypoint.py @@ -316,7 +316,7 @@ def test_entrypoint_regex_args( prepare_context( ([*good_cli_args, arg_name, arg_value] if arg_name else good_cli_args), tmpdir ) - regex: re.Pattern = context.__getattribute__(context_name) + regex: re.Pattern[str] = context.__getattribute__(context_name) for match in expected_match: assert regex.findall(match) for match in expected_no_match: diff --git a/scraper/tests/test_processor.py b/scraper/tests/test_processor.py index 0852c1f..94b14f9 100644 --- a/scraper/tests/test_processor.py +++ b/scraper/tests/test_processor.py @@ -12,7 +12,7 @@ def dummy_encoded_url() -> str: @pytest.fixture(scope="module") -def library_tree(dummy_encoded_url) -> LibraryTree: +def library_tree(dummy_encoded_url: str) -> LibraryTree: root = LibraryPage( id="24", title="Home page", path="", encoded_url=dummy_encoded_url ) From ddb65cacb4888429809681ab38b6d0c108d0b9d0 Mon Sep 17 00:00:00 2001 From: benoit74 Date: Thu, 9 Jan 2025 10:56:54 +0000 Subject: [PATCH 4/4] Make test of Wikimedia User-Agent more exact/permissive --- scraper/tests/test_context.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scraper/tests/test_context.py b/scraper/tests/test_context.py index 4fc5384..c63dd53 100644 --- a/scraper/tests/test_context.py +++ b/scraper/tests/test_context.py @@ -1,3 +1,4 @@ +import re from typing import Any import pytest @@ -22,9 +23,9 @@ def test_context_defaults(): context = Context.get() assert context == processor_context # check both objects are same assert context.assets_workers == 10 - assert ( # check getter logic - context.wm_user_agent - == "mindtouch2zim/0.1.0-dev0 (https://www.kiwix.org) zimscraperlib/5.0.0-dev0" + assert re.match( # check getter logic + r"mindtouch2zim\/.* \(https:\/\/www\.kiwix\.org\) zimscraperlib\/.*", + context.wm_user_agent, ) context.current_thread_workitem = "context 123" assert context.current_thread_workitem == "context 123"