diff --git a/scraper/pyproject.toml b/scraper/pyproject.toml index e90136b..052da8d 100644 --- a/scraper/pyproject.toml +++ b/scraper/pyproject.toml @@ -8,8 +8,7 @@ requires-python = ">=3.12,<3.13" description = "Make ZIM file from Mindtouch / Nice CXone Expert libraries" readme = "../README.md" dependencies = [ - # use zimscraperlib pinned version once content rewriting functions have been released - "zimscraperlib @ git+https://github.com/openzim/python-scraperlib@main", + "zimscraperlib==5.0.0rc4", "requests==2.32.3", "types-requests==2.32.0.20241016", "kiwixstorage==0.9.0", @@ -26,9 +25,6 @@ dependencies = [ ] dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"] -[tool.hatch.metadata] -allow-direct-references = true - [tool.hatch.metadata.hooks.openzim-metadata] kind = "scraper" additional-keywords = ["mindouch", "nice", "cxone", "expert"] @@ -228,5 +224,5 @@ include = ["src", "tests", "tasks.py"] exclude = [".env/**", ".venv/**", "src/mindtouch2zim/templates", ".hatch"] extraPaths = ["src"] pythonVersion = "3.12" -typeCheckingMode = "basic" +typeCheckingMode = "strict" disableBytesTypePromotions = true diff --git a/scraper/src/mindtouch2zim/asset.py b/scraper/src/mindtouch2zim/asset.py index 6c85291..7b6c482 100644 --- a/scraper/src/mindtouch2zim/asset.py +++ b/scraper/src/mindtouch2zim/asset.py @@ -7,11 +7,16 @@ from urllib.parse import urlsplit import backoff -from kiwixstorage import KiwixStorage, NotFoundError -from pif import get_public_ip +from kiwixstorage import ( # pyright: ignore[reportMissingTypeStubs] + KiwixStorage, + NotFoundError, +) +from pif import ( # pyright: ignore[reportMissingTypeStubs] + get_public_ip, # pyright: ignore[reportUnknownVariableType] +) from PIL import Image from requests.exceptions import RequestException -from resizeimage import resizeimage +from resizeimage import resizeimage # pyright: ignore[reportMissingTypeStubs] from zimscraperlib.image.optimization import optimize_webp from zimscraperlib.image.presets import WebpMedium from zimscraperlib.rewriting.url_rewriting import HttpUrl, ZimPath @@ -232,7 +237,7 @@ def _get_image_content( if image.width * image.height <= context.maximum_image_pixels: image.save(optimized, format="WEBP") else: - resizeimage.resize_width( + resizeimage.resize_width( # pyright: ignore[reportUnknownMemberType] image, int( math.sqrt( @@ -242,14 +247,7 @@ def _get_image_content( ).save(optimized, format="WEBP") del unoptimized - optimize_webp( - src=optimized, - quality=WEBP_OPTIONS.get("quality"), # pyright: ignore[reportArgumentType] - method=WEBP_OPTIONS.get("method"), # pyright: ignore[reportArgumentType] - lossless=WEBP_OPTIONS.get( - "lossless" - ), # pyright: ignore[reportArgumentType] - ) + optimize_webp(src=optimized, options=WEBP_OPTIONS) if context.s3_url_with_credentials: # upload optimized to S3 diff --git a/scraper/src/mindtouch2zim/client.py b/scraper/src/mindtouch2zim/client.py index fa6d227..1092218 100644 --- a/scraper/src/mindtouch2zim/client.py +++ b/scraper/src/mindtouch2zim/client.py @@ -327,7 +327,10 @@ def get_page_definition(self, page: LibraryPage | str) -> LibraryPageDefinition: if raw_tag is None: raise MindtouchParsingError(f"No tag property for page {page_id}") if isinstance(raw_tag, list): - tags = [item.get("@value") for item in raw_tag] + tags: list[Any] = [ + item.get("@value") # pyright: ignore[reportUnknownMemberType] + for item in raw_tag # pyright: ignore[reportUnknownVariableType] + ] else: tags = [raw_tag.get("@value")] diff --git a/scraper/src/mindtouch2zim/context.py b/scraper/src/mindtouch2zim/context.py index 6b7fbae..68f8c83 100644 --- a/scraper/src/mindtouch2zim/context.py +++ b/scraper/src/mindtouch2zim/context.py @@ -4,6 +4,7 @@ import re import threading from pathlib import Path +from typing import Any import requests from zimscraperlib.constants import NAME as SCRAPERLIB_NAME @@ -64,7 +65,7 @@ class Context: assets_workers: int = 10 # known bad assets - bad_assets_regex: re.Pattern = re.compile(STANDARD_KNOWN_BAD_ASSETS_REGEX) + bad_assets_regex: re.Pattern[str] = re.compile(STANDARD_KNOWN_BAD_ASSETS_REGEX) # maximum amount of bad assets bad_assets_threshold: int = 10 @@ -99,9 +100,9 @@ class Context: secondary_color: str = "#FFFFFF" # Content filters - page_title_include: re.Pattern | None = None + page_title_include: re.Pattern[str] | None = None page_id_include: list[str] | None = None - page_title_exclude: re.Pattern | None = None + page_title_exclude: re.Pattern[str] | None = None root_page_id: str | None = None # Maximum number of pixels of images that will be pushed to the ZIM @@ -114,7 +115,7 @@ class Context: ) @classmethod - def setup(cls, **kwargs): + def setup(cls, **kwargs: Any): new_instance = cls(**kwargs) if cls._instance: # replace values 'in-place' so that we do not change the Context object diff --git a/scraper/src/mindtouch2zim/libretexts/detailed_licensing.py b/scraper/src/mindtouch2zim/libretexts/detailed_licensing.py index ab4bf77..93923a7 100644 --- a/scraper/src/mindtouch2zim/libretexts/detailed_licensing.py +++ b/scraper/src/mindtouch2zim/libretexts/detailed_licensing.py @@ -1,7 +1,6 @@ from typing import Any from jinja2 import Template -from pydantic import BaseModel from zimscraperlib.rewriting.html import HtmlRewriter from mindtouch2zim.client import LibraryPage, MindtouchClient @@ -12,27 +11,6 @@ logger = context.logger -class LicenseStatistic(BaseModel): - label: str - version: str | None - percent: float - count: int - link: str - - -class LicenseInfo(BaseModel): - statistics: list[LicenseStatistic] - details: list - - -class PageInfo(BaseModel): - license_label: str - license_version: str - url: str - title: str - children: list["PageInfo"] - - def _get_licensing_report_data(cover_url: str) -> Any: """ Get licensing report from libretexts.org diff --git a/scraper/src/mindtouch2zim/processor.py b/scraper/src/mindtouch2zim/processor.py index 8edacd4..10313dd 100644 --- a/scraper/src/mindtouch2zim/processor.py +++ b/scraper/src/mindtouch2zim/processor.py @@ -5,10 +5,14 @@ from http import HTTPStatus from io import BytesIO from pathlib import Path +from typing import Any import backoff from jinja2 import Environment, FileSystemLoader, select_autoescape -from joblib import Parallel, delayed +from joblib import ( # pyright: ignore[reportMissingTypeStubs] + Parallel, + delayed, # pyright: ignore[reportUnknownVariableType] +) from pydantic import BaseModel from requests import RequestException from requests.exceptions import HTTPError @@ -462,7 +466,7 @@ def run_with_creator(self, creator: Creator): context.current_thread_workitem = "assets" self.stats_items_total += len(self.asset_manager.assets) - res = self.asset_executor( + res: Any = self.asset_executor( delayed(self.asset_processor.process_asset)( asset_path, asset_details, creator ) diff --git a/scraper/tests/__init__.py b/scraper/tests/__init__.py index 0b79efa..d3b4f9e 100644 --- a/scraper/tests/__init__.py +++ b/scraper/tests/__init__.py @@ -1,10 +1,11 @@ import threading +from typing import Any from zimscraperlib.download import get_session from mindtouch2zim.context import Context -CONTEXT_DEFAULTS = { +CONTEXT_DEFAULTS: dict[str, Any] = { "web_session": get_session(), "tmp_folder": None, "cache_folder": None, diff --git a/scraper/tests/test_asset.py b/scraper/tests/test_asset.py index 8f21d62..e10423f 100644 --- a/scraper/tests/test_asset.py +++ b/scraper/tests/test_asset.py @@ -178,7 +178,7 @@ def test_get_mime_type( ): assert ( - processor._get_mime_type( + processor._get_mime_type( # pyright: ignore[reportPrivateUsage] header_data=HeaderData(ident="foo", content_type=header_content_type), asset_url=HttpUrl( "https://www.acme.com/xenolith-of-diorite.jpg?revision=1" diff --git a/scraper/tests/test_context.py b/scraper/tests/test_context.py index 244dffe..c63dd53 100644 --- a/scraper/tests/test_context.py +++ b/scraper/tests/test_context.py @@ -1,3 +1,6 @@ +import re +from typing import Any + import pytest from mindtouch2zim.context import Context @@ -20,15 +23,15 @@ def test_context_defaults(): context = Context.get() assert context == processor_context # check both objects are same assert context.assets_workers == 10 - assert ( # check getter logic - context.wm_user_agent - == "mindtouch2zim/0.1.0-dev0 (https://www.kiwix.org) zimscraperlib/5.0.0-dev0" + assert re.match( # check getter logic + r"mindtouch2zim\/.* \(https:\/\/www\.kiwix\.org\) zimscraperlib\/.*", + context.wm_user_agent, ) context.current_thread_workitem = "context 123" assert context.current_thread_workitem == "context 123" -def test_context_setup_again(context_defaults): +def test_context_setup_again(context_defaults: dict[str, Any]): settings = context_defaults.copy() settings["title"] = "A title" Context.setup(**settings) diff --git a/scraper/tests/test_entrypoint.py b/scraper/tests/test_entrypoint.py index 68f8fd9..b3f72cf 100644 --- a/scraper/tests/test_entrypoint.py +++ b/scraper/tests/test_entrypoint.py @@ -316,7 +316,7 @@ def test_entrypoint_regex_args( prepare_context( ([*good_cli_args, arg_name, arg_value] if arg_name else good_cli_args), tmpdir ) - regex: re.Pattern = context.__getattribute__(context_name) + regex: re.Pattern[str] = context.__getattribute__(context_name) for match in expected_match: assert regex.findall(match) for match in expected_no_match: diff --git a/scraper/tests/test_processor.py b/scraper/tests/test_processor.py index 0852c1f..94b14f9 100644 --- a/scraper/tests/test_processor.py +++ b/scraper/tests/test_processor.py @@ -12,7 +12,7 @@ def dummy_encoded_url() -> str: @pytest.fixture(scope="module") -def library_tree(dummy_encoded_url) -> LibraryTree: +def library_tree(dummy_encoded_url: str) -> LibraryTree: root = LibraryPage( id="24", title="Home page", path="", encoded_url=dummy_encoded_url )