Skip to content

Commit

Permalink
Merge pull request #125 from openzim/scraperlib5rc1
Browse files: browse the repository at this point in the history
Use scraperlib 5.0.0rc4
  • Loading branch information
benoit74 authored Jan 9, 2025
2 parents 9b861f5 + ddb65ca commit 372694a
Show file tree
Hide file tree
Showing 11 changed files with 39 additions and 55 deletions.
8 changes: 2 additions & 6 deletions scraper/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@ requires-python = ">=3.12,<3.13"
description = "Make ZIM file from Mindtouch / Nice CXone Expert libraries"
readme = "../README.md"
dependencies = [
# use zimscraperlib pinned version once content rewriting functions have been released
"zimscraperlib @ git+https://github.com/openzim/python-scraperlib@main",
"zimscraperlib==5.0.0rc4",
"requests==2.32.3",
"types-requests==2.32.0.20241016",
"kiwixstorage==0.9.0",
Expand All @@ -26,9 +25,6 @@ dependencies = [
]
dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

[tool.hatch.metadata]
allow-direct-references = true

[tool.hatch.metadata.hooks.openzim-metadata]
kind = "scraper"
additional-keywords = ["mindouch", "nice", "cxone", "expert"]
Expand Down Expand Up @@ -228,5 +224,5 @@ include = ["src", "tests", "tasks.py"]
exclude = [".env/**", ".venv/**", "src/mindtouch2zim/templates", ".hatch"]
extraPaths = ["src"]
pythonVersion = "3.12"
typeCheckingMode = "basic"
typeCheckingMode = "strict"
disableBytesTypePromotions = true
22 changes: 10 additions & 12 deletions scraper/src/mindtouch2zim/asset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,16 @@
from urllib.parse import urlsplit

import backoff
from kiwixstorage import KiwixStorage, NotFoundError
from pif import get_public_ip
from kiwixstorage import ( # pyright: ignore[reportMissingTypeStubs]
KiwixStorage,
NotFoundError,
)
from pif import ( # pyright: ignore[reportMissingTypeStubs]
get_public_ip, # pyright: ignore[reportUnknownVariableType]
)
from PIL import Image
from requests.exceptions import RequestException
from resizeimage import resizeimage
from resizeimage import resizeimage # pyright: ignore[reportMissingTypeStubs]
from zimscraperlib.image.optimization import optimize_webp
from zimscraperlib.image.presets import WebpMedium
from zimscraperlib.rewriting.url_rewriting import HttpUrl, ZimPath
Expand Down Expand Up @@ -232,7 +237,7 @@ def _get_image_content(
if image.width * image.height <= context.maximum_image_pixels:
image.save(optimized, format="WEBP")
else:
resizeimage.resize_width(
resizeimage.resize_width( # pyright: ignore[reportUnknownMemberType]
image,
int(
math.sqrt(
Expand All @@ -242,14 +247,7 @@ def _get_image_content(
).save(optimized, format="WEBP")
del unoptimized

optimize_webp(
src=optimized,
quality=WEBP_OPTIONS.get("quality"), # pyright: ignore[reportArgumentType]
method=WEBP_OPTIONS.get("method"), # pyright: ignore[reportArgumentType]
lossless=WEBP_OPTIONS.get(
"lossless"
), # pyright: ignore[reportArgumentType]
)
optimize_webp(src=optimized, options=WEBP_OPTIONS)

if context.s3_url_with_credentials:
# upload optimized to S3
Expand Down
5 changes: 4 additions & 1 deletion scraper/src/mindtouch2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,10 @@ def get_page_definition(self, page: LibraryPage | str) -> LibraryPageDefinition:
if raw_tag is None:
raise MindtouchParsingError(f"No tag property for page {page_id}")
if isinstance(raw_tag, list):
tags = [item.get("@value") for item in raw_tag]
tags: list[Any] = [
item.get("@value") # pyright: ignore[reportUnknownMemberType]
for item in raw_tag # pyright: ignore[reportUnknownVariableType]
]
else:
tags = [raw_tag.get("@value")]

Expand Down
9 changes: 5 additions & 4 deletions scraper/src/mindtouch2zim/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import re
import threading
from pathlib import Path
from typing import Any

import requests
from zimscraperlib.constants import NAME as SCRAPERLIB_NAME
Expand Down Expand Up @@ -64,7 +65,7 @@ class Context:
assets_workers: int = 10

# known bad assets
bad_assets_regex: re.Pattern = re.compile(STANDARD_KNOWN_BAD_ASSETS_REGEX)
bad_assets_regex: re.Pattern[str] = re.compile(STANDARD_KNOWN_BAD_ASSETS_REGEX)

# maximum amount of bad assets
bad_assets_threshold: int = 10
Expand Down Expand Up @@ -99,9 +100,9 @@ class Context:
secondary_color: str = "#FFFFFF"

# Content filters
page_title_include: re.Pattern | None = None
page_title_include: re.Pattern[str] | None = None
page_id_include: list[str] | None = None
page_title_exclude: re.Pattern | None = None
page_title_exclude: re.Pattern[str] | None = None
root_page_id: str | None = None

# Maximum number of pixels of images that will be pushed to the ZIM
Expand All @@ -114,7 +115,7 @@ class Context:
)

@classmethod
def setup(cls, **kwargs):
def setup(cls, **kwargs: Any):
new_instance = cls(**kwargs)
if cls._instance:
# replace values 'in-place' so that we do not change the Context object
Expand Down
22 changes: 0 additions & 22 deletions scraper/src/mindtouch2zim/libretexts/detailed_licensing.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import Any

from jinja2 import Template
from pydantic import BaseModel
from zimscraperlib.rewriting.html import HtmlRewriter

from mindtouch2zim.client import LibraryPage, MindtouchClient
Expand All @@ -12,27 +11,6 @@
logger = context.logger


class LicenseStatistic(BaseModel):
label: str
version: str | None
percent: float
count: int
link: str


class LicenseInfo(BaseModel):
statistics: list[LicenseStatistic]
details: list


class PageInfo(BaseModel):
license_label: str
license_version: str
url: str
title: str
children: list["PageInfo"]


def _get_licensing_report_data(cover_url: str) -> Any:
"""
Get licensing report from libretexts.org
Expand Down
8 changes: 6 additions & 2 deletions scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,14 @@
from http import HTTPStatus
from io import BytesIO
from pathlib import Path
from typing import Any

import backoff
from jinja2 import Environment, FileSystemLoader, select_autoescape
from joblib import Parallel, delayed
from joblib import ( # pyright: ignore[reportMissingTypeStubs]
Parallel,
delayed, # pyright: ignore[reportUnknownVariableType]
)
from pydantic import BaseModel
from requests import RequestException
from requests.exceptions import HTTPError
Expand Down Expand Up @@ -462,7 +466,7 @@ def run_with_creator(self, creator: Creator):
context.current_thread_workitem = "assets"
self.stats_items_total += len(self.asset_manager.assets)

res = self.asset_executor(
res: Any = self.asset_executor(
delayed(self.asset_processor.process_asset)(
asset_path, asset_details, creator
)
Expand Down
3 changes: 2 additions & 1 deletion scraper/tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import threading
from typing import Any

from zimscraperlib.download import get_session

from mindtouch2zim.context import Context

CONTEXT_DEFAULTS = {
CONTEXT_DEFAULTS: dict[str, Any] = {
"web_session": get_session(),
"tmp_folder": None,
"cache_folder": None,
Expand Down
2 changes: 1 addition & 1 deletion scraper/tests/test_asset.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def test_get_mime_type(
):

assert (
processor._get_mime_type(
processor._get_mime_type( # pyright: ignore[reportPrivateUsage]
header_data=HeaderData(ident="foo", content_type=header_content_type),
asset_url=HttpUrl(
"https://www.acme.com/xenolith-of-diorite.jpg?revision=1"
Expand Down
11 changes: 7 additions & 4 deletions scraper/tests/test_context.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import re
from typing import Any

import pytest

from mindtouch2zim.context import Context
Expand All @@ -20,15 +23,15 @@ def test_context_defaults():
context = Context.get()
assert context == processor_context # check both objects are same
assert context.assets_workers == 10
assert ( # check getter logic
context.wm_user_agent
== "mindtouch2zim/0.1.0-dev0 (https://www.kiwix.org) zimscraperlib/5.0.0-dev0"
assert re.match( # check getter logic
r"mindtouch2zim\/.* \(https:\/\/www\.kiwix\.org\) zimscraperlib\/.*",
context.wm_user_agent,
)
context.current_thread_workitem = "context 123"
assert context.current_thread_workitem == "context 123"


def test_context_setup_again(context_defaults):
def test_context_setup_again(context_defaults: dict[str, Any]):
settings = context_defaults.copy()
settings["title"] = "A title"
Context.setup(**settings)
Expand Down
2 changes: 1 addition & 1 deletion scraper/tests/test_entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ def test_entrypoint_regex_args(
prepare_context(
([*good_cli_args, arg_name, arg_value] if arg_name else good_cli_args), tmpdir
)
regex: re.Pattern = context.__getattribute__(context_name)
regex: re.Pattern[str] = context.__getattribute__(context_name)
for match in expected_match:
assert regex.findall(match)
for match in expected_no_match:
Expand Down
2 changes: 1 addition & 1 deletion scraper/tests/test_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def dummy_encoded_url() -> str:


@pytest.fixture(scope="module")
def library_tree(dummy_encoded_url) -> LibraryTree:
def library_tree(dummy_encoded_url: str) -> LibraryTree:
root = LibraryPage(
id="24", title="Home page", path="", encoded_url=dummy_encoded_url
)
Expand Down

0 comments on commit 372694a

Please sign in to comment.