Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use scraperlib 5.0.0rc4 #125

Merged
merged 4 commits on Jan 9, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 2 additions & 6 deletions scraper/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@ requires-python = ">=3.12,<3.13"
description = "Make ZIM file from Mindtouch / Nice CXone Expert libraries"
readme = "../README.md"
dependencies = [
# use zimscraperlib pinned version once content rewriting functions have been released
"zimscraperlib @ git+https://github.com/openzim/python-scraperlib@main",
"zimscraperlib==5.0.0rc4",
"requests==2.32.3",
"types-requests==2.32.0.20241016",
"kiwixstorage==0.9.0",
Expand All @@ -26,9 +25,6 @@ dependencies = [
]
dynamic = ["authors", "classifiers", "keywords", "license", "version", "urls"]

[tool.hatch.metadata]
allow-direct-references = true

[tool.hatch.metadata.hooks.openzim-metadata]
kind = "scraper"
additional-keywords = ["mindtouch", "nice", "cxone", "expert"]
Expand Down Expand Up @@ -228,5 +224,5 @@ include = ["src", "tests", "tasks.py"]
exclude = [".env/**", ".venv/**", "src/mindtouch2zim/templates", ".hatch"]
extraPaths = ["src"]
pythonVersion = "3.12"
typeCheckingMode = "basic"
typeCheckingMode = "strict"
disableBytesTypePromotions = true
22 changes: 10 additions & 12 deletions scraper/src/mindtouch2zim/asset.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,16 @@
from urllib.parse import urlsplit

import backoff
from kiwixstorage import KiwixStorage, NotFoundError
from pif import get_public_ip
from kiwixstorage import ( # pyright: ignore[reportMissingTypeStubs]
KiwixStorage,
NotFoundError,
)
from pif import ( # pyright: ignore[reportMissingTypeStubs]
get_public_ip, # pyright: ignore[reportUnknownVariableType]
)
from PIL import Image
from requests.exceptions import RequestException
from resizeimage import resizeimage
from resizeimage import resizeimage # pyright: ignore[reportMissingTypeStubs]
from zimscraperlib.image.optimization import optimize_webp
from zimscraperlib.image.presets import WebpMedium
from zimscraperlib.rewriting.url_rewriting import HttpUrl, ZimPath
Expand Down Expand Up @@ -232,7 +237,7 @@
if image.width * image.height <= context.maximum_image_pixels:
image.save(optimized, format="WEBP")
else:
resizeimage.resize_width(
resizeimage.resize_width( # pyright: ignore[reportUnknownMemberType]

Check warning on line 240 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L240

Added line #L240 was not covered by tests
image,
int(
math.sqrt(
Expand All @@ -242,14 +247,7 @@
).save(optimized, format="WEBP")
del unoptimized

optimize_webp(
src=optimized,
quality=WEBP_OPTIONS.get("quality"), # pyright: ignore[reportArgumentType]
method=WEBP_OPTIONS.get("method"), # pyright: ignore[reportArgumentType]
lossless=WEBP_OPTIONS.get(
"lossless"
), # pyright: ignore[reportArgumentType]
)
optimize_webp(src=optimized, options=WEBP_OPTIONS)

Check warning on line 250 in scraper/src/mindtouch2zim/asset.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/asset.py#L250

Added line #L250 was not covered by tests

if context.s3_url_with_credentials:
# upload optimized to S3
Expand Down
5 changes: 4 additions & 1 deletion scraper/src/mindtouch2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,10 @@
if raw_tag is None:
raise MindtouchParsingError(f"No tag property for page {page_id}")
if isinstance(raw_tag, list):
tags = [item.get("@value") for item in raw_tag]
tags: list[Any] = [

Check warning on line 330 in scraper/src/mindtouch2zim/client.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/client.py#L330

Added line #L330 was not covered by tests
item.get("@value") # pyright: ignore[reportUnknownMemberType]
for item in raw_tag # pyright: ignore[reportUnknownVariableType]
]
else:
tags = [raw_tag.get("@value")]

Expand Down
9 changes: 5 additions & 4 deletions scraper/src/mindtouch2zim/context.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import re
import threading
from pathlib import Path
from typing import Any

import requests
from zimscraperlib.constants import NAME as SCRAPERLIB_NAME
Expand Down Expand Up @@ -64,7 +65,7 @@ class Context:
assets_workers: int = 10

# known bad assets
bad_assets_regex: re.Pattern = re.compile(STANDARD_KNOWN_BAD_ASSETS_REGEX)
bad_assets_regex: re.Pattern[str] = re.compile(STANDARD_KNOWN_BAD_ASSETS_REGEX)

# maximum amount of bad assets
bad_assets_threshold: int = 10
Expand Down Expand Up @@ -99,9 +100,9 @@ class Context:
secondary_color: str = "#FFFFFF"

# Content filters
page_title_include: re.Pattern | None = None
page_title_include: re.Pattern[str] | None = None
page_id_include: list[str] | None = None
page_title_exclude: re.Pattern | None = None
page_title_exclude: re.Pattern[str] | None = None
root_page_id: str | None = None

# Maximum number of pixels of images that will be pushed to the ZIM
Expand All @@ -114,7 +115,7 @@ class Context:
)

@classmethod
def setup(cls, **kwargs):
def setup(cls, **kwargs: Any):
new_instance = cls(**kwargs)
if cls._instance:
# replace values 'in-place' so that we do not change the Context object
Expand Down
22 changes: 0 additions & 22 deletions scraper/src/mindtouch2zim/libretexts/detailed_licensing.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
from typing import Any

from jinja2 import Template
from pydantic import BaseModel
from zimscraperlib.rewriting.html import HtmlRewriter

from mindtouch2zim.client import LibraryPage, MindtouchClient
Expand All @@ -12,27 +11,6 @@
logger = context.logger


class LicenseStatistic(BaseModel):
label: str
version: str | None
percent: float
count: int
link: str


class LicenseInfo(BaseModel):
statistics: list[LicenseStatistic]
details: list


class PageInfo(BaseModel):
license_label: str
license_version: str
url: str
title: str
children: list["PageInfo"]


def _get_licensing_report_data(cover_url: str) -> Any:
"""
Get licensing report from libretexts.org
Expand Down
8 changes: 6 additions & 2 deletions scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,14 @@
from http import HTTPStatus
from io import BytesIO
from pathlib import Path
from typing import Any

import backoff
from jinja2 import Environment, FileSystemLoader, select_autoescape
from joblib import Parallel, delayed
from joblib import ( # pyright: ignore[reportMissingTypeStubs]
Parallel,
delayed, # pyright: ignore[reportUnknownVariableType]
)
from pydantic import BaseModel
from requests import RequestException
from requests.exceptions import HTTPError
Expand Down Expand Up @@ -462,7 +466,7 @@
context.current_thread_workitem = "assets"
self.stats_items_total += len(self.asset_manager.assets)

res = self.asset_executor(
res: Any = self.asset_executor(

Check warning on line 469 in scraper/src/mindtouch2zim/processor.py

View check run for this annotation

Codecov / codecov/patch

scraper/src/mindtouch2zim/processor.py#L469

Added line #L469 was not covered by tests
delayed(self.asset_processor.process_asset)(
asset_path, asset_details, creator
)
Expand Down
3 changes: 2 additions & 1 deletion scraper/tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import threading
from typing import Any

from zimscraperlib.download import get_session

from mindtouch2zim.context import Context

CONTEXT_DEFAULTS = {
CONTEXT_DEFAULTS: dict[str, Any] = {
"web_session": get_session(),
"tmp_folder": None,
"cache_folder": None,
Expand Down
2 changes: 1 addition & 1 deletion scraper/tests/test_asset.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def test_get_mime_type(
):

assert (
processor._get_mime_type(
processor._get_mime_type( # pyright: ignore[reportPrivateUsage]
header_data=HeaderData(ident="foo", content_type=header_content_type),
asset_url=HttpUrl(
"https://www.acme.com/xenolith-of-diorite.jpg?revision=1"
Expand Down
11 changes: 7 additions & 4 deletions scraper/tests/test_context.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import re
from typing import Any

import pytest

from mindtouch2zim.context import Context
Expand All @@ -20,15 +23,15 @@ def test_context_defaults():
context = Context.get()
assert context == processor_context # check both objects are same
assert context.assets_workers == 10
assert ( # check getter logic
context.wm_user_agent
== "mindtouch2zim/0.1.0-dev0 (https://www.kiwix.org) zimscraperlib/5.0.0-dev0"
assert re.match( # check getter logic
r"mindtouch2zim\/.* \(https:\/\/www\.kiwix\.org\) zimscraperlib\/.*",
context.wm_user_agent,
)
context.current_thread_workitem = "context 123"
assert context.current_thread_workitem == "context 123"


def test_context_setup_again(context_defaults):
def test_context_setup_again(context_defaults: dict[str, Any]):
settings = context_defaults.copy()
settings["title"] = "A title"
Context.setup(**settings)
Expand Down
2 changes: 1 addition & 1 deletion scraper/tests/test_entrypoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ def test_entrypoint_regex_args(
prepare_context(
([*good_cli_args, arg_name, arg_value] if arg_name else good_cli_args), tmpdir
)
regex: re.Pattern = context.__getattribute__(context_name)
regex: re.Pattern[str] = context.__getattribute__(context_name)
for match in expected_match:
assert regex.findall(match)
for match in expected_no_match:
Expand Down
2 changes: 1 addition & 1 deletion scraper/tests/test_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ def dummy_encoded_url() -> str:


@pytest.fixture(scope="module")
def library_tree(dummy_encoded_url) -> LibraryTree:
def library_tree(dummy_encoded_url: str) -> LibraryTree:
root = LibraryPage(
id="24", title="Home page", path="", encoded_url=dummy_encoded_url
)
Expand Down