Skip to content

Commit

Permalink
Merge pull request #110 from openzim/always_optimize_images
Browse files Browse the repository at this point in the history
Better detect when we can optimize images
  • Loading branch information
benoit74 authored Dec 6, 2024
2 parents 36618e5 + 49a42b5 commit 58c7951
Show file tree
Hide file tree
Showing 6 changed files with 390 additions and 108 deletions.
114 changes: 98 additions & 16 deletions scraper/src/mindtouch2zim/asset.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
import math
import mimetypes
import threading
from functools import partial
from io import BytesIO
from typing import NamedTuple
from urllib.parse import urlsplit

import backoff
from kiwixstorage import KiwixStorage, NotFoundError
Expand Down Expand Up @@ -55,6 +57,7 @@ class AssetDetails(NamedTuple):
asset_urls: set[HttpUrl]
used_by: set[str]
always_fetch_online: bool
kind: str | None

@property
def get_usage_repr(self) -> str:
Expand All @@ -64,6 +67,60 @@ def get_usage_repr(self) -> str:
return f' used by {", ".join(self.used_by)}'


class AssetManager:
"""Class responsible to manage a list of assets to download"""

def __init__(self) -> None:
self.assets: dict[ZimPath, AssetDetails] = {}

def add_asset(
self,
asset_path: ZimPath,
asset_url: HttpUrl,
used_by: str,
kind: str | None,
*,
always_fetch_online: bool,
):
"""Add a new asset to download
asset_path: target path inside the ZIM
asset_url: URL where the asset can be downloaded
used_by: string explaining (mostly for debug purposes) where the asset is used
(typically on which page)
kind: kind of asset if we know it ; for instance "img" is used to not rely only
on returned mime type to decide if we should optimize it or not
always_fetch_online: if False, the asset may be cached on S3 ; if True, it is
always fetch online
"""
if asset_path not in self.assets:
self.assets[asset_path] = AssetDetails(
asset_urls={asset_url},
used_by={used_by},
kind=kind,
always_fetch_online=always_fetch_online,
)
return
current_asset = self.assets[asset_path]
if current_asset.kind != kind:
logger.warning(
f"Conflicting kind found for asset at {asset_path} already used by "
f"{current_asset.get_usage_repr}; current kind is "
f"'{current_asset.kind}';new kind '{kind}' from {asset_url} used by "
f"{used_by} will be ignored"
)
if current_asset.always_fetch_online != always_fetch_online:
logger.warning(
f"Conflicting always_fetch_online found for asset at {asset_path} "
f"already used by {current_asset.get_usage_repr}; current "
f"always_fetch_online is '{current_asset.always_fetch_online}';"
f"new always_fetch_online '{always_fetch_online}' from {asset_url} used"
f" by {used_by} will be ignored"
)
current_asset.used_by.add(used_by)
current_asset.asset_urls.add(asset_url)


class AssetProcessor:

def __init__(
Expand All @@ -89,6 +146,7 @@ def process_asset(
asset_path=asset_path,
asset_url=asset_url,
always_fetch_online=asset_details.always_fetch_online,
kind=asset_details.kind,
)
logger.debug(f"Adding asset to {asset_path.value} in the ZIM")
creator.add_item_for(
Expand Down Expand Up @@ -161,8 +219,9 @@ def _get_image_content(

if context.s3_url_with_credentials:
if s3_data := self._download_from_s3_cache(s3_key=s3_key, meta=meta):
logger.debug("Fetching directly from S3 cache")
return s3_data # found in cache
if len(s3_data.getvalue()) > 0:
logger.debug("Fetched directly from S3 cache")
return s3_data # found in cache

logger.debug("Fetching from online")
unoptimized = self._download_from_online(asset_url=asset_url)
Expand Down Expand Up @@ -243,33 +302,56 @@ def _download_from_online(self, asset_url: HttpUrl) -> BytesIO:
)
return asset_content

def _get_mime_type(
self,
header_data: HeaderData,
asset_url: HttpUrl,
kind: str | None,
) -> str | None:
if header_data.content_type:
mime_type = header_data.content_type.split(";")[0].strip()
else:
mime_type = None
if (
mime_type is None or mime_type == "application/octet-stream"
) and kind == "img":
# try to source mime_type from file extension
mime_type, _ = mimetypes.guess_type(urlsplit(asset_url.value).path)
return mime_type

@backoff.on_exception(
partial(backoff.expo, base=3, factor=2),
RequestException,
max_time=30, # secs
on_backoff=backoff_hdlr,
)
def get_asset_content(
self, asset_path: ZimPath, asset_url: HttpUrl, *, always_fetch_online: bool
self,
asset_path: ZimPath,
asset_url: HttpUrl,
kind: str | None,
*,
always_fetch_online: bool,
) -> BytesIO:
"""Download of a given asset, optimize if needed, or download from S3 cache"""

try:
if not always_fetch_online:
header_data = self._get_header_data_for(asset_url)
if header_data.content_type:
mime_type = header_data.content_type.split(";")[0].strip()
if mime_type in SUPPORTED_IMAGE_MIME_TYPES:
return self._get_image_content(
asset_path=asset_path,
asset_url=asset_url,
header_data=header_data,
)
else:
logger.debug(
f"Not optimizing, unsupported mime type: {mime_type} for "
f"{context.current_thread_workitem}"
)
mime_type = self._get_mime_type(
header_data=header_data, asset_url=asset_url, kind=kind
)
if mime_type and mime_type in SUPPORTED_IMAGE_MIME_TYPES:
return self._get_image_content(
asset_path=asset_path,
asset_url=asset_url,
header_data=header_data,
)
else:
logger.debug(
f"Not optimizing, unsupported mime type: {mime_type} for "
f"{context.current_thread_workitem}"
)

return self._download_from_online(asset_url=asset_url)
except RequestException as exc:
Expand Down
18 changes: 8 additions & 10 deletions scraper/src/mindtouch2zim/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,12 +362,11 @@ def get_cover_page(self, page: LibraryPage) -> LibraryPage | None:
or "coverpage:nocommons" in current_definition.tags
):
return current_page
if "article:topic-category" in current_definition.tags:
if (
"article:topic-category" in current_definition.tags
or current_page.parent is None
):
return None
if current_page.parent is None:
raise MindtouchParsingError(
f"No more parent for {page.id}, reached root at {current_page.id}"
)
current_page = current_page.parent

def _get_cover_page_from_str_id(self, page_id: str) -> str | None:
Expand All @@ -390,12 +389,11 @@ def _get_cover_page_from_str_id(self, page_id: str) -> str | None:
or "coverpage:nocommons" in current_definition.tags
):
return current_page
if "article:topic-category" in current_definition.tags:
if (
"article:topic-category" in current_definition.tags
or current_definition.parent_id is None
):
return None
if current_definition.parent_id is None:
raise MindtouchParsingError(
f"No more parent for {page_id}, reached root at {current_page}"
)
current_page = current_definition.parent_id

def get_cover_page_encoded_url(self, page: LibraryPage) -> str | None:
Expand Down
39 changes: 22 additions & 17 deletions scraper/src/mindtouch2zim/html_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
ZimPath,
)

from mindtouch2zim.asset import AssetManager
from mindtouch2zim.client import LibraryPage
from mindtouch2zim.constants import logger
from mindtouch2zim.context import Context
Expand Down Expand Up @@ -106,14 +107,14 @@ def rewrite_iframe_tags(
f'https://i.ytimg.com/vi/{ytb_match.group("id")}/hqdefault.jpg',
base_href=base_href,
)
url_rewriter.add_item_to_download(rewrite_result)
url_rewriter.add_item_to_download(rewrite_result, "img")
image_rewriten_url = rewrite_result.rewriten_url
elif VIMEO_IFRAME_RE.match(src):
rewrite_result = url_rewriter(
get_vimeo_thumbnail_url(src),
base_href=base_href,
)
url_rewriter.add_item_to_download(rewrite_result)
url_rewriter.add_item_to_download(rewrite_result, "img")
image_rewriten_url = rewrite_result.rewriten_url
else:
logger.debug(
Expand Down Expand Up @@ -158,7 +159,11 @@ class HtmlUrlsRewriter(ArticleUrlRewriter):
"""

def __init__(
self, library_url: str, page: LibraryPage, existing_zim_paths: set[ZimPath]
self,
library_url: str,
page: LibraryPage,
existing_zim_paths: set[ZimPath],
asset_manager: AssetManager,
):
super().__init__(
article_url=HttpUrl(f"{library_url}/{page.path}"),
Expand All @@ -167,28 +172,28 @@ def __init__(
)
self.library_url = library_url
self.library_path = ArticleUrlRewriter.normalize(HttpUrl(f"{library_url}/"))
self.items_to_download: dict[ZimPath, set[HttpUrl]] = {}
self.page = page
self.asset_manager = asset_manager

def __call__(
self, item_url: str, base_href: str | None, *, rewrite_all_url: bool = True
) -> RewriteResult:
result = super().__call__(item_url, base_href, rewrite_all_url=rewrite_all_url)
return result

def add_item_to_download(self, rewrite_result: RewriteResult):
def add_item_to_download(self, rewrite_result: RewriteResult, kind: str | None):
"""Add item to download based on rewrite result"""
if rewrite_result.zim_path is not None:
# if item is expected to be inside the ZIM, store asset information so that
# we can download it afterwards
if rewrite_result.zim_path in self.items_to_download:
self.items_to_download[rewrite_result.zim_path].add(
HttpUrl(rewrite_result.absolute_url)
)
else:
self.items_to_download[rewrite_result.zim_path] = {
HttpUrl(rewrite_result.absolute_url)
}
if rewrite_result.zim_path is None:
return
# if item is expected to be inside the ZIM, store asset information so that
# we can download it afterwards
self.asset_manager.add_asset(
asset_path=rewrite_result.zim_path,
asset_url=HttpUrl(rewrite_result.absolute_url),
used_by=context.current_thread_workitem,
kind=kind,
always_fetch_online=False,
)


@html_rules.rewrite_tag()
Expand Down Expand Up @@ -234,7 +239,7 @@ def rewrite_img_tags(
rewrite_result = url_rewriter(src_value, base_href=base_href, rewrite_all_url=True)
# add 'content/' to the URL since all assets will be stored in the sub.-path
new_attr_value = f"content/{rewrite_result.rewriten_url}"
url_rewriter.add_item_to_download(rewrite_result)
url_rewriter.add_item_to_download(rewrite_result, "img")

values = " ".join(
format_attr(*attr)
Expand Down
Loading

0 comments on commit 58c7951

Please sign in to comment.