Merge pull request #110 from openzim/always_optimize_images

Better detect when we can optimize images
openzim · Dec 6, 2024 · 58c7951 · 58c7951
2 parents 36618e5 + 49a42b5
commit 58c7951
Show file tree

Hide file tree

Showing 6 changed files with 390 additions and 108 deletions.
diff --git a/scraper/src/mindtouch2zim/asset.py b/scraper/src/mindtouch2zim/asset.py
@@ -1,8 +1,10 @@
 import math
+import mimetypes
 import threading
 from functools import partial
 from io import BytesIO
 from typing import NamedTuple
+from urllib.parse import urlsplit
 
 import backoff
 from kiwixstorage import KiwixStorage, NotFoundError
@@ -55,6 +57,7 @@ class AssetDetails(NamedTuple):
     asset_urls: set[HttpUrl]
     used_by: set[str]
     always_fetch_online: bool
+    kind: str | None
 
     @property
     def get_usage_repr(self) -> str:
@@ -64,6 +67,60 @@ def get_usage_repr(self) -> str:
         return f' used by {", ".join(self.used_by)}'
 
 
+class AssetManager:
+    """Class responsible to manage a list of assets to download"""
+
+    def __init__(self) -> None:
+        self.assets: dict[ZimPath, AssetDetails] = {}
+
+    def add_asset(
+        self,
+        asset_path: ZimPath,
+        asset_url: HttpUrl,
+        used_by: str,
+        kind: str | None,
+        *,
+        always_fetch_online: bool,
+    ):
+        """Add a new asset to download
+
+        asset_path: target path inside the ZIM
+        asset_url: URL where the asset can be downloaded
+        used_by: string explaining (mostly for debug purposes) where the asset is used
+          (typically on which page)
+        kind: kind of asset if we know it ; for instance "img" is used to not rely only
+          on returned mime type to decide if we should optimize it or not
+        always_fetch_online: if False, the asset may be cached on S3 ; if True, it is
+          always fetch online
+        """
+        if asset_path not in self.assets:
+            self.assets[asset_path] = AssetDetails(
+                asset_urls={asset_url},
+                used_by={used_by},
+                kind=kind,
+                always_fetch_online=always_fetch_online,
+            )
+            return
+        current_asset = self.assets[asset_path]
+        if current_asset.kind != kind:
+            logger.warning(
+                f"Conflicting kind found for asset at {asset_path} already used by "
+                f"{current_asset.get_usage_repr}; current kind is "
+                f"'{current_asset.kind}';new kind '{kind}' from {asset_url} used by "
+                f"{used_by} will be ignored"
+            )
+        if current_asset.always_fetch_online != always_fetch_online:
+            logger.warning(
+                f"Conflicting always_fetch_online found for asset at {asset_path} "
+                f"already used by {current_asset.get_usage_repr}; current "
+                f"always_fetch_online is '{current_asset.always_fetch_online}';"
+                f"new always_fetch_online '{always_fetch_online}' from {asset_url} used"
+                f" by {used_by} will be ignored"
+            )
+        current_asset.used_by.add(used_by)
+        current_asset.asset_urls.add(asset_url)
+
+
 class AssetProcessor:
 
     def __init__(
@@ -89,6 +146,7 @@ def process_asset(
                     asset_path=asset_path,
                     asset_url=asset_url,
                     always_fetch_online=asset_details.always_fetch_online,
+                    kind=asset_details.kind,
                 )
                 logger.debug(f"Adding asset to {asset_path.value} in the ZIM")
                 creator.add_item_for(
@@ -161,8 +219,9 @@ def _get_image_content(
 
         if context.s3_url_with_credentials:
             if s3_data := self._download_from_s3_cache(s3_key=s3_key, meta=meta):
-                logger.debug("Fetching directly from S3 cache")
-                return s3_data  # found in cache
+                if len(s3_data.getvalue()) > 0:
+                    logger.debug("Fetched directly from S3 cache")
+                    return s3_data  # found in cache
 
         logger.debug("Fetching from online")
         unoptimized = self._download_from_online(asset_url=asset_url)
@@ -243,33 +302,56 @@ def _download_from_online(self, asset_url: HttpUrl) -> BytesIO:
         )
         return asset_content
 
+    def _get_mime_type(
+        self,
+        header_data: HeaderData,
+        asset_url: HttpUrl,
+        kind: str | None,
+    ) -> str | None:
+        if header_data.content_type:
+            mime_type = header_data.content_type.split(";")[0].strip()
+        else:
+            mime_type = None
+        if (
+            mime_type is None or mime_type == "application/octet-stream"
+        ) and kind == "img":
+            # try to source mime_type from file extension
+            mime_type, _ = mimetypes.guess_type(urlsplit(asset_url.value).path)
+        return mime_type
+
     @backoff.on_exception(
         partial(backoff.expo, base=3, factor=2),
         RequestException,
         max_time=30,  # secs
         on_backoff=backoff_hdlr,
     )
     def get_asset_content(
-        self, asset_path: ZimPath, asset_url: HttpUrl, *, always_fetch_online: bool
+        self,
+        asset_path: ZimPath,
+        asset_url: HttpUrl,
+        kind: str | None,
+        *,
+        always_fetch_online: bool,
     ) -> BytesIO:
         """Download of a given asset, optimize if needed, or download from S3 cache"""
 
         try:
             if not always_fetch_online:
                 header_data = self._get_header_data_for(asset_url)
-                if header_data.content_type:
-                    mime_type = header_data.content_type.split(";")[0].strip()
-                    if mime_type in SUPPORTED_IMAGE_MIME_TYPES:
-                        return self._get_image_content(
-                            asset_path=asset_path,
-                            asset_url=asset_url,
-                            header_data=header_data,
-                        )
-                    else:
-                        logger.debug(
-                            f"Not optimizing, unsupported mime type: {mime_type} for "
-                            f"{context.current_thread_workitem}"
-                        )
+                mime_type = self._get_mime_type(
+                    header_data=header_data, asset_url=asset_url, kind=kind
+                )
+                if mime_type and mime_type in SUPPORTED_IMAGE_MIME_TYPES:
+                    return self._get_image_content(
+                        asset_path=asset_path,
+                        asset_url=asset_url,
+                        header_data=header_data,
+                    )
+                else:
+                    logger.debug(
+                        f"Not optimizing, unsupported mime type: {mime_type} for "
+                        f"{context.current_thread_workitem}"
+                    )
 
             return self._download_from_online(asset_url=asset_url)
         except RequestException as exc:

diff --git a/scraper/src/mindtouch2zim/client.py b/scraper/src/mindtouch2zim/client.py
@@ -362,12 +362,11 @@ def get_cover_page(self, page: LibraryPage) -> LibraryPage | None:
                 or "coverpage:nocommons" in current_definition.tags
             ):
                 return current_page
-            if "article:topic-category" in current_definition.tags:
+            if (
+                "article:topic-category" in current_definition.tags
+                or current_page.parent is None
+            ):
                 return None
-            if current_page.parent is None:
-                raise MindtouchParsingError(
-                    f"No more parent for {page.id}, reached root at {current_page.id}"
-                )
             current_page = current_page.parent
 
     def _get_cover_page_from_str_id(self, page_id: str) -> str | None:
@@ -390,12 +389,11 @@ def _get_cover_page_from_str_id(self, page_id: str) -> str | None:
                 or "coverpage:nocommons" in current_definition.tags
             ):
                 return current_page
-            if "article:topic-category" in current_definition.tags:
+            if (
+                "article:topic-category" in current_definition.tags
+                or current_definition.parent_id is None
+            ):
                 return None
-            if current_definition.parent_id is None:
-                raise MindtouchParsingError(
-                    f"No more parent for {page_id}, reached root at {current_page}"
-                )
             current_page = current_definition.parent_id
 
     def get_cover_page_encoded_url(self, page: LibraryPage) -> str | None:

diff --git a/scraper/src/mindtouch2zim/html_rewriting.py b/scraper/src/mindtouch2zim/html_rewriting.py
@@ -13,6 +13,7 @@
     ZimPath,
 )
 
+from mindtouch2zim.asset import AssetManager
 from mindtouch2zim.client import LibraryPage
 from mindtouch2zim.constants import logger
 from mindtouch2zim.context import Context
@@ -106,14 +107,14 @@ def rewrite_iframe_tags(
                 f'https://i.ytimg.com/vi/{ytb_match.group("id")}/hqdefault.jpg',
                 base_href=base_href,
             )
-            url_rewriter.add_item_to_download(rewrite_result)
+            url_rewriter.add_item_to_download(rewrite_result, "img")
             image_rewriten_url = rewrite_result.rewriten_url
         elif VIMEO_IFRAME_RE.match(src):
             rewrite_result = url_rewriter(
                 get_vimeo_thumbnail_url(src),
                 base_href=base_href,
             )
-            url_rewriter.add_item_to_download(rewrite_result)
+            url_rewriter.add_item_to_download(rewrite_result, "img")
             image_rewriten_url = rewrite_result.rewriten_url
         else:
             logger.debug(
@@ -158,7 +159,11 @@ class HtmlUrlsRewriter(ArticleUrlRewriter):
     """
 
     def __init__(
-        self, library_url: str, page: LibraryPage, existing_zim_paths: set[ZimPath]
+        self,
+        library_url: str,
+        page: LibraryPage,
+        existing_zim_paths: set[ZimPath],
+        asset_manager: AssetManager,
     ):
         super().__init__(
             article_url=HttpUrl(f"{library_url}/{page.path}"),
@@ -167,28 +172,28 @@ def __init__(
         )
         self.library_url = library_url
         self.library_path = ArticleUrlRewriter.normalize(HttpUrl(f"{library_url}/"))
-        self.items_to_download: dict[ZimPath, set[HttpUrl]] = {}
         self.page = page
+        self.asset_manager = asset_manager
 
     def __call__(
         self, item_url: str, base_href: str | None, *, rewrite_all_url: bool = True
     ) -> RewriteResult:
         result = super().__call__(item_url, base_href, rewrite_all_url=rewrite_all_url)
         return result
 
-    def add_item_to_download(self, rewrite_result: RewriteResult):
+    def add_item_to_download(self, rewrite_result: RewriteResult, kind: str | None):
         """Add item to download based on rewrite result"""
-        if rewrite_result.zim_path is not None:
-            # if item is expected to be inside the ZIM, store asset information so that
-            # we can download it afterwards
-            if rewrite_result.zim_path in self.items_to_download:
-                self.items_to_download[rewrite_result.zim_path].add(
-                    HttpUrl(rewrite_result.absolute_url)
-                )
-            else:
-                self.items_to_download[rewrite_result.zim_path] = {
-                    HttpUrl(rewrite_result.absolute_url)
-                }
+        if rewrite_result.zim_path is None:
+            return
+        # if item is expected to be inside the ZIM, store asset information so that
+        # we can download it afterwards
+        self.asset_manager.add_asset(
+            asset_path=rewrite_result.zim_path,
+            asset_url=HttpUrl(rewrite_result.absolute_url),
+            used_by=context.current_thread_workitem,
+            kind=kind,
+            always_fetch_online=False,
+        )
 
 
 @html_rules.rewrite_tag()
@@ -234,7 +239,7 @@ def rewrite_img_tags(
     rewrite_result = url_rewriter(src_value, base_href=base_href, rewrite_all_url=True)
     # add 'content/' to the URL since all assets will be stored in the sub.-path
     new_attr_value = f"content/{rewrite_result.rewriten_url}"
-    url_rewriter.add_item_to_download(rewrite_result)
+    url_rewriter.add_item_to_download(rewrite_result, "img")
 
     values = " ".join(
         format_attr(*attr)