Skip to content

Commit

Permalink
Merge pull request #98 from openzim/detailed_licensing
Browse files Browse the repository at this point in the history
Retrieve and rewrite Detailed licensing pages content
  • Loading branch information
benoit74 authored Dec 6, 2024
2 parents 7a97c59 + e793254 commit b3c8911
Show file tree
Hide file tree
Showing 4 changed files with 222 additions and 1 deletion.
97 changes: 97 additions & 0 deletions scraper/src/mindtouch2zim/libretexts/detailed_licensing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
from typing import Any

from jinja2 import Template
from pydantic import BaseModel
from zimscraperlib.rewriting.html import HtmlRewriter

from mindtouch2zim.client import LibraryPage, MindtouchClient
from mindtouch2zim.constants import logger
from mindtouch2zim.context import Context

context = Context.get()


class LicenseStatistic(BaseModel):
    """Figures describing how much one license is used.

    NOTE(review): nothing in the visible code instantiates this model. The field
    names match what the detailed licensing template reads from each entry of
    `data.meta.licenses`; confirm against the API payload before relying on it.
    """

    label: str  # license name
    version: str | None  # license version, None is accepted
    # presumably the share of pages using this license, in percent - TODO confirm
    percent: float
    # presumably the number of pages using this license - TODO confirm
    count: int
    # presumably the URL of the license text - TODO confirm
    link: str


class LicenseInfo(BaseModel):
    """Container grouping license statistics with a list of details.

    NOTE(review): not referenced by the visible code; verify it is still needed.
    """

    statistics: list[LicenseStatistic]  # one LicenseStatistic per entry
    # bare `list`: item type is not validated; presumably PageInfo - TODO confirm
    details: list


class PageInfo(BaseModel):
    """Licensing information of one page and, recursively, of its children.

    NOTE(review): not referenced by the visible code; verify it is still needed.
    """

    license_label: str
    # required and non-null here. NOTE(review): the integration test states the
    # API does not set the version at least for the "Undeclared" license - confirm
    license_version: str
    url: str
    title: str
    # nested pages, same model (quoted forward reference to this very class)
    children: list["PageInfo"]


def _get_licensing_report_data(cover_url: str) -> Any:
    """
    Fetch the licensing report of a book from the libretexts.org API

    The request has been adapted from the `buildLicensingReport` function found
    at https://cdn.libretexts.net/github/LibreTextsMain/DynamicLicensing/dist/dynamicLicensing.min.js
    which is probably built from
    https://github.com/LibreTexts/Libretext/blob/master/public/DynamicLicensing/dynamicLicensing.js

    Args:
        cover_url: URL of the book cover page, appended as-is to the endpoint path

    Returns:
        The decoded JSON body of the API response

    Raises:
        Any exception raised by `raise_for_status()` on the response
    """
    endpoint = f"https://api.libretexts.org/endpoint/licensereport/{cover_url}"
    logger.debug(f"Calling API at {endpoint}")
    # the Origin header acts as a kind of authorization header for this API
    request_headers = {"Origin": "https://www.libretexts.org"}
    response = context.web_session.get(
        url=endpoint,
        headers=request_headers,
        timeout=context.http_timeout_long_seconds,
    )
    response.raise_for_status()
    return response.json()


def _render_html_from_data(jinja2_template: Template, licensing_data: Any) -> str:
    """
    Render the detailed licensing page from the licensing report data

    Args:
        jinja2_template: template rendered with the `data` and
            `special_restrictions` variables
        licensing_data: licensing report, a mapping; its optional
            `meta.specialRestrictions` entry is a list of restriction keys

    Returns:
        The rendered template. `special_restrictions` is passed as None when the
        report has no (or empty) special restrictions, otherwise as a
        comma-separated string of human-readable labels.
    """
    restriction_labels = {
        "noncommercial": "Noncommercial",
        "noderivatives": "No Derivatives",
        "fairuse": "Fair Use",
    }
    # `or {}` also covers a "meta" key which is present but null; a plain
    # `.get("meta", {})` returns None in that case and the chained `.get` fails
    restrictions = (licensing_data.get("meta") or {}).get("specialRestrictions")
    special_restrictions = (
        ", ".join(
            # unknown restriction keys are displayed as-is
            restriction_labels.get(restriction, restriction)
            for restriction in restrictions
        )
        if restrictions
        else None
    )
    return jinja2_template.render(
        data=licensing_data, special_restrictions=special_restrictions
    )


def rewrite_detailed_licensing(
    rewriter: HtmlRewriter,
    jinja2_template: Template,
    mindtouch_client: MindtouchClient,
    page: LibraryPage,
) -> str:
    """
    Build a static version of the libretexts.org detailed licensing page

    The licensing report of the cover page of `page` is retrieved from the API,
    rendered with `jinja2_template`, and the resulting HTML is passed through
    `rewriter`; the rewritten content is returned.
    """
    cover_url = mindtouch_client.get_cover_page_encoded_url(page)
    licensing_data = _get_licensing_report_data(cover_url)
    html = _render_html_from_data(
        jinja2_template=jinja2_template, licensing_data=licensing_data
    )
    return rewriter.rewrite(html).content
19 changes: 18 additions & 1 deletion scraper/src/mindtouch2zim/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
from mindtouch2zim.errors import NoIllustrationFoundError
from mindtouch2zim.html import get_text
from mindtouch2zim.html_rewriting import HtmlUrlsRewriter
from mindtouch2zim.libretexts.detailed_licensing import rewrite_detailed_licensing
from mindtouch2zim.libretexts.glossary import rewrite_glossary
from mindtouch2zim.libretexts.index import rewrite_index
from mindtouch2zim.ui import (
Expand Down Expand Up @@ -251,6 +252,9 @@ def _run_internal(self) -> Path:
self.libretexts_index_template = self.jinja2_env.get_template(
"libretexts.index.html"
)
self.libretexts_detailed_licensing_template = self.jinja2_env.get_template(
"libretexts.detailed-licensing.html"
)

# Start creator early to detect problems early.
with creator as creator:
Expand Down Expand Up @@ -542,6 +546,20 @@ def _process_page(
jinja2_template=self.libretexts_glossary_template,
original_content=page_content.html_body,
)
elif (
"https://cdn.libretexts.net/github/LibreTextsMain/DynamicLicensing/dist/dynamicLicensing.min.js"
in page_content.html_body
):
logger.debug(
f"Rewriting {context.current_thread_workitem} as libretexts.org"
" detailed licensing"
)
rewriten = rewrite_detailed_licensing(
rewriter=rewriter,
jinja2_template=self.libretexts_detailed_licensing_template,
mindtouch_client=self.mindtouch_client,
page=page,
)
except Exception as exc:
# code has been tested to work "in general", but many edge cases occur
# and since these pages are absolutely not essential, we just display a
Expand All @@ -550,7 +568,6 @@ def _process_page(
f"Problem processing special {context.current_thread_workitem}"
f", page is probably empty, storing empty page: {exc}"
)
return ""
if not rewriten:
# Default rewriting for 'normal' pages
rewriten = rewriter.rewrite(page_content.html_body).content
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
{# Static rendering of the libretexts.org detailed licensing page.
   Variables: `data` (licensing report) and `special_restrictions`
   (comma-separated labels, or none when there is no restriction). #}
<h2>Overview</h2>
<p>
  <strong>Title:</strong>
  <a href="{{ data.text.url }}" target="_blank" rel="noreferrer">{{ data.text.title }}</a>
</p>
<p><strong>Webpages:</strong> {{ data.text.totalPages }}</p>
{% if special_restrictions %}
<p>
  <strong>Applicable Restrictions:</strong>
  {{ special_restrictions }}
</p>
{% endif %}
<p><strong>All licenses found:</strong></p>
<ul>
  {% for license in data.meta.licenses %}
  <li>
    <a href="{{ license.link }}" target="_blank" rel="noreferrer"
      >{{ license.label }}{% if license.version %}&nbsp;{{ license.version }}{% endif %}</a
    >: {{ license.percent }}% ({{ license.count }} {% if license.count > 1 %}pages{% else %}page{%
    endif %})
  </li>
  {% endfor %}
</ul>
<h2>By Page</h2>
{# Renders one page entry, then recurses into its children as a nested list #}
{% macro render_detail(detail) -%}
<li>
  <a href="{{ detail.url }}" target="_blank" rel="noreferrer">{{ detail.title }}</a> {% if detail.license %} -
  <a href="{{ detail.license.link }}" target="_blank" rel="noreferrer">
    <em>{{ detail.license.label }} {{ detail.license.version or "" }}</em></a
  >
  {% endif %} {% if detail.children %}
  <ul>
    {% for child in detail.children %} {{ render_detail(child) }} {% endfor %}
  </ul>
  {% endif %}
</li>
{% endmacro %}
<div style="column-count: 2; margin-top: 1em">
  <ul style="margin: 0">
    {{ render_detail(data.text) }}
  </ul>
</div>
65 changes: 65 additions & 0 deletions scraper/tests-integration/libretexts/test_detailed_licensing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from typing import Any

import pytest
from jinja2 import Environment, FileSystemLoader, select_autoescape

from mindtouch2zim.constants import ROOT_DIR
from mindtouch2zim.libretexts.detailed_licensing import (
_get_licensing_report_data,
_render_html_from_data,
)


@pytest.fixture(scope="module")
def licensing_report_data() -> Any:
    """Licensing report of a sample book, retrieved once for the whole module"""
    book_url = (
        "https://geo.libretexts.org/Courses/California_State_University_Los_Angeles/"
        "Book%3A_An_Introduction_to_Geology_(Johnson_Affolter_Inkenbrandt_and_Mosher)"
    )
    return _get_licensing_report_data(book_url)


def test_get_licensing_report_data(licensing_report_data: Any):
    """Check the licensing report still has the structure we rely on"""

    report = licensing_report_data
    assert report

    # statistics properties (only the first license entry is inspected)
    assert "meta" in report
    meta = report["meta"]
    assert "specialRestrictions" in meta
    assert "licenses" in meta
    licenses = meta["licenses"]
    assert isinstance(licenses, list)
    first_license = licenses[0]
    for key in ("label", "link", "version", "count"):
        assert key in first_license
    assert int(first_license["count"])
    assert "percent" in first_license
    assert float(first_license["percent"])
    assert "text" in report
    assert "totalPages" in report["text"]

    # details properties, checked recursively on the whole pages tree
    def check_item(item: Any):
        assert "license" in item
        item_license = item["license"]
        assert "label" in item_license
        assert "link" in item_license
        # "version" is optional: it is not set at least for "Undeclared" license
        if item_license["label"] != "Undeclared":
            assert "version" in item_license
        for key in ("url", "title", "children"):
            assert key in item
        children = item["children"]
        assert isinstance(children, list)
        for child in children:
            check_item(child)

    check_item(report["text"])


def test_render_licensing_template(licensing_report_data: Any):
    """Check the detailed licensing template renders to a non-empty document"""
    templates_dir = ROOT_DIR.joinpath("templates")
    env = Environment(
        loader=FileSystemLoader(templates_dir),
        autoescape=select_autoescape(),
    )
    rendered = _render_html_from_data(
        env.get_template("libretexts.detailed-licensing.html"),
        licensing_report_data,
    )
    assert rendered

0 comments on commit b3c8911

Please sign in to comment.