Skip to content

Commit

Permalink
Allow to add a ZIM scraper suffix via CLI argument
Browse files Browse the repository at this point in the history
  • Loading branch information
benoit74 committed Jan 31, 2024
1 parent ae18aed commit 43e9843
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 4 deletions.
2 changes: 1 addition & 1 deletion src/warc2zim/__about__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "2.0.0-dev0"
__version__ = "2.0.0-dev1"
4 changes: 3 additions & 1 deletion src/warc2zim/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,8 @@ def __init__(self, args):

self.written_records = self.total_records = 0

self.scraper_suffix = args.scraper_suffix

def init_env(self):
# autoescape=False to allow injecting html entities from translated text
env = Environment(
Expand Down Expand Up @@ -240,7 +242,7 @@ def run(self):
Illustration_48x48_at_1=self.illustration,
Tags=";".join(self.tags),
Source=self.source,
Scraper=f"warc2zim {get_version()}",
Scraper=f"warc2zim {get_version()}{self.scraper_suffix}",
).start()

for filename in importlib.resources.files("warc2zim.statics").iterdir():
Expand Down
6 changes: 6 additions & 0 deletions src/warc2zim/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,12 @@ def main(raw_args=None):
default="",
)

# Optional suffix appended to the ZIM "Scraper" metadata (e.g. the name and
# version of a wrapping tool). It defaults to an empty string: without an
# explicit default argparse stores None, which the Scraper f-string would
# render literally as "warc2zim <version>None" when the flag is omitted.
parser.add_argument(
    "--scraper-suffix",
    help="Additional string to append as a suffix to ZIM Scraper metadata, in "
    "addition to regular warc2zim value",
    default="",
)

args = parser.parse_args(args=raw_args)
converter = Converter(args)
return converter.run()
Expand Down
16 changes: 14 additions & 2 deletions tests/test_warc_to_zim.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,21 @@
import requests
from zimscraperlib.zim import Archive

from warc2zim.__about__ import __version__
from warc2zim.converter import iter_warc_records
from warc2zim.main import main
from warc2zim.url_rewriting import normalize
from warc2zim.utils import get_record_url

TEST_DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")

SCRAPER_SUFFIX = " + zimit x.y.z-devw"

# ============================================================================
CMDLINES = [
["example-response.warc"],
["example-response.warc", "--progress-file", "progress.json"],
["example-response.warc", "--scraper-suffix", SCRAPER_SUFFIX],
["example-resource.warc.gz", "--favicon", "https://example.com/some/favicon.ico"],
["example-resource.warc.gz", "--favicon", "https://www.google.com/favicon.ico"],
["example-revisit.warc.gz"],
Expand Down Expand Up @@ -96,7 +99,7 @@ def get_article_raw(self, zimfile, path):
zim_fh = Archive(zimfile)
return zim_fh.get_item(path)

def verify_warc_and_zim(self, warcfile, zimfile):
def verify_warc_and_zim(self, warcfile, zimfile, verify_scraper_suffix):
assert os.path.isfile(warcfile)
assert os.path.isfile(zimfile)

Expand All @@ -107,6 +110,13 @@ def verify_warc_and_zim(self, warcfile, zimfile):
warc_urls = set()

zim_fh = Archive(zimfile)

if verify_scraper_suffix:
assert (
f"warc2zim {__version__}{SCRAPER_SUFFIX}"
== zim_fh.get_text_metadata("Scraper")
)

for record in iter_warc_records([warcfile]):
url = get_record_url(record)
if not url:
Expand Down Expand Up @@ -283,7 +293,9 @@ def test_warc_to_zim(self, cmdline, tmp_path):
and progress["written"] <= progress["total"]
)

self.verify_warc_and_zim(warcfile, tmp_path / zimfile)
self.verify_warc_and_zim(
warcfile, tmp_path / zimfile, "--scraper-suffix" in cmdline
)

def test_same_domain_only(self, tmp_path):
zim_output = "same-domain.zim"
Expand Down

0 comments on commit 43e9843

Please sign in to comment.