Add --scraper-suffix to extend the ZIM "Scraper" metadata (warc2zim 2.0.0-dev1).

Review fix: --scraper-suffix has no default, so argparse leaves it as None when
the flag is omitted; Converter now falls back to "" so the Scraper metadata is
never rendered as "warc2zim <version>None".

NOTE(review): context-line indentation and blank lines were reconstructed from
the hunk line counts of a whitespace-collapsed copy, and the converter.py index
hash predates the fix above; verify against the target tree before applying.

diff --git a/src/warc2zim/__about__.py b/src/warc2zim/__about__.py
index 494af57f..11fb1b2f 100644
--- a/src/warc2zim/__about__.py
+++ b/src/warc2zim/__about__.py
@@ -1 +1 @@
-__version__ = "2.0.0-dev0"
+__version__ = "2.0.0-dev1"
diff --git a/src/warc2zim/converter.py b/src/warc2zim/converter.py
index 96f33100..c0b399bd 100644
--- a/src/warc2zim/converter.py
+++ b/src/warc2zim/converter.py
@@ -142,6 +142,9 @@ def __init__(self, args):
 
         self.written_records = self.total_records = 0
 
+        # argparse leaves this as None when --scraper-suffix is not passed
+        self.scraper_suffix = args.scraper_suffix or ""
+
     def init_env(self):
         # autoescape=False to allow injecting html entities from translated text
         env = Environment(
@@ -240,7 +243,7 @@ def run(self):
             Illustration_48x48_at_1=self.illustration,
             Tags=";".join(self.tags),
             Source=self.source,
-            Scraper=f"warc2zim {get_version()}",
+            Scraper=f"warc2zim {get_version()}{self.scraper_suffix}",
         ).start()
 
         for filename in importlib.resources.files("warc2zim.statics").iterdir():
diff --git a/src/warc2zim/main.py b/src/warc2zim/main.py
index b459f0f0..d2582c1c 100644
--- a/src/warc2zim/main.py
+++ b/src/warc2zim/main.py
@@ -79,6 +79,12 @@ def main(raw_args=None):
         default="",
     )
 
+    parser.add_argument(
+        "--scraper-suffix",
+        help="Additional string to append as a suffix to ZIM Scraper metadata, in "
+        "addition to regular warc2zim value",
+    )
+
     args = parser.parse_args(args=raw_args)
     converter = Converter(args)
     return converter.run()
diff --git a/tests/test_warc_to_zim.py b/tests/test_warc_to_zim.py
index 4d659ee3..14dd31f6 100644
--- a/tests/test_warc_to_zim.py
+++ b/tests/test_warc_to_zim.py
@@ -10,6 +10,7 @@
 import requests
 from zimscraperlib.zim import Archive
 
+from warc2zim.__about__ import __version__
 from warc2zim.converter import iter_warc_records
 from warc2zim.main import main
 from warc2zim.url_rewriting import normalize
@@ -17,11 +18,13 @@
 
 TEST_DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data")
 
+SCRAPER_SUFFIX = " + zimit x.y.z-devw"
 
 # ============================================================================
 CMDLINES = [
     ["example-response.warc"],
     ["example-response.warc", "--progress-file", "progress.json"],
+    ["example-response.warc", "--scraper-suffix", SCRAPER_SUFFIX],
     ["example-resource.warc.gz", "--favicon", "https://example.com/some/favicon.ico"],
     ["example-resource.warc.gz", "--favicon", "https://www.google.com/favicon.ico"],
     ["example-revisit.warc.gz"],
@@ -96,7 +99,7 @@ def get_article_raw(self, zimfile, path):
         zim_fh = Archive(zimfile)
         return zim_fh.get_item(path)
 
-    def verify_warc_and_zim(self, warcfile, zimfile):
+    def verify_warc_and_zim(self, warcfile, zimfile, verify_scraper_suffix):
         assert os.path.isfile(warcfile)
         assert os.path.isfile(zimfile)
 
@@ -107,6 +110,13 @@ def verify_warc_and_zim(self, warcfile, zimfile):
         warc_urls = set()
 
         zim_fh = Archive(zimfile)
+
+        if verify_scraper_suffix:
+            assert (
+                f"warc2zim {__version__}{SCRAPER_SUFFIX}"
+                == zim_fh.get_text_metadata("Scraper")
+            )
+
         for record in iter_warc_records([warcfile]):
             url = get_record_url(record)
             if not url:
@@ -283,7 +293,9 @@ def test_warc_to_zim(self, cmdline, tmp_path):
                     and progress["written"] <= progress["total"]
                 )
 
-        self.verify_warc_and_zim(warcfile, tmp_path / zimfile)
+        self.verify_warc_and_zim(
+            warcfile, tmp_path / zimfile, "--scraper-suffix" in cmdline
+        )
 
     def test_same_domain_only(self, tmp_path):
         zim_output = "same-domain.zim"