diff --git a/src/warc2zim/content_rewriting/css.py b/src/warc2zim/content_rewriting/css.py index be525997..534eada5 100644 --- a/src/warc2zim/content_rewriting/css.py +++ b/src/warc2zim/content_rewriting/css.py @@ -11,15 +11,23 @@ from tinycss2.serializer import serialize_url from warc2zim.content_rewriting import UrlRewriterProto -from warc2zim.content_rewriting.rx_replacer import RxRewriter, m2str +from warc2zim.content_rewriting.rx_replacer import RxRewriter class FallbackRegexCssRewriter(RxRewriter): def __init__(self, url_rewriter: UrlRewriterProto): rules = [ ( - re.compile(r"""url\((?P['"])?.+?(?P=quote)(?['"])?(?P.+?)(?P=quote)(? str: if isinstance(content, bytes): @@ -39,12 +48,7 @@ def rewrite(self, content: str | bytes) -> str: try: output = serialize(rules) except Exception: - fallback_rewriter = FallbackRegexCssRewriter(self.url_rewriter) - if isinstance(content, bytes): - content = content.decode() - return fallback_rewriter.rewrite_content( - content, {} # pyright: ignore[reportArgumentType] - ) + return self.fallback_rewriter.rewrite_content(content, {}) return output def rewrite_inline(self, content: str) -> str: @@ -54,8 +58,7 @@ def rewrite_inline(self, content: str) -> str: output = serialize(rules) return output except Exception: - fallback_rewriter = FallbackRegexCssRewriter(self.url_rewriter) - return fallback_rewriter.rewrite_content(content, {}) + return self.fallback_rewriter.rewrite_content(content, {}) def process_list(self, components: Iterable[ast.Node]): if components: # May be null diff --git a/tests/test_css_rewriting.py b/tests/test_css_rewriting.py index 4b0266b3..1375ce9a 100644 --- a/tests/test_css_rewriting.py +++ b/tests/test_css_rewriting.py @@ -38,40 +38,68 @@ def no_rewrite_content(request): def test_no_rewrite(no_rewrite_content): assert ( CssRewriter(ArticleUrlRewriter(no_rewrite_content.article_url, set())).rewrite( - no_rewrite_content.input_ + no_rewrite_content.input_bytes ) - == no_rewrite_content.expected.decode() + 
== no_rewrite_content.expected_bytes.decode() ) @pytest.fixture( params=[ - ContentForTests(b'"border:'), - ContentForTests(b"border: solid 1px #c0c0c0; width= 100%"), - ContentForTests(b"width:"), - ContentForTests(b"border-bottom-width: 1px;border-bottom-color: #c0c0c0;w"), + ContentForTests('"border:'), + ContentForTests("border: solid 1px #c0c0c0; width= 100%"), + # Despite being invalid, tinycss2 parses it as a "width" property without a value. + ContentForTests("width:", "width:;"), + ContentForTests("border-bottom-width: 1px;border-bottom-color: #c0c0c0;w"), + ContentForTests( + 'background: url("http://exemple.com/foo.png"); width=', + 'background: url("exemple.com/foo.png"); width=', + ), ] ) -def invalid_content(request): +def invalid_content_inline(request): yield request.param -def test_invalid_css(invalid_content): +def test_invalid_css_inline(invalid_content_inline): assert ( - CssRewriter(ArticleUrlRewriter(invalid_content.article_url, set())).rewrite( - invalid_content.input_ - ) - == invalid_content.expected.decode() + CssRewriter( + ArticleUrlRewriter(invalid_content_inline.article_url, set()) + ).rewrite_inline(invalid_content_inline.input_str) + == invalid_content_inline.expected_str ) @pytest.fixture( - params=[ContentForTests('p{background: url("http://exemple.com/foo.png"); width=}')] + params=[ + # tinycss2 parses `"border:}` as a string token that hits an unexpected EOF. 
+ # At serialization, tinycss2 tries to recover and closes the still-open rule + ContentForTests(b'p {"border:}', b'p {"border:}}'), + ContentForTests(b'"p {border:}'), + ContentForTests(b"p { border: solid 1px #c0c0c0; width= 100% }"), + ContentForTests(b"p { width: }"), + ContentForTests( + b"p { border-bottom-width: 1px;border-bottom-color: #c0c0c0;w }" + ), + ContentForTests( + b'p { background: url("http://exemple.com/foo.png"); width= }', + b'p { background: url("exemple.com/foo.png"); width= }', + ), + ] ) -def invalid_and_rewrite(request): +def invalid_content(request): yield request.param +def test_invalid_cssl(invalid_content): + assert ( + CssRewriter(ArticleUrlRewriter(invalid_content.article_url, set())).rewrite( + invalid_content.input_bytes + ) + == invalid_content.expected_bytes.decode() + ) + + def test_rewrite(): content = b""" /* A comment with a link : http://foo.com */