Skip to content

Commit

Permalink
Be more tolerant to invalid css.
Browse files Browse the repository at this point in the history
When tinycss2 fails to process a stylesheet, we fall back to a regex-based
rewriter that tries to rewrite the URLs in the CSS.

In case of invalid CSS, tinycss2 does not fail at parse time; instead it
generates a ParseError token:
https://doc.courtbouillon.org/tinycss2/stable/api_reference.html?highlight=parser#tinycss2.ast.ParseError

The exception only occurs at serialization time.
We nevertheless wrap the whole transformation in try/except to be sure.

Fix #155
  • Loading branch information
mgautierfr authored and benoit74 committed Feb 9, 2024
1 parent 8a8a402 commit 1fe30b7
Show file tree
Hide file tree
Showing 2 changed files with 125 additions and 17 deletions.
54 changes: 44 additions & 10 deletions src/warc2zim/content_rewriting/css.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
from collections.abc import Iterable

from tinycss2 import (
Expand All @@ -10,27 +11,60 @@
from tinycss2.serializer import serialize_url

from warc2zim.content_rewriting import UrlRewriterProto
from warc2zim.content_rewriting.rx_replacer import RxRewriter


class FallbackRegexCssRewriter(RxRewriter):
    """Regex-based rewriter for ``url(...)`` references in CSS.

    Intended as a best-effort fallback: it does not parse the CSS, it only
    looks for ``url(...)`` occurrences and passes the captured URL through
    ``url_rewriter``, keeping the original quoting style (single quotes,
    double quotes, or no quotes).
    """

    def __init__(self, url_rewriter: UrlRewriterProto):
        """Build the single ``url(...)`` rewriting rule.

        Args:
            url_rewriter: callable invoked with the raw URL text found inside
                ``url(...)``; its return value replaces that URL.
        """
        rules = [
            (
                # The quote group itself is mandatory but may capture an empty
                # string. In Python's ``re`` a backreference to a group that
                # did not participate in the match never matches, so writing
                # the group as ``(...)?`` would silently skip every unquoted
                # ``url(...)``. Capturing "" instead lets ``(?P=quote)`` match
                # the empty string and guarantees ``m_object["quote"]`` is
                # always a ``str``.
                #
                # ``(?!\))`` rejects an empty ``url()`` so the lazy ``.+?``
                # cannot start on the closing parenthesis and run on to a
                # later, unrelated ``)``.
                #
                # ``(?<!\\)`` keeps an escaped ``\)`` from being taken as the
                # end of the URL.
                re.compile(
                    r"""url\((?!\))(?P<quote>['"]?)(?P<url>.+?)(?P=quote)(?<!\\)\)"""
                ),
                lambda m_object, _opts: "".join(
                    [
                        "url(",
                        m_object["quote"],
                        url_rewriter(m_object["url"]),
                        m_object["quote"],
                        ")",
                    ]
                ),
            )
        ]
        super().__init__(rules)


class CssRewriter:
def __init__(self, url_rewriter: UrlRewriterProto):
self.url_rewriter = url_rewriter
self.fallback_rewriter = FallbackRegexCssRewriter(url_rewriter)

def rewrite(self, content: str | bytes) -> str:
if isinstance(content, bytes):
rules = parse_stylesheet_bytes(content)[0]
else:
rules = parse_stylesheet(content)
self.process_list(rules)
try:
if isinstance(content, bytes):
rules = parse_stylesheet_bytes(content)[0]
else:
rules = parse_stylesheet(content)
self.process_list(rules)

output = serialize(rules)
output = serialize(rules)
except Exception:
# When tinycss2 fails to parse the CSS, it generates an "Error" token
# instead of raising; the exception is raised at serialization time.
# We wrap the whole process in try/except to be sure anyway.
return self.fallback_rewriter.rewrite_content(content, {})
return output

def rewrite_inline(self, content: str) -> str:
rules = parse_declaration_list(content)
self.process_list(rules)
output = serialize(rules)
return output
try:
rules = parse_declaration_list(content)
self.process_list(rules)
output = serialize(rules)
return output
except Exception:
# When tinycss2 fails to parse the CSS, it generates an "Error" token
# instead of raising; the exception is raised at serialization time.
# We wrap the whole process in try/except to be sure anyway.
return self.fallback_rewriter.rewrite_content(content, {})

def process_list(self, components: Iterable[ast.Node]):
if components: # May be null
Expand Down
88 changes: 81 additions & 7 deletions tests/test_css_rewriting.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,30 @@
from warc2zim.content_rewriting.css import CssRewriter
from warc2zim.url_rewriting import ArticleUrlRewriter

from .utils import ContentForTests


@pytest.fixture(
params=[
b"p { color: red; }",
b"p {\n color: red;\n}",
b"p { background: blue; }",
b"p { background: rgb(15, 0, 52); }",
b"/* See bug issue at http://exemple.com/issue/link */ p { color: blue; }",
ContentForTests(b"p { color: red; }"),
ContentForTests(b"p {\n color: red;\n}"),
ContentForTests(b"p { background: blue; }"),
ContentForTests(b"p { background: rgb(15, 0, 52); }"),
ContentForTests(
b"/* See bug issue at http://exemple.com/issue/link */ p { color: blue; }"
),
ContentForTests(
b"p { width= } div { background: url(http://exemple.com/img.png)}",
b"p { width= } div { background: url(exemple.com/img.png)}",
),
ContentForTests(
b"p { width= } div { background: url('http://exemple.com/img.png')}",
b'p { width= } div { background: url("exemple.com/img.png")}',
),
ContentForTests(
b'p { width= } div { background: url("http://exemple.com/img.png")}',
b'p { width= } div { background: url("exemple.com/img.png")}',
),
]
)
def no_rewrite_content(request):
Expand All @@ -21,8 +37,66 @@ def no_rewrite_content(request):

def test_no_rewrite(no_rewrite_content):
assert (
CssRewriter(ArticleUrlRewriter("kiwix.org", set())).rewrite(no_rewrite_content)
== no_rewrite_content.decode()
CssRewriter(ArticleUrlRewriter(no_rewrite_content.article_url, set())).rewrite(
no_rewrite_content.input_bytes
)
== no_rewrite_content.expected_bytes.decode()
)


@pytest.fixture(
params=[
ContentForTests('"border:'),
ContentForTests("border: solid 1px #c0c0c0; width= 100%"),
# Despite being invalid, tinycss parse it as "width" property without value.
ContentForTests("width:", "width:;"),
ContentForTests("border-bottom-width: 1px;border-bottom-color: #c0c0c0;w"),
ContentForTests(
'background: url("http://exemple.com/foo.png"); width=',
'background: url("exemple.com/foo.png"); width=',
),
]
)
def invalid_content_inline(request):
yield request.param


def test_invalid_css_inline(invalid_content_inline):
assert (
CssRewriter(
ArticleUrlRewriter(invalid_content_inline.article_url, set())
).rewrite_inline(invalid_content_inline.input_str)
== invalid_content_inline.expected_str
)


@pytest.fixture(
params=[
# tinycss2 parses `"border:}` as a string with an unexpected EOF in the string.
# At serialization, tinycss2 tries to recover and closes the opened rule.
ContentForTests(b'p {"border:}', b'p {"border:}}'),
ContentForTests(b'"p {border:}'),
ContentForTests(b"p { border: solid 1px #c0c0c0; width= 100% }"),
ContentForTests(b"p { width: }"),
ContentForTests(
b"p { border-bottom-width: 1px;border-bottom-color: #c0c0c0;w }"
),
ContentForTests(
b'p { background: url("http://exemple.com/foo.png"); width= }',
b'p { background: url("exemple.com/foo.png"); width= }',
),
]
)
def invalid_content(request):
yield request.param


def test_invalid_cssl(invalid_content):
assert (
CssRewriter(ArticleUrlRewriter(invalid_content.article_url, set())).rewrite(
invalid_content.input_bytes
)
== invalid_content.expected_bytes.decode()
)


Expand Down

0 comments on commit 1fe30b7

Please sign in to comment.