diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py index adc3f450e..2c030ced1 100644 --- a/src/lxml/html/clean.py +++ b/src/lxml/html/clean.py @@ -8,9 +8,10 @@ import copy try: from urlparse import urlsplit + from urllib import unquote_plus except ImportError: # Python 3 - from urllib.parse import urlsplit + from urllib.parse import urlsplit, unquote_plus from lxml import etree from lxml.html import defs from lxml.html import fromstring, XHTML_NAMESPACE @@ -482,7 +483,7 @@ def _kill_elements(self, doc, condition, iterate=None): def _remove_javascript_link(self, link): # links like "j a v a s c r i p t:" might be interpreted in IE - new = _substitute_whitespace('', link) + new = _substitute_whitespace('', unquote_plus(link)) if _is_javascript_scheme(new): # FIXME: should this be None to delete? return '' @@ -509,6 +510,11 @@ def _has_sneaky_javascript(self, style): return True if 'expression(' in style: return True + if '@import' in style: + return True + if ' + # thus passing the through into the output. + html = '', + lxml.html.tostring(clean_html(s))) + + def test_sneaky_import_in_style(self): + # Prevent "@@importimport" -> "@import" replacement. + style_codes = [ + "@@importimport(extstyle.css)", + "@ @ import import(extstyle.css)", + "@ @ importimport(extstyle.css)", + "@@ import import(extstyle.css)", + "@ @import import(extstyle.css)", + "@@importimport()", + ] + for style_code in style_codes: + html = '' % style_code + s = lxml.html.fragment_fromstring(html) + + cleaned = lxml.html.tostring(clean_html(s)) + self.assertEqual( + b'', + cleaned, + "%s -> %s" % (style_code, cleaned)) + + def test_formaction_attribute_in_button_input(self): + # The formaction attribute overrides the form's action and should be + # treated as a malicious link attribute + html = ('
' + '') + expected = ('some text
+hello | world | +
hello | world | +
Cyan
""" + + safe_attrs=set(lxml.html.defs.safe_attrs) + safe_attrs.add('style') + + cleaner = Cleaner( + safe_attrs_only=True, + safe_attrs=safe_attrs) + result = cleaner.clean_html(html) + + self.assertEqual(html, result) + + def test_safe_attrs_excluded(self): + html = """Cyan
""" + expected = """Cyan
""" + + safe_attrs=set() + + cleaner = Cleaner( + safe_attrs_only=True, + safe_attrs=safe_attrs) + result = cleaner.clean_html(html) + + self.assertEqual(expected, result) + + def test_clean_invalid_root_tag(self): + # only testing that cleaning with invalid root tags works at all + s = lxml.html.fromstring('parenta paragraph
@@ -51,7 +51,7 @@ a link - a control char link + a control char link data another linka paragraph
@@ -84,7 +84,7 @@ a link - a control char link + a control char link data another linka paragraph