diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py index bbec243a0..edf3593c8 100644 --- a/src/lxml/html/clean.py +++ b/src/lxml/html/clean.py @@ -8,9 +8,10 @@ import copy try: from urlparse import urlsplit + from urllib import unquote_plus except ImportError: # Python 3 - from urllib.parse import urlsplit + from urllib.parse import urlsplit, unquote_plus from lxml import etree from lxml.html import defs from lxml.html import fromstring, tostring, XHTML_NAMESPACE @@ -466,7 +467,7 @@ def _kill_elements(self, doc, condition, iterate=None): def _remove_javascript_link(self, link): # links like "j a v a s c r i p t:" might be interpreted in IE - new = _substitute_whitespace('', link) + new = _substitute_whitespace('', unquote_plus(link)) if _is_javascript_scheme(new): # FIXME: should this be None to delete? return '' @@ -493,6 +494,11 @@ def _has_sneaky_javascript(self, style): return True if 'expression(' in style: return True + if '@import' in style: + return True + if ' + # thus passing the through into the output. + html = '', + lxml.html.tostring(clean_html(s))) + + def test_sneaky_import_in_style(self): + # Prevent "@@importimport" -> "@import" replacement. + style_codes = [ + "@@importimport(extstyle.css)", + "@ @ import import(extstyle.css)", + "@ @ importimport(extstyle.css)", + "@@ import import(extstyle.css)", + "@ @import import(extstyle.css)", + "@@importimport()", + ] + for style_code in style_codes: + html = '' % style_code + s = lxml.html.fragment_fromstring(html) + + cleaned = lxml.html.tostring(clean_html(s)) + self.assertEqual( + b'', + cleaned, + "%s -> %s" % (style_code, cleaned)) + + def test_formaction_attribute_in_button_input(self): + # The formaction attribute overrides the form's action and should be + # treated as a malicious link attribute + html = ('
' + '') + expected = ('
' + '
') + cleaner = Cleaner( + forms=False, + safe_attrs_only=False, + ) + self.assertEqual( + expected, + cleaner.clean_html(html)) + def test_suite(): suite = unittest.TestSuite() diff --git a/src/lxml/html/tests/test_clean.py.orig b/src/lxml/html/tests/test_clean.py.orig new file mode 100644 index 000000000..c8bc8f06c --- /dev/null +++ b/src/lxml/html/tests/test_clean.py.orig @@ -0,0 +1,105 @@ +import unittest, sys +from lxml.tests.common_imports import make_doctest +from lxml.etree import LIBXML_VERSION + +import lxml.html +from lxml.html.clean import Cleaner, clean_html + + +class CleanerTest(unittest.TestCase): + def test_allow_tags(self): + html = """ + + + + +

some text

+ + + + + + + +
helloworld
helloworld
+ + + + """ + + html_root = lxml.html.document_fromstring(html) + cleaner = Cleaner( + remove_unknown_tags = False, + allow_tags = ['table', 'tr', 'td']) + result = cleaner.clean_html(html_root) + + self.assertEqual(12-5+1, len(list(result.iter()))) + + def test_safe_attrs_included(self): + html = """

Cyan

""" + + safe_attrs=set(lxml.html.defs.safe_attrs) + safe_attrs.add('style') + + cleaner = Cleaner( + safe_attrs_only=True, + safe_attrs=safe_attrs) + result = cleaner.clean_html(html) + + self.assertEqual(html, result) + + def test_safe_attrs_excluded(self): + html = """

Cyan

""" + expected = """

Cyan

""" + + safe_attrs=set() + + cleaner = Cleaner( + safe_attrs_only=True, + safe_attrs=safe_attrs) + result = cleaner.clean_html(html) + + self.assertEqual(expected, result) + + def test_clean_invalid_root_tag(self): + # only testing that cleaning with invalid root tags works at all + s = lxml.html.fromstring('parent child') + self.assertEqual('parent child', clean_html(s).text_content()) + + s = lxml.html.fromstring('child') + self.assertEqual('child', clean_html(s).text_content()) + + def test_sneaky_noscript_in_style(self): + # This gets parsed as through into the output. + html = '', + lxml.html.tostring(clean_html(s))) + + def test_formaction_attribute_in_button_input(self): + # The formaction attribute overrides the form's action and should be + # treated as a malicious link attribute + html = ('
' + '') + expected = ('
' + '
') + cleaner = Cleaner( + forms=False, + safe_attrs_only=False, + ) + self.assertEqual( + expected, + cleaner.clean_html(html)) + + +def test_suite(): + suite = unittest.TestSuite() + if sys.version_info >= (2,4): + suite.addTests([make_doctest('test_clean.txt')]) + if LIBXML_VERSION >= (2,6,31): + suite.addTests([make_doctest('test_clean_embed.txt')]) + suite.addTests(unittest.makeSuite(CleanerTest)) + return suite diff --git a/src/lxml/html/tests/test_clean.txt b/src/lxml/html/tests/test_clean.txt index 4431c9802..8b6aca182 100644 --- a/src/lxml/html/tests/test_clean.txt +++ b/src/lxml/html/tests/test_clean.txt @@ -18,7 +18,7 @@ ... ... ... a link -... a control char link +... a control char link ... data ... another link ...

a paragraph

@@ -51,7 +51,7 @@ a link - a control char link + a control char link data another link

a paragraph

@@ -84,7 +84,7 @@ a link - a control char link + a control char link data another link

a paragraph