From 9e0184fff0c880720d0f61415f67e16bb8165fe1 Mon Sep 17 00:00:00 2001
From: turly221
Date: Fri, 29 Nov 2024 05:11:14 +0000
Subject: [PATCH 1/4] commit patch 8733371

---
 src/lxml/html/clean.py             |   5 +-
 src/lxml/html/clean.py.orig        | 725 +++++++++++++++++++++++++++++
 src/lxml/html/tests/test_clean.txt |   6 +-
 3 files changed, 731 insertions(+), 5 deletions(-)
 create mode 100644 src/lxml/html/clean.py.orig

diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py
index 8ce4f9bee..e38768733 100644
--- a/src/lxml/html/clean.py
+++ b/src/lxml/html/clean.py
@@ -8,9 +8,10 @@
 import copy
 try:
     from urlparse import urlsplit
+    from urllib import unquote_plus
 except ImportError:
     # Python 3
-    from urllib.parse import urlsplit
+    from urllib.parse import urlsplit, unquote_plus
 from lxml import etree
 from lxml.html import defs
 from lxml.html import fromstring, tostring, XHTML_NAMESPACE
@@ -466,7 +467,7 @@ def _kill_elements(self, doc, condition, iterate=None):
 
     def _remove_javascript_link(self, link):
         # links like "j a v a s c r i p t:" might be interpreted in IE
-        new = _substitute_whitespace('', link)
+        new = _substitute_whitespace('', unquote_plus(link))
         if _is_javascript_scheme(new):
             # FIXME: should this be None to delete?
             return ''
diff --git a/src/lxml/html/clean.py.orig b/src/lxml/html/clean.py.orig
new file mode 100644
index 000000000..8ce4f9bee
--- /dev/null
+++ b/src/lxml/html/clean.py.orig
@@ -0,0 +1,725 @@
+"""A cleanup tool for HTML.
+
+Removes unwanted tags and content.  See the `Cleaner` class for
+details.
+"""
+
+import re
+import copy
+try:
+    from urlparse import urlsplit
+except ImportError:
+    # Python 3
+    from urllib.parse import urlsplit
+from lxml import etree
+from lxml.html import defs
+from lxml.html import fromstring, tostring, XHTML_NAMESPACE
+from lxml.html import xhtml_to_html, _transform_result
+
+try:
+    unichr
+except NameError:
+    # Python 3
+    unichr = chr
+try:
+    unicode
+except NameError:
+    # Python 3
+    unicode = str
+try:
+    bytes
+except NameError:
+    # Python < 2.6
+    bytes = str
+try:
+    basestring
+except NameError:
+    basestring = (str, bytes)
+
+
+__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
+           'word_break', 'word_break_html']
+
+# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
+# Particularly the CSS cleaning; most of the tag cleaning is integrated now
+# I have multiple kinds of schemes searched; but should schemes be
+# whitelisted instead?
+# max height?
+# remove images?  Also in CSS?  background attribute?
+# Some way to whitelist object, iframe, etc (e.g., if you want to
+# allow *just* embedded YouTube movies)
+# Log what was deleted and why?
+# style="behavior: ..." might be bad in IE?
+# Should we have something for just <meta http-equiv>?  That's the worst of the
+# metas.
+# UTF-7 detections?  Example:
+#     <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-
+# you don't always have to have the charset set, if the page has no charset
+# and there's UTF7-like code in it.
+# Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php
+
+
+# This is an IE-specific construct you can have in a stylesheet to
+# run some Javascript:
+_css_javascript_re = re.compile(
+    r'expression\s*\(.*?\)', re.S|re.I)
+
+# Do I have to worry about @\nimport?
+_css_import_re = re.compile(
+    r'@\s*import', re.I)
+
+# All kinds of schemes besides just javascript: that can cause
+# execution:
+_is_javascript_scheme = re.compile(
+    r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):',
+    re.I).search
+_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub
+# FIXME: should data: be blocked?
+
+# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
+_conditional_comment_re = re.compile(
+    r'<!--\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)
+
+_find_styled_elements = etree.XPath(
+    "descendant-or-self::*[@style]")
+
+_find_external_links = etree.XPath(
+    ("descendant-or-self::a  [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
+     "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
+    namespaces={'x':XHTML_NAMESPACE})
+
+class Cleaner(object):
+    """
+    Instances cleans the document of each of the possible offending
+    elements.  The cleaning is controlled by attributes; you can
+    override attributes in a subclass, or set them in the constructor.
+
+    ``scripts``:
+        Removes any ``