From 9e0184fff0c880720d0f61415f67e16bb8165fe1 Mon Sep 17 00:00:00 2001
From: turly221
Date: Fri, 29 Nov 2024 05:11:14 +0000
Subject: [PATCH 1/4] commit patch 8733371

---
 src/lxml/html/clean.py             |   5 +-
 src/lxml/html/clean.py.orig        | 725 +++++++++++++++++++++++++++++
 src/lxml/html/tests/test_clean.txt |   6 +-
 3 files changed, 731 insertions(+), 5 deletions(-)
 create mode 100644 src/lxml/html/clean.py.orig

diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py
index 8ce4f9bee..e38768733 100644
--- a/src/lxml/html/clean.py
+++ b/src/lxml/html/clean.py
@@ -8,9 +8,10 @@
 import copy
 try:
     from urlparse import urlsplit
+    from urllib import unquote_plus
 except ImportError:
     # Python 3
-    from urllib.parse import urlsplit
+    from urllib.parse import urlsplit, unquote_plus
 from lxml import etree
 from lxml.html import defs
 from lxml.html import fromstring, tostring, XHTML_NAMESPACE
@@ -466,7 +467,7 @@ def _kill_elements(self, doc, condition, iterate=None):
 
     def _remove_javascript_link(self, link):
         # links like "j a v a s c r i p t:" might be interpreted in IE
-        new = _substitute_whitespace('', link)
+        new = _substitute_whitespace('', unquote_plus(link))
         if _is_javascript_scheme(new):
             # FIXME: should this be None to delete?
             return ''
diff --git a/src/lxml/html/clean.py.orig b/src/lxml/html/clean.py.orig
new file mode 100644
index 000000000..8ce4f9bee
--- /dev/null
+++ b/src/lxml/html/clean.py.orig
@@ -0,0 +1,725 @@
+"""A cleanup tool for HTML.
+
+Removes unwanted tags and content.  See the `Cleaner` class for
+details.
+"""
+
+import re
+import copy
+try:
+    from urlparse import urlsplit
+except ImportError:
+    # Python 3
+    from urllib.parse import urlsplit
+from lxml import etree
+from lxml.html import defs
+from lxml.html import fromstring, tostring, XHTML_NAMESPACE
+from lxml.html import xhtml_to_html, _transform_result
+
+try:
+    unichr
+except NameError:
+    # Python 3
+    unichr = chr
+try:
+    unicode
+except NameError:
+    # Python 3
+    unicode = str
+try:
+    bytes
+except NameError:
+    # Python < 2.6
+    bytes = str
+try:
+    basestring
+except NameError:
+    basestring = (str, bytes)
+
+
+__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
+           'word_break', 'word_break_html']
+
+# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl
+# Particularly the CSS cleaning; most of the tag cleaning is integrated now
+# I have multiple kinds of schemes searched; but should schemes be
+# whitelisted instead?
+# max height?
+# remove images?  Also in CSS?  background attribute?
+# Some way to whitelist object, iframe, etc (e.g., if you want to
+# allow *just* embedded YouTube movies)
+# Log what was deleted and why?
+# style="behavior: ..." might be bad in IE?
+# Should we have something for just <meta http-equiv>?  That's the worst of the
+# metas.
+# UTF-7 detections?  Example:
+#     <HEAD><META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=UTF-7"> </HEAD>+ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4-
+# you don't always have to have the charset set, if the page has no charset
+# and there's UTF7-like code in it.
+# Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php
+
+
+# This is an IE-specific construct you can have in a stylesheet to
+# run some Javascript:
+_css_javascript_re = re.compile(
+    r'expression\s*\(.*?\)', re.S|re.I)
+
+# Do I have to worry about @\nimport?
+_css_import_re = re.compile(
+    r'@\s*import', re.I)
+
+# All kinds of schemes besides just javascript: that can cause
+# execution:
+_is_javascript_scheme = re.compile(
+    r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):',
+    re.I).search
+_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub
+# FIXME: should data: be blocked?
+
+# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx
+_conditional_comment_re = re.compile(
+    r'<!--\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)
+
+_find_styled_elements = etree.XPath(
+    "descendant-or-self::*[@style]")
+
+_find_external_links = etree.XPath(
+    ("descendant-or-self::a  [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
+     "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"),
+    namespaces={'x':XHTML_NAMESPACE})
+
+class Cleaner(object):
+    """
+    Instances cleans the document of each of the possible offending
+    elements.  The cleaning is controlled by attributes; you can
+    override attributes in a subclass, or set them in the constructor.
+
+    ``scripts``:
+        Removes any ``