Skip to content

Commit

Permalink
EDEV-91: Move from bleach to nh3 (#1574)
Browse files Browse the repository at this point in the history
  • Loading branch information
jamesbiggs authored Mar 21, 2024
1 parent 7f928d6 commit 309d251
Show file tree
Hide file tree
Showing 6 changed files with 201 additions and 180 deletions.
27 changes: 21 additions & 6 deletions etna/ciim/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from django.urls import NoReverseMatch, reverse
from django.utils.safestring import mark_safe

import bleach
import nh3

from pyquery import PyQuery as pq

Expand Down Expand Up @@ -258,8 +258,23 @@ def format_link(link_html: str) -> Dict[str, str]:
return {"href": href, "id": id, "text": document.text()}


def strip_html(value: str, preserve_marks=False):
    """
    Clean ``value`` with bleach and return the result marked as safe.

    The allow-list passed to ``bleach.clean`` is empty unless
    ``preserve_marks`` is true, in which case it contains only "mark".
    ``strip=True`` is always passed to ``bleach.clean``.
    """
    allowed_tags = ["mark"] if preserve_marks else []
    cleaned = bleach.clean(value, tags=allowed_tags, strip=True)
    return mark_safe(cleaned)
def strip_html(value: str, *, preserve_marks, ensure_spaces):
    """
    Temporary HTML sanitiser that removes unwanted tags from data.

    K-int will eventually sanitise this at API level.

    Both options are keyword-only and required:

    preserve_marks: when true, "mark" is added to the set of tags that
        nh3.clean is allowed to keep; otherwise it is not in that set.
    ensure_spaces: when true, <span> and <p> tags are kept by nh3.clean
        and then rewritten here: every opening tag becomes a single
        space and every closing tag is dropped. This prevents "bunched"
        data, e.g. "This is a<span>test</span>example" coming back as
        "This is atestexample" with no space before "test".

    Leading whitespace is stripped from the result before it is marked
    as safe.
    """
    if ensure_spaces:
        spaced_tags = {"span", "p"}
    else:
        spaced_tags = set()

    if preserve_marks:
        allowed_tags = {*spaced_tags, "mark"}
    else:
        allowed_tags = spaced_tags

    result = nh3.clean(value, tags=allowed_tags)

    for tag in spaced_tags:
        # Opening tag (with any attributes) -> one space, then drop the
        # matching closing tag entirely.
        with_space = re.sub(rf"<{tag}[^>]*>", " ", result)
        result = re.sub(rf"</{tag}>", "", with_space)

    return mark_safe(result.lstrip())
4 changes: 2 additions & 2 deletions etna/feedback/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from wagtail.models import Page, Revision, Site
from wagtail.models.sites import get_site_for_hostname

import bleach
import nh3

from etna.feedback import constants
from etna.feedback.models import FeedbackSubmission
Expand Down Expand Up @@ -168,7 +168,7 @@ def clean_comment(self) -> str:
value = self.cleaned_data.get("comment")
if value is None:
return ""
return bleach.clean(value, strip=True)
return nh3.clean(value)

def clean(self) -> str:
# Validate the 'signature' value against the 'id' value
Expand Down
4 changes: 2 additions & 2 deletions etna/records/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,7 @@ def has_source_url(self) -> bool:
@cached_property
def summary_title(self) -> str:
if raw := self._get_raw_summary_title():
return mark_safe(strip_html(raw, preserve_marks=True))
return mark_safe(strip_html(raw, preserve_marks=True, ensure_spaces=True))
return raw

def _get_raw_summary_title(self) -> str:
Expand Down Expand Up @@ -267,7 +267,7 @@ def listing_description(self) -> str:
will be left intact, but any other HTML is stripped.
"""
if raw := self._get_raw_description(use_highlights=True):
return mark_safe(strip_html(raw, preserve_marks=True))
return mark_safe(strip_html(raw, preserve_marks=True, ensure_spaces=True))
return ""

def _get_raw_description(self, use_highlights: bool = False) -> str:
Expand Down
6 changes: 3 additions & 3 deletions etna/records/tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,11 +191,11 @@ def test_listing_description(self):
self.assertEqual(
self.record.listing_description,
(
"\nThis series contains papers concering a wide variety of legal matters referred "
"This series contains papers concering a wide variety of legal matters referred "
"to the Law Officers for their advice or approval and includes applications for the "
"Attorney General's General Fiat for leave to appeal to the House of Lords in criminal "
"cases."
"\nAlso included are a number of opinions, more of which can be found in LO 3"
"cases. "
"Also included are a number of opinions, more of which can be found in LO 3"
),
)

Expand Down
Loading

0 comments on commit 309d251

Please sign in to comment.