diff --git a/ietf/doc/templatetags/ietf_filters.py b/ietf/doc/templatetags/ietf_filters.py
index 5a8afd995..e1a80a26b 100644
--- a/ietf/doc/templatetags/ietf_filters.py
+++ b/ietf/doc/templatetags/ietf_filters.py
@@ -26,10 +26,11 @@ import debug # pyflakes:ignore
from ietf.doc.models import BallotDocEvent, Document
from ietf.doc.models import ConsensusDocEvent
from ietf.ietfauth.utils import can_request_rfc_publication as utils_can_request_rfc_publication
-from ietf.utils.html import sanitize_fragment
from ietf.utils import log
from ietf.doc.utils import prettify_std_name
-from ietf.utils.text import wordwrap, fill, wrap_text_if_unwrapped, bleach_linker, bleach_cleaner, validate_url
+from ietf.utils.html import clean_html
+from ietf.utils.text import wordwrap, fill, wrap_text_if_unwrapped, linkify
+from ietf.utils.validators import validate_url
register = template.Library()
@@ -98,7 +99,7 @@ def sanitize(value):
attributes to those deemed acceptable. See ietf/utils/html.py
for the details.
"""
- return mark_safe(sanitize_fragment(value))
+ return mark_safe(clean_html(value))
# For use with ballot view
@@ -446,16 +447,16 @@ def ad_area(user):
@register.filter
def format_history_text(text, trunc_words=25):
"""Run history text through some cleaning and add ellipsis if it's too long."""
- full = mark_safe(bleach_cleaner.clean(text))
- full = bleach_linker.linkify(urlize_ietf_docs(full))
+ full = mark_safe(clean_html(text))
+ full = linkify(urlize_ietf_docs(full))
return format_snippet(full, trunc_words)
@register.filter
def format_snippet(text, trunc_words=25):
# urlize if there aren't already links present
- text = bleach_linker.linkify(text)
- full = keep_spacing(collapsebr(linebreaksbr(mark_safe(sanitize_fragment(text)))))
+ text = linkify(text)
+ full = keep_spacing(collapsebr(linebreaksbr(mark_safe(clean_html(text)))))
snippet = truncatewords_html(full, trunc_words)
if snippet != full:
return mark_safe('
%s
%s
' % (snippet, full))
diff --git a/ietf/meeting/tests_views.py b/ietf/meeting/tests_views.py
index 0647da52a..b52f8de9b 100644
--- a/ietf/meeting/tests_views.py
+++ b/ietf/meeting/tests_views.py
@@ -6423,8 +6423,7 @@ class MaterialsTests(TestCase):
text = doc.text()
self.assertIn('Some text', text)
self.assertNotIn('', text)
- self.assertIn('charset="utf-8"', text)
-
+
# txt upload
test_file = BytesIO(b'This is some text for a test, with the word\nvirtual at the beginning of a line.')
test_file.name = "some.txt"
diff --git a/ietf/meeting/utils.py b/ietf/meeting/utils.py
index cfe7adfae..4f800980c 100644
--- a/ietf/meeting/utils.py
+++ b/ietf/meeting/utils.py
@@ -30,7 +30,7 @@ from ietf.group.utils import can_manage_materials
from ietf.name.models import SessionStatusName, ConstraintName, DocTypeName
from ietf.person.models import Person
from ietf.stats.models import MeetingRegistration
-from ietf.utils.html import sanitize_document
+from ietf.utils.html import clean_html
from ietf.utils.log import log
from ietf.utils.timezone import date_today
@@ -773,8 +773,8 @@ def handle_upload_file(file, filename, meeting, subdir, request=None, encoding=N
return "Failure trying to save '%s'. Hint: Try to upload as UTF-8: %s..." % (filename, str(e)[:120])
# Whole file sanitization; add back what's missing from a complete
# document (sanitize will remove these).
- clean = sanitize_document(text)
- destination.write(clean.encode('utf8'))
+ clean = clean_html(text)
+ destination.write(clean.encode("utf8"))
if request and clean != text:
messages.warning(request,
(
diff --git a/ietf/utils/html.py b/ietf/utils/html.py
index 9d0cd7c84..3f3efe2f3 100644
--- a/ietf/utils/html.py
+++ b/ietf/utils/html.py
@@ -5,11 +5,7 @@
import bleach
-import copy
import html2text
-import lxml.etree
-import lxml.html
-import lxml.html.clean
import debug # pyflakes:ignore
@@ -17,62 +13,66 @@ from django import forms
from django.utils.functional import keep_lazy
from ietf.utils.mime import get_mime_type
-from ietf.utils.text import bleach_cleaner, tags as acceptable_tags
-acceptable_protocols = ['http', 'https', 'mailto', 'xmpp', ]
-def unescape(text):
- """
- Returns the given text with ampersands, quotes and angle brackets decoded
- for use in URLs.
+# Allow the protocols/tags/attributes we specifically want, plus anything that bleach declares
+# to be safe. As of 2025-01-27, the explicit lists for protocols and tags are a strict superset
+# of bleach's defaults.
+acceptable_protocols = bleach.sanitizer.ALLOWED_PROTOCOLS.union(
+ {"http", "https", "mailto", "ftp", "xmpp"}
+)
+acceptable_tags = bleach.sanitizer.ALLOWED_TAGS.union(
+ {
+ # fmt: off
+ "a", "abbr", "acronym", "address", "b", "big",
+ "blockquote", "body", "br", "caption", "center", "cite", "code", "col",
+ "colgroup", "dd", "del", "dfn", "dir", "div", "dl", "dt", "em", "font",
+ "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "i", "ins", "kbd",
+ "li", "ol", "p", "pre", "q", "s", "samp", "small", "span", "strike", "style",
+ "strong", "sub", "sup", "table", "title", "tbody", "td", "tfoot", "th", "thead",
+ "tr", "tt", "u", "ul", "var"
+ # fmt: on
+ }
+)
+# On top of bleach's default attribute map, allow "id" on every tag (the "*" key)
+# and "start" on <ol>. Keys given here replace same-named keys in bleach's defaults.
+acceptable_attributes = bleach.sanitizer.ALLOWED_ATTRIBUTES | {
+ "*": ["id"],
+ "ol": ["start"],
+}
+
+
+# Instantiate sanitizer classes
+# strip=True makes bleach remove disallowed markup instead of escaping it.
+_bleach_cleaner = bleach.sanitizer.Cleaner(
+ tags=acceptable_tags,
+ attributes=acceptable_attributes,
+ protocols=acceptable_protocols,
+ strip=True,
+)
+
+
+# Same protocols and strip setting as _bleach_cleaner, but additionally permits the
+# <img>, <figure> and <figcaption> tags and the "src" and "alt" attributes on <img>.
+_liberal_bleach_cleaner = bleach.sanitizer.Cleaner(
+ tags=acceptable_tags.union({"img", "figure", "figcaption"}),
+ attributes=acceptable_attributes | {"img": ["src", "alt"]},
+ protocols=acceptable_protocols,
+ strip=True,
+)
+
+
+def clean_html(text: str):
+ """Clean the HTML in a string
+
+ Runs `text` through the module-level `_bleach_cleaner` (configured with
+ `acceptable_tags`, `acceptable_attributes` and `acceptable_protocols`, with
+ `strip=True`) and returns the cleaner's result.
+ """
+ return _bleach_cleaner.clean(text)
+
+
+def liberal_clean_html(text: str):
+ """More permissively clean the HTML in a string
+
+ Runs `text` through the module-level `_liberal_bleach_cleaner`, which accepts
+ everything `clean_html` does plus the img, figure and figcaption tags (and the
+ src and alt attributes on img), and returns the cleaner's result.
+ """
+ return _liberal_bleach_cleaner.clean(text)
- This function undoes what django.utils.html.escape() does
- """
- return text.replace('&amp;', '&').replace('&#39;', "'").replace('&quot;', '"').replace('&gt;', '>').replace('&lt;', '<' )
@keep_lazy(str)
def remove_tags(html, tags):
"""Returns the given HTML sanitized, and with the given tags removed."""
- allowed = set(acceptable_tags) - set([ t.lower() for t in tags ])
+ allowed = acceptable_tags - set(t.lower() for t in tags)
return bleach.clean(html, tags=allowed, strip=True)
-# ----------------------------------------------------------------------
-# Html fragment cleaning
-
-def sanitize_fragment(html):
- return bleach_cleaner.clean(html)
-
-# ----------------------------------------------------------------------
-# Page cleaning
-
-
-class Cleaner(lxml.html.clean.Cleaner):
- charset = 'utf-8'
- def __init__(self, charset='utf-8', **kw):
- self.charset = charset
- super(Cleaner, self).__init__(**kw)
-
- # Copied from lxml 4.2.0 and modified to insert charset meta:
- def clean_html(self, html):
- result_type = type(html)
- if isinstance(html, (str, bytes)):
- doc = lxml.html.fromstring(html)
- else:
- doc = copy.deepcopy(html)
- self(doc)
- head = doc.find('head')
- if head != None:
- meta = lxml.etree.Element('meta', charset=self.charset)
- meta.tail = '\n'
- head.insert(0, meta)
- return lxml.html._transform_result(result_type, doc)
-
-# We will be saving as utf-8 later, so set that in the meta tag.
-lxml_cleaner = Cleaner(allow_tags=acceptable_tags, remove_unknown_tags=None, style=False, page_structure=False, charset='utf-8')
-
-def sanitize_document(html):
- return lxml_cleaner.clean_html(html)
-
# ----------------------------------------------------------------------
# Text field cleaning
@@ -86,4 +86,15 @@ def clean_text_field(text):
else:
raise forms.ValidationError("Unexpected text field mime type: %s" % mime_type)
return text
-
+
+
+def unescape(text):
+    """
+    Returns the given text with ampersands, quotes and angle brackets decoded
+    for use in URLs.
+
+    This function undoes what django.utils.html.escape() does: it decodes the
+    five characters that escape() encodes (``&amp;``, ``&lt;``, ``&gt;``,
+    ``&quot;`` and the single quote, spelled either ``&#x27;`` or the older
+    ``&#39;``) and leaves every other entity untouched.
+
+    :param text: string containing HTML-escaped text
+    :return: the string with those entities replaced by the literal characters
+    """
+    return (
+        text.replace("&#39;", "'")
+        .replace("&#x27;", "'")
+        .replace("&quot;", '"')
+        .replace("&gt;", ">")
+        .replace("&lt;", "<")
+        # Decode "&amp;" last so every entity is decoded exactly once; decoding
+        # it first would turn "&amp;lt;" into "&lt;" and then into "<".
+        .replace("&amp;", "&")
+    )
+
+
diff --git a/ietf/utils/markdown.py b/ietf/utils/markdown.py
index 446d34895..0b522685b 100644
--- a/ietf/utils/markdown.py
+++ b/ietf/utils/markdown.py
@@ -12,13 +12,15 @@ from markdown.postprocessors import Postprocessor
from django.utils.safestring import mark_safe
from ietf.doc.templatetags.ietf_filters import urlize_ietf_docs
-from ietf.utils.text import bleach_cleaner, liberal_bleach_cleaner, bleach_linker
+from .html import clean_html, liberal_clean_html
+from .text import linkify
+
class LinkifyExtension(Extension):
"""
Simple Markdown extension inspired by https://github.com/daGrevis/mdx_linkify,
- but using our bleach_linker directly. Doing the linkification on the converted
+ but using our own linker directly. Doing the linkification on the converted
Markdown output introduces artifacts.
"""
@@ -31,12 +33,12 @@ class LinkifyExtension(Extension):
class LinkifyPostprocessor(Postprocessor):
def run(self, text):
- return urlize_ietf_docs(bleach_linker.linkify(text))
+ return urlize_ietf_docs(linkify(text))
def markdown(text):
return mark_safe(
- bleach_cleaner.clean(
+ clean_html(
python_markdown.markdown(
text,
extensions=[
@@ -52,7 +54,7 @@ def markdown(text):
def liberal_markdown(text):
return mark_safe(
- liberal_bleach_cleaner.clean(
+ liberal_clean_html(
python_markdown.markdown(
text,
extensions=[
diff --git a/ietf/utils/templatetags/textfilters.py b/ietf/utils/templatetags/textfilters.py
index 70b94cf67..3b240740e 100644
--- a/ietf/utils/templatetags/textfilters.py
+++ b/ietf/utils/templatetags/textfilters.py
@@ -11,7 +11,7 @@ from django.utils.safestring import mark_safe
import debug # pyflakes:ignore
-from ietf.utils.text import xslugify as _xslugify, texescape, bleach_linker
+from ietf.utils.text import linkify as _linkify, xslugify as _xslugify, texescape
register = template.Library()
@@ -74,7 +74,7 @@ def texescape_filter(value):
@register.filter
@stringfilter
def linkify(value):
- text = mark_safe(bleach_linker.linkify(value))
+ text = mark_safe(_linkify(value))
return text
@register.filter
diff --git a/ietf/utils/text.py b/ietf/utils/text.py
index 2fba113d0..4e5d5b6cd 100644
--- a/ietf/utils/text.py
+++ b/ietf/utils/text.py
@@ -1,17 +1,15 @@
# Copyright The IETF Trust 2016-2020, All Rights Reserved
# -*- coding: utf-8 -*-
-
-import bleach # type: ignore
-import copy
+import bleach
import email
import re
import textwrap
import tlds
import unicodedata
-from django.core.validators import URLValidator
from django.core.exceptions import ValidationError
+from django.core.validators import URLValidator
from django.utils.functional import keep_lazy
from django.utils.safestring import mark_safe
@@ -19,66 +17,52 @@ import debug # pyflakes:ignore
from .texescape import init as texescape_init, tex_escape_map
-tlds_sorted = sorted(tlds.tld_set, key=len, reverse=True)
-protocols = set(bleach.sanitizer.ALLOWED_PROTOCOLS)
-protocols.add("ftp") # we still have some ftp links
-protocols.add("xmpp") # we still have some xmpp links
+# Sort in reverse so a TLD that is a prefix of another is considered later - e.g., so ".co" comes after ".com".
+tlds_sorted = sorted(tlds.tld_set, reverse=True)
-tags = set(bleach.sanitizer.ALLOWED_TAGS).union(
- {
- # fmt: off
- 'a', 'abbr', 'acronym', 'address', 'b', 'big',
- 'blockquote', 'body', 'br', 'caption', 'center', 'cite', 'code', 'col',
- 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'font',
- 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'hr', 'html', 'i', 'ins', 'kbd',
- 'li', 'ol', 'p', 'pre', 'q', 's', 'samp', 'small', 'span', 'strike', 'style',
- 'strong', 'sub', 'sup', 'table', 'title', 'tbody', 'td', 'tfoot', 'th', 'thead',
- 'tr', 'tt', 'u', 'ul', 'var'
- # fmt: on
- }
-)
+# Protocols we're interested in auto-linking. See also ietf.utils.html.acceptable_protocols,
+# which lists the protocols we allow people to include explicitly in sanitized html.
+linkable_protocols = ["http", "https", "mailto", "ftp", "xmpp"]
-attributes = copy.copy(bleach.sanitizer.ALLOWED_ATTRIBUTES)
-attributes["*"] = ["id"]
-attributes["ol"] = ["start"]
-bleach_cleaner = bleach.sanitizer.Cleaner(
- tags=tags, attributes=attributes, protocols=protocols, strip=True
-)
-
-liberal_tags = copy.copy(tags)
-liberal_attributes = copy.copy(attributes)
-liberal_tags.update(["img","figure","figcaption"])
-liberal_attributes["img"] = ["src","alt"]
-
-liberal_bleach_cleaner = bleach.sanitizer.Cleaner(
- tags=liberal_tags, attributes=liberal_attributes, protocols=protocols, strip=True
-)
-
-validate_url = URLValidator()
+_validate_url = URLValidator()
def check_url_validity(attrs, new=False):
+ """Callback for bleach linkify
+
+ :param attrs: dict of attributes of the tag
+ :param new: boolean - True if the link is new; False if it was found in text
+ :return: dict of attributes for the link, or None to block link creation
+
+ Attribute names are namespaced, so they normally look like `(None, "SomeAttribute")`.
+ The keys of the `attrs` argument take this form too, so `attrs[(None, "href")]`
+ would be the value of the href attribute.
+
+ Tags without an href are returned unchanged. An href that starts with "http"
+ is checked with Django's URLValidator and None is returned if it is invalid;
+ any other href is accepted as-is.
+ """
if (None, "href") not in attrs:
# rfc2html creates a tags without href
return attrs
url = attrs[(None, "href")]
try:
if url.startswith("http"):
- validate_url(url)
+ _validate_url(url)
except ValidationError:
return None
return attrs
-bleach_linker = bleach.Linker(
+_bleach_linker = bleach.Linker(
callbacks=[check_url_validity],
- url_re=bleach.linkifier.build_url_re(tlds=tlds_sorted, protocols=protocols),
+ url_re=bleach.linkifier.build_url_re(tlds=tlds_sorted, protocols=linkable_protocols),
email_re=bleach.linkifier.build_email_re(tlds=tlds_sorted), # type: ignore
parse_email=True,
)
+def linkify(text):
+ """Add links to URLs and email addresses found in a string
+
+ Passes `text` to the module-level bleach Linker and returns its result. That
+ linker is built with `parse_email=True`, URL and email patterns based on
+ `tlds_sorted` (URLs limited to `linkable_protocols`), and `check_url_validity`
+ as its only callback.
+ """
+ return _bleach_linker.linkify(text)
+
+
@keep_lazy(str)
def xslugify(value):
"""
diff --git a/requirements.txt b/requirements.txt
index ae8b06fae..073a6bfa0 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -43,8 +43,7 @@ jsonfield>=3.1.0 # for SubmissionCheck. This is https://github.com/bradjaspe
jsonschema[format]>=4.2.1
jwcrypto>=1.2 # for signed notifications - this is aspirational, and is not really used.
logging_tree>=1.9 # Used only by the showloggers management command
-lxml>=5.3.0 # lxml[html_clean] fails on some architectures
-lxml_html_clean>=0.4.1
+lxml>=5.3.0
markdown>=3.3.6
types-markdown>=3.3.6
mock>=4.0.3 # Used only by tests, of course