fix: Avoid crashes in urlize_ietf_docs (#4161)

* fix: Don't crash when urlreverse fails as part of urlize_ietf_docs

Also fix an HTMLization nit.

* Fix more corner cases found during test-crawl

* Handle "I-D.*" reference-style matches

* Refactor use of bleach. Better Markdown linkification and formatting.

* Address review comment from @rjsparks
This commit is contained in:
Lars Eggert 2022-07-07 20:27:30 +03:00 committed by GitHub
parent 64272b5d9f
commit fd087d4e16
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 55 additions and 35 deletions

View file

@ -17,6 +17,7 @@ from django.utils.encoding import force_str # pyflakes:ignore force_str is used
from django.urls import reverse as urlreverse
from django.core.cache import cache
from django.core.exceptions import ValidationError
from django.urls import NoReverseMatch
import debug # pyflakes:ignore
@ -181,6 +182,8 @@ def link_charter_doc_match(match):
def link_non_charter_doc_match(match):
name = match[0]
# handle "I-D.*" reference-style matches
name = re.sub(r"^i-d\.(.*)", r"draft-\1", name, flags=re.IGNORECASE)
cname = doc_canonical_name(name)
if not cname:
return match[0]
@ -200,10 +203,13 @@ def link_non_charter_doc_match(match):
if not cname:
return match[0]
if name == cname:
url = urlreverse(
"ietf.doc.views_doc.document_main",
kwargs=dict(name=cname, rev=rev_split.group(2)),
)
try:
url = urlreverse(
"ietf.doc.views_doc.document_main",
kwargs=dict(name=cname, rev=rev_split.group(2)),
)
except NoReverseMatch:
return match[0]
return f'<a href="{url}">{match[0]}</a>'
# if we get here, we can't linkify
@ -230,19 +236,19 @@ def urlize_ietf_docs(string, autoescape=None):
else:
string = mark_safe(string)
string = re.sub(
r"\b(?<![/\-:=#])(charter-(?:[\d\w\.+]+-)*)(\d{2}(?:-\d{2}))(\.(?:txt|ps|pdf|html))?\b",
r"\b(?<![/\-:=#\"\'])(charter-(?:[\d\w\.+]+-)*)(\d{2}(?:-\d{2}))(\.(?:txt|ps|pdf|html))?\b",
link_charter_doc_match,
string,
flags=re.IGNORECASE | re.ASCII,
)
string = re.sub(
r"\b(?<![/\-:=#])((?:draft-|bofreq-|conflict-review-|status-change-)[\d\w\.+-]+(?![-@]))",
r"\b(?<![/\-:=#\"\'])((?:draft-|i-d\.|bofreq-|conflict-review-|status-change-)[\d\w\.+-]+(?![-@]))",
link_non_charter_doc_match,
string,
flags=re.IGNORECASE | re.ASCII,
)
string = re.sub(
r"\b(?<![/\-:=#])((RFC|BCP|STD|FYI)\s*0*(\d+))\b",
r"\b(?<![/\-:=#\"\'])((RFC|BCP|STD|FYI)\s*0*(\d+))\b",
link_other_doc_match,
string,
flags=re.IGNORECASE | re.ASCII,

View file

@ -89,12 +89,16 @@ class IetfFiltersTests(TestCase):
f'New version available: <b><a href="/doc/{charter.name}/01-00/">{charter.name}-01-00.txt</a></b>',
),
(
"repository https://github.com/tlswg/draft-ietf-tls-ticketrequest",
"repository https://github.com/tlswg/draft-ietf-tls-ticketrequest",
f"repository https://github.com/tlswg/{id.name}",
f"repository https://github.com/tlswg/{id.name}",
),
(
'<a href="mailto:draft-ietf-some-names@ietf.org">draft-ietf-some-names@ietf.org</a>',
'<a href="mailto:draft-ietf-some-names@ietf.org">draft-ietf-some-names@ietf.org</a>',
f'<a href="mailto:{id.name}@ietf.org">{id.name}@ietf.org</a>',
f'<a href="mailto:{id.name}@ietf.org">{id.name}@ietf.org</a>',
),
(
f"{id.name}@ietf.org",
f"{id.name}@ietf.org",
),
(
"http://ieee802.org/1/files/public/docs2015/cn-thaler-Qcn-draft-PAR.pdf",
@ -143,5 +147,5 @@ class IetfFiltersTests(TestCase):
]
for input, output in cases:
#debug.show("(urlize_ietf_docs(input),output)")
# debug.show("(input, urlize_ietf_docs(input), output)")
self.assertEqual(urlize_ietf_docs(input), output)

View file

@ -155,7 +155,7 @@
{{ doc.name }}-{{ doc.rev }}
</div>
<div class="card-body">
{{ content|urlize_ietf_docs|linkify }}
{{ content }}
</div>
</div>
{% endblock %}
@ -164,4 +164,4 @@
</script>
<script src="{% static 'ietf/js/document_timeline.js' %}">
</script>
{% endblock %}
{% endblock %}

View file

@ -142,4 +142,4 @@
{% block js %}
<script src="{% static 'ietf/js/d3.js' %}"></script>
<script src="{% static 'ietf/js/document_timeline.js' %}"></script>
{% endblock %}
{% endblock %}

View file

@ -17,14 +17,7 @@ from django import forms
from django.utils.functional import keep_lazy
from ietf.utils.mime import get_mime_type
acceptable_tags = ('a', 'abbr', 'acronym', 'address', 'b', 'big',
'blockquote', 'body', 'br', 'caption', 'center', 'cite', 'code', 'col',
'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'font',
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'hr', 'html', 'i', 'ins', 'kbd',
'li', 'ol', 'p', 'pre', 'q', 's', 'samp', 'small', 'span', 'strike', 'style',
'strong', 'sub', 'sup', 'table', 'title', 'tbody', 'td', 'tfoot', 'th', 'thead',
'tr', 'tt', 'u', 'ul', 'var')
from ietf.utils.text import bleach_cleaner, tags as acceptable_tags
acceptable_protocols = ['http', 'https', 'mailto', 'xmpp', ]
@ -46,8 +39,6 @@ def remove_tags(html, tags):
# ----------------------------------------------------------------------
# Html fragment cleaning
bleach_cleaner = bleach.sanitizer.Cleaner(tags=acceptable_tags, protocols=acceptable_protocols, strip=True)
def sanitize_fragment(html):
return bleach_cleaner.clean(html)
@ -95,4 +86,4 @@ def clean_text_field(text):
else:
raise forms.ValidationError("Unexpected text field mime type: %s" % mime_type)
return text

View file

@ -5,16 +5,24 @@
Use this instead of importing markdown directly to guarantee consistent extensions / options throughout
the datatracker.
"""
import bleach
import markdown as python_markdown
from django.utils.safestring import mark_safe
from markdown.extensions.extra import ExtraExtension
ALLOWED_TAGS = bleach.ALLOWED_TAGS + ['p', 'h1', 'h2', 'h3', 'h4', 'br']
from ietf.doc.templatetags.ietf_filters import urlize_ietf_docs
from ietf.utils.text import bleach_cleaner, bleach_linker
def markdown(text):
return mark_safe(bleach.clean(
python_markdown.markdown(text, extensions=[ExtraExtension()]),
tags=ALLOWED_TAGS,
))
return mark_safe(
bleach_linker.linkify(
urlize_ietf_docs(
bleach_cleaner.clean(
python_markdown.markdown(
text, extensions=[ExtraExtension(), "nl2br"]
)
)
)
)
)

View file

@ -22,10 +22,13 @@ from .texescape import init as texescape_init, tex_escape_map
tlds_sorted = sorted(tlds.tld_set, key=len, reverse=True)
protocols = copy.copy(bleach.sanitizer.ALLOWED_PROTOCOLS)
protocols.append("ftp") # we still have some ftp links
protocols.append("xmpp") # we still have some xmpp links
validate_url = URLValidator()
def check_url_validity(attrs, new=False):
if (None, 'href') not in attrs:
return None
url = attrs[(None, 'href')]
try:
if url.startswith("http"):
@ -42,9 +45,17 @@ bleach_linker = bleach.Linker(
parse_email=True
)
tags = copy.copy(bleach.sanitizer.ALLOWED_TAGS)
tags.remove("a")
bleach_cleaner = bleach.sanitizer.Cleaner(tags=tags, protocols=protocols)
tags = (
'a', 'abbr', 'acronym', 'address', 'b', 'big',
'blockquote', 'body', 'br', 'caption', 'center', 'cite', 'code', 'col',
'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'font',
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'hr', 'html', 'i', 'ins', 'kbd',
'li', 'ol', 'p', 'pre', 'q', 's', 'samp', 'small', 'span', 'strike', 'style',
'strong', 'sub', 'sup', 'table', 'title', 'tbody', 'td', 'tfoot', 'th', 'thead',
'tr', 'tt', 'u', 'ul', 'var'
)
bleach_cleaner = bleach.sanitizer.Cleaner(tags=tags, protocols=protocols, strip=True)
@keep_lazy(str)