fix: Avoid crashes in urlize_ietf_docs
(#4161)
* fix: Don't crash when urlreverse fails as part of urlize_ietf_docs. Also fix an HTMLization nit.
* Fix more corner cases found during test-crawl.
* Handle "I-D.*" reference-style matches.
* Refactor use of bleach. Better Markdown linkification and formatting.
* Address review comment from @rjsparks.
This commit is contained in:
parent
64272b5d9f
commit
fd087d4e16
|
@ -17,6 +17,7 @@ from django.utils.encoding import force_str # pyflakes:ignore force_str is used
|
|||
from django.urls import reverse as urlreverse
|
||||
from django.core.cache import cache
|
||||
from django.core.exceptions import ValidationError
|
||||
from django.urls import NoReverseMatch
|
||||
|
||||
import debug # pyflakes:ignore
|
||||
|
||||
|
@ -181,6 +182,8 @@ def link_charter_doc_match(match):
|
|||
|
||||
def link_non_charter_doc_match(match):
|
||||
name = match[0]
|
||||
# handle "I-D.*" reference-style matches
|
||||
name = re.sub(r"^i-d\.(.*)", r"draft-\1", name, flags=re.IGNORECASE)
|
||||
cname = doc_canonical_name(name)
|
||||
if not cname:
|
||||
return match[0]
|
||||
|
@ -200,10 +203,13 @@ def link_non_charter_doc_match(match):
|
|||
if not cname:
|
||||
return match[0]
|
||||
if name == cname:
|
||||
url = urlreverse(
|
||||
"ietf.doc.views_doc.document_main",
|
||||
kwargs=dict(name=cname, rev=rev_split.group(2)),
|
||||
)
|
||||
try:
|
||||
url = urlreverse(
|
||||
"ietf.doc.views_doc.document_main",
|
||||
kwargs=dict(name=cname, rev=rev_split.group(2)),
|
||||
)
|
||||
except NoReverseMatch:
|
||||
return match[0]
|
||||
return f'<a href="{url}">{match[0]}</a>'
|
||||
|
||||
# if we get here, we can't linkify
|
||||
|
@ -230,19 +236,19 @@ def urlize_ietf_docs(string, autoescape=None):
|
|||
else:
|
||||
string = mark_safe(string)
|
||||
string = re.sub(
|
||||
r"\b(?<![/\-:=#])(charter-(?:[\d\w\.+]+-)*)(\d{2}(?:-\d{2}))(\.(?:txt|ps|pdf|html))?\b",
|
||||
r"\b(?<![/\-:=#\"\'])(charter-(?:[\d\w\.+]+-)*)(\d{2}(?:-\d{2}))(\.(?:txt|ps|pdf|html))?\b",
|
||||
link_charter_doc_match,
|
||||
string,
|
||||
flags=re.IGNORECASE | re.ASCII,
|
||||
)
|
||||
string = re.sub(
|
||||
r"\b(?<![/\-:=#])((?:draft-|bofreq-|conflict-review-|status-change-)[\d\w\.+-]+(?![-@]))",
|
||||
r"\b(?<![/\-:=#\"\'])((?:draft-|i-d\.|bofreq-|conflict-review-|status-change-)[\d\w\.+-]+(?![-@]))",
|
||||
link_non_charter_doc_match,
|
||||
string,
|
||||
flags=re.IGNORECASE | re.ASCII,
|
||||
)
|
||||
string = re.sub(
|
||||
r"\b(?<![/\-:=#])((RFC|BCP|STD|FYI)\s*0*(\d+))\b",
|
||||
r"\b(?<![/\-:=#\"\'])((RFC|BCP|STD|FYI)\s*0*(\d+))\b",
|
||||
link_other_doc_match,
|
||||
string,
|
||||
flags=re.IGNORECASE | re.ASCII,
|
||||
|
|
|
@ -89,12 +89,16 @@ class IetfFiltersTests(TestCase):
|
|||
f'New version available: <b><a href="/doc/{charter.name}/01-00/">{charter.name}-01-00.txt</a></b>',
|
||||
),
|
||||
(
|
||||
"repository https://github.com/tlswg/draft-ietf-tls-ticketrequest",
|
||||
"repository https://github.com/tlswg/draft-ietf-tls-ticketrequest",
|
||||
f"repository https://github.com/tlswg/{id.name}",
|
||||
f"repository https://github.com/tlswg/{id.name}",
|
||||
),
|
||||
(
|
||||
'<a href="mailto:draft-ietf-some-names@ietf.org">draft-ietf-some-names@ietf.org</a>',
|
||||
'<a href="mailto:draft-ietf-some-names@ietf.org">draft-ietf-some-names@ietf.org</a>',
|
||||
f'<a href="mailto:{id.name}@ietf.org">{id.name}@ietf.org</a>',
|
||||
f'<a href="mailto:{id.name}@ietf.org">{id.name}@ietf.org</a>',
|
||||
),
|
||||
(
|
||||
f"{id.name}@ietf.org",
|
||||
f"{id.name}@ietf.org",
|
||||
),
|
||||
(
|
||||
"http://ieee802.org/1/files/public/docs2015/cn-thaler-Qcn-draft-PAR.pdf",
|
||||
|
@ -143,5 +147,5 @@ class IetfFiltersTests(TestCase):
|
|||
]
|
||||
|
||||
for input, output in cases:
|
||||
#debug.show("(urlize_ietf_docs(input),output)")
|
||||
# debug.show("(input, urlize_ietf_docs(input), output)")
|
||||
self.assertEqual(urlize_ietf_docs(input), output)
|
||||
|
|
|
@ -155,7 +155,7 @@
|
|||
{{ doc.name }}-{{ doc.rev }}
|
||||
</div>
|
||||
<div class="card-body">
|
||||
{{ content|urlize_ietf_docs|linkify }}
|
||||
{{ content }}
|
||||
</div>
|
||||
</div>
|
||||
{% endblock %}
|
||||
|
@ -164,4 +164,4 @@
|
|||
</script>
|
||||
<script src="{% static 'ietf/js/document_timeline.js' %}">
|
||||
</script>
|
||||
{% endblock %}
|
||||
{% endblock %}
|
|
@ -142,4 +142,4 @@
|
|||
{% block js %}
|
||||
<script src="{% static 'ietf/js/d3.js' %}"></script>
|
||||
<script src="{% static 'ietf/js/document_timeline.js' %}"></script>
|
||||
{% endblock %}
|
||||
{% endblock %}
|
|
@ -17,14 +17,7 @@ from django import forms
|
|||
from django.utils.functional import keep_lazy
|
||||
|
||||
from ietf.utils.mime import get_mime_type
|
||||
|
||||
acceptable_tags = ('a', 'abbr', 'acronym', 'address', 'b', 'big',
|
||||
'blockquote', 'body', 'br', 'caption', 'center', 'cite', 'code', 'col',
|
||||
'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'font',
|
||||
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'hr', 'html', 'i', 'ins', 'kbd',
|
||||
'li', 'ol', 'p', 'pre', 'q', 's', 'samp', 'small', 'span', 'strike', 'style',
|
||||
'strong', 'sub', 'sup', 'table', 'title', 'tbody', 'td', 'tfoot', 'th', 'thead',
|
||||
'tr', 'tt', 'u', 'ul', 'var')
|
||||
from ietf.utils.text import bleach_cleaner, tags as acceptable_tags
|
||||
|
||||
acceptable_protocols = ['http', 'https', 'mailto', 'xmpp', ]
|
||||
|
||||
|
@ -46,8 +39,6 @@ def remove_tags(html, tags):
|
|||
# ----------------------------------------------------------------------
|
||||
# Html fragment cleaning
|
||||
|
||||
bleach_cleaner = bleach.sanitizer.Cleaner(tags=acceptable_tags, protocols=acceptable_protocols, strip=True)
|
||||
|
||||
def sanitize_fragment(html):
|
||||
return bleach_cleaner.clean(html)
|
||||
|
||||
|
@ -95,4 +86,4 @@ def clean_text_field(text):
|
|||
else:
|
||||
raise forms.ValidationError("Unexpected text field mime type: %s" % mime_type)
|
||||
return text
|
||||
|
||||
|
||||
|
|
|
@ -5,16 +5,24 @@
|
|||
Use this instead of importing markdown directly to guarantee consistent extensions / options through
|
||||
the datatracker.
|
||||
"""
|
||||
import bleach
|
||||
import markdown as python_markdown
|
||||
|
||||
from django.utils.safestring import mark_safe
|
||||
from markdown.extensions.extra import ExtraExtension
|
||||
|
||||
ALLOWED_TAGS = bleach.ALLOWED_TAGS + ['p', 'h1', 'h2', 'h3', 'h4', 'br']
|
||||
from ietf.doc.templatetags.ietf_filters import urlize_ietf_docs
|
||||
from ietf.utils.text import bleach_cleaner, bleach_linker
|
||||
|
||||
|
||||
def markdown(text):
|
||||
return mark_safe(bleach.clean(
|
||||
python_markdown.markdown(text, extensions=[ExtraExtension()]),
|
||||
tags=ALLOWED_TAGS,
|
||||
))
|
||||
return mark_safe(
|
||||
bleach_linker.linkify(
|
||||
urlize_ietf_docs(
|
||||
bleach_cleaner.clean(
|
||||
python_markdown.markdown(
|
||||
text, extensions=[ExtraExtension(), "nl2br"]
|
||||
)
|
||||
)
|
||||
)
|
||||
)
|
||||
)
|
||||
|
|
|
@ -22,10 +22,13 @@ from .texescape import init as texescape_init, tex_escape_map
|
|||
tlds_sorted = sorted(tlds.tld_set, key=len, reverse=True)
|
||||
protocols = copy.copy(bleach.sanitizer.ALLOWED_PROTOCOLS)
|
||||
protocols.append("ftp") # we still have some ftp links
|
||||
protocols.append("xmpp") # we still have some xmpp links
|
||||
validate_url = URLValidator()
|
||||
|
||||
|
||||
def check_url_validity(attrs, new=False):
|
||||
if (None, 'href') not in attrs:
|
||||
return None
|
||||
url = attrs[(None, 'href')]
|
||||
try:
|
||||
if url.startswith("http"):
|
||||
|
@ -42,9 +45,17 @@ bleach_linker = bleach.Linker(
|
|||
parse_email=True
|
||||
)
|
||||
|
||||
tags = copy.copy(bleach.sanitizer.ALLOWED_TAGS)
|
||||
tags.remove("a")
|
||||
bleach_cleaner = bleach.sanitizer.Cleaner(tags=tags, protocols=protocols)
|
||||
tags = (
|
||||
'a', 'abbr', 'acronym', 'address', 'b', 'big',
|
||||
'blockquote', 'body', 'br', 'caption', 'center', 'cite', 'code', 'col',
|
||||
'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'font',
|
||||
'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'hr', 'html', 'i', 'ins', 'kbd',
|
||||
'li', 'ol', 'p', 'pre', 'q', 's', 'samp', 'small', 'span', 'strike', 'style',
|
||||
'strong', 'sub', 'sup', 'table', 'title', 'tbody', 'td', 'tfoot', 'th', 'thead',
|
||||
'tr', 'tt', 'u', 'ul', 'var'
|
||||
)
|
||||
|
||||
bleach_cleaner = bleach.sanitizer.Cleaner(tags=tags, protocols=protocols, strip=True)
|
||||
|
||||
|
||||
@keep_lazy(str)
|
||||
|
|
Loading…
Reference in a new issue