Tweaked the document sanitizer to insert a charset meta tag after sanitization.

- Legacy-Id: 14832
This commit is contained in:
Henrik Levkowetz 2018-03-16 11:13:03 +00:00
parent c3e05fd194
commit 9341f96832
2 changed files with 25 additions and 3 deletions

View file

@ -1720,7 +1720,7 @@ class MaterialsTests(TestCase):
self.assertTrue(q('form .has-error'))
# Test html sanitization
test_file = StringIO('<html><h1>Title</h1><section>Some text</section></html>')
test_file = StringIO('<html><head><title>Title</title></head><body><h1>Title</h1><section>Some text</section></body></html>')
test_file.name = "some.html"
r = self.client.post(url,dict(file=test_file))
self.assertEqual(r.status_code, 302)
@ -1729,6 +1729,7 @@ class MaterialsTests(TestCase):
text = doc.text()
self.assertIn('Some text', text)
self.assertNotIn('<section>', text)
self.assertIn('charset="utf-8"', text)
test_file = StringIO(u'This is some text for a test, with the word\nvirtual at the beginning of a line.')
test_file.name = "not_really.txt"

View file

@ -2,6 +2,9 @@
"""Utilities for working with HTML."""
import bleach
import copy
import lxml.etree
import lxml.html
import lxml.html.clean
import debug # pyflakes:ignore
@ -45,8 +48,26 @@ def sanitize_fragment(html):
# ----------------------------------------------------------------------
# Page cleaning
# Document-level cleaner restricted to the tags in acceptable_tags.
# remove_unknown_tags is given a false value alongside allow_tags; style and
# page_structure cleaning are switched off.
lxml_cleaner = lxml.html.clean.Cleaner(
    allow_tags=acceptable_tags,
    remove_unknown_tags=None,
    style=False,
    page_structure=False,
)
class Cleaner(lxml.html.clean.Cleaner):
    """HTML cleaner that also declares the character set of its output.

    Works like ``lxml.html.clean.Cleaner``, except that ``clean_html()``
    inserts a ``<meta charset="...">`` element as the first child of the
    document's ``<head>`` once cleaning is done.
    """

    # Character set announced in the inserted meta element.  Declared at
    # class level so it has a default and can be overridden per instance
    # (presumably the base constructor only accepts keywords that already
    # exist as attributes -- confirm against the installed lxml version).
    charset = 'utf-8'

    # Copied from lxml 4.2.0 and modified to insert charset meta:
    def clean_html(self, html):
        """Return a cleaned copy of *html* with a charset meta tag added.

        *html* may be a string (parsed with ``lxml.html.fromstring``) or an
        already parsed tree, which is deep-copied so the caller's object is
        not modified.  The result is converted back to the type of the
        input by ``lxml.html._transform_result``.

        The meta element is only added when the parsed root has a direct
        ``head`` child; input without one is cleaned but gets no meta tag.
        """
        result_type = type(html)
        # basestring: this module targets Python 2 (str and unicode).
        if isinstance(html, basestring):
            doc = lxml.html.fromstring(html)
        else:
            doc = copy.deepcopy(html)
        # Run the configured cleaning rules in place on our private copy.
        self(doc)
        head = doc.find('head')
        # Identity test: find() returns None when there is no match, and
        # elements should not be tested via equality or truthiness.
        if head is not None:
            meta = lxml.etree.Element('meta', charset=self.charset)
            meta.tail = '\n'
            # Put the declaration first in <head>.
            head.insert(0, meta)
        return lxml.html._transform_result(result_type, doc)
# The sanitized document is saved as utf-8 later on, so that is the
# character set declared in the inserted meta tag.
lxml_cleaner = Cleaner(
    allow_tags=acceptable_tags,
    remove_unknown_tags=None,
    style=False,
    page_structure=False,
    charset='utf-8',
)
def sanitize_document(html):
    """Return *html* after running it through the module-level ``lxml_cleaner``."""
    cleaned = lxml_cleaner.clean_html(html)
    return cleaned