Tweaked the document sanitizer to insert a charset meta tag after sanitization.
- Legacy-Id: 14832
This commit is contained in:
parent
c3e05fd194
commit
9341f96832
|
@ -1720,7 +1720,7 @@ class MaterialsTests(TestCase):
|
|||
self.assertTrue(q('form .has-error'))
|
||||
|
||||
# Test html sanitization
|
||||
test_file = StringIO('<html><h1>Title</h1><section>Some text</section></html>')
|
||||
test_file = StringIO('<html><head><title>Title</title></head><body><h1>Title</h1><section>Some text</section></body></html>')
|
||||
test_file.name = "some.html"
|
||||
r = self.client.post(url,dict(file=test_file))
|
||||
self.assertEqual(r.status_code, 302)
|
||||
|
@ -1729,6 +1729,7 @@ class MaterialsTests(TestCase):
|
|||
text = doc.text()
|
||||
self.assertIn('Some text', text)
|
||||
self.assertNotIn('<section>', text)
|
||||
self.assertIn('charset="utf-8"', text)
|
||||
|
||||
test_file = StringIO(u'This is some text for a test, with the word\nvirtual at the beginning of a line.')
|
||||
test_file.name = "not_really.txt"
|
||||
|
|
|
@ -2,6 +2,9 @@
|
|||
|
||||
"""Utilities for working with HTML."""
|
||||
import bleach
|
||||
import copy
|
||||
import lxml.etree
|
||||
import lxml.html
|
||||
import lxml.html.clean
|
||||
|
||||
import debug # pyflakes:ignore
|
||||
|
@ -45,8 +48,26 @@ def sanitize_fragment(html):
|
|||
# ----------------------------------------------------------------------
|
||||
# Page cleaning
|
||||
|
||||
lxml_cleaner = lxml.html.clean.Cleaner(allow_tags=acceptable_tags,
|
||||
remove_unknown_tags=None, style=False, page_structure=False)
|
||||
|
||||
class Cleaner(lxml.html.clean.Cleaner):
    """HTML cleaner that adds a ``<meta charset="...">`` tag after cleaning.

    Behaves like ``lxml.html.clean.Cleaner`` except that ``clean_html()``
    inserts a charset meta element as the first child of ``<head>`` once
    the document has been cleaned.
    """

    # Value written into the ``charset`` attribute of the inserted meta
    # element; read as ``self.charset`` in clean_html().
    charset = 'utf-8'

    # Copied from lxml 4.2.0 and modified to insert charset meta:
    def clean_html(self, html):
        """Return a cleaned copy of *html* with a charset meta tag added.

        *html* may be a string or an already parsed lxml document.  A
        string is parsed with ``lxml.html.fromstring``; anything else is
        deep-copied, so the caller's object is never modified.  The result
        is converted back according to the type of the input via
        ``lxml.html._transform_result``.

        The meta element is only inserted when the root element has a
        direct ``head`` child; input without one is cleaned but gets no
        charset declaration.
        """
        result_type = type(html)
        # NOTE(review): ``basestring`` only exists on Python 2.
        if isinstance(html, basestring):
            doc = lxml.html.fromstring(html)
        else:
            doc = copy.deepcopy(html)
        # Run the inherited cleaning pass in place on our private copy.
        self(doc)
        head = doc.find('head')
        # Compare against None by identity rather than with ``!=``.
        if head is not None:
            meta = lxml.etree.Element('meta', charset=self.charset)
            # Keep the serialized output readable: newline after the tag.
            meta.tail = '\n'
            head.insert(0, meta)
        return lxml.html._transform_result(result_type, doc)
|
||||
|
||||
# The sanitized document is saved as utf-8 later on, so that is the charset
# the inserted meta tag has to declare.
lxml_cleaner = Cleaner(
    allow_tags=acceptable_tags,
    remove_unknown_tags=None,
    style=False,
    page_structure=False,
    charset='utf-8',
)
|
||||
|
||||
def sanitize_document(html):
    """Sanitize a complete HTML document.

    Passes *html* to the module-level ``lxml_cleaner`` and returns
    whatever its ``clean_html()`` method produces.
    """
    cleaned = lxml_cleaner.clean_html(html)
    return cleaned
|
||||
|
|
Loading…
Reference in a new issue