From fa9427769a2c587374c8188c7af70d14e925ceeb Mon Sep 17 00:00:00 2001 From: Henrik Levkowetz Date: Fri, 21 Feb 2020 21:36:18 +0000 Subject: [PATCH] Added cleaning of the session request form's 'comments' field, to convert any html entered to text. Related to [17322]. - Legacy-Id: 17324 Note: SVN reference [17322] has been migrated to Git commit eb88abc394fb263a8d53c3ddd099acf2b1605fa6 --- ietf/secr/sreq/forms.py | 5 ++++- ietf/secr/sreq/views.py | 4 ++-- ietf/utils/html.py | 21 ++++++++++++++++++++- ietf/utils/mime.py | 21 +++++++++++++++++++++ ietf/utils/validators.py | 20 +++----------------- requirements.txt | 1 + requirements3.txt | 1 + 7 files changed, 52 insertions(+), 21 deletions(-) create mode 100644 ietf/utils/mime.py diff --git a/ietf/secr/sreq/forms.py b/ietf/secr/sreq/forms.py index ffd9fb9ea..dec6200da 100644 --- a/ietf/secr/sreq/forms.py +++ b/ietf/secr/sreq/forms.py @@ -11,7 +11,7 @@ import debug # pyflakes:ignore from ietf.group.models import Group from ietf.meeting.models import ResourceAssociation from ietf.person.fields import SearchablePersonsField - +from ietf.utils.html import clean_text_field # ------------------------------------------------- # Globals @@ -145,6 +145,9 @@ class SessionForm(forms.Form): check_conflict(conflict, self.group) return conflict + def clean_comments(self): + return clean_text_field(self.cleaned_data['comments']) + def clean(self): super(SessionForm, self).clean() data = self.cleaned_data diff --git a/ietf/secr/sreq/views.py b/ietf/secr/sreq/views.py index a9328068a..ead39e9ee 100644 --- a/ietf/secr/sreq/views.py +++ b/ietf/secr/sreq/views.py @@ -289,9 +289,9 @@ def confirm(request, acronym): new_session = Session.objects.create( meeting=meeting, group=group, - attendees=form.data['attendees'], + attendees=form.cleaned_data['attendees'], requested_duration=datetime.timedelta(0,int(duration)), - comments=form.data['comments'], + comments=form.cleaned_data['comments'], type_id='regular', ) SchedulingEvent.objects.create( diff --git a/ietf/utils/html.py b/ietf/utils/html.py index 3bc223939..35f2c14e4 100644 --- a/ietf/utils/html.py +++ b/ietf/utils/html.py @@ -1,4 +1,4 @@ -# Copyright The IETF Trust 2010-2019, All Rights Reserved +# Copyright The IETF Trust 2010-2020, All Rights Reserved # -*- coding: utf-8 -*- # Taken from http://code.google.com/p/soclone/source/browse/trunk/soclone/utils/html.py """Utilities for working with HTML.""" @@ -8,6 +8,7 @@ from __future__ import absolute_import, print_function, unicode_literals import bleach import copy +import html2text import lxml.etree import lxml.html import lxml.html.clean @@ -15,8 +16,11 @@ import six import debug # pyflakes:ignore +from django import forms from django.utils.functional import keep_lazy +from ietf.utils.mime import get_mime_type + acceptable_tags = ('a', 'abbr', 'acronym', 'address', 'b', 'big', 'blockquote', 'body', 'br', 'caption', 'center', 'cite', 'code', 'col', 'colgroup', 'dd', 'del', 'dfn', 'dir', 'div', 'dl', 'dt', 'em', 'font', @@ -76,3 +80,18 @@ lxml_cleaner = Cleaner(allow_tags=acceptable_tags, remove_unknown_tags=None, sty def sanitize_document(html): return lxml_cleaner.clean_html(html) + + +# ---------------------------------------------------------------------- +# Text field cleaning + +def clean_text_field(text): + mime_type, encoding = get_mime_type(text.encode('utf8')) + if mime_type == 'text/html': # or re.search(r'<\w+>', text): + text = html2text.html2text(text) + elif mime_type in ['text/plain', 'application/x-empty', ]: + pass + else: + raise forms.ValidationError("Unexpected text field mime type: %s" % mime_type) + return text + \ No newline at end of file diff --git a/ietf/utils/mime.py b/ietf/utils/mime.py new file mode 100644 index 000000000..4c58d6629 --- /dev/null +++ b/ietf/utils/mime.py @@ -0,0 +1,21 @@ +# Copyright The IETF Trust 2020, All Rights Reserved +# -*- coding: utf-8 -*- + +from __future__ import absolute_import, print_function, unicode_literals + +import magic + +def get_mime_type(content): + # try to fixup encoding + if hasattr(magic, "open"): + m = magic.open(magic.MAGIC_MIME) + m.load() + filetype = m.buffer(content) + else: + m = magic.Magic() + m.cookie = magic.magic_open(magic.MAGIC_NONE | magic.MAGIC_MIME | magic.MAGIC_MIME_ENCODING) + magic.magic_load(m.cookie, None) + filetype = m.from_buffer(content) + + return filetype.split('; ', 1) + diff --git a/ietf/utils/validators.py b/ietf/utils/validators.py index 8ad5a6466..9bf179ae3 100644 --- a/ietf/utils/validators.py +++ b/ietf/utils/validators.py @@ -1,12 +1,10 @@ -# Copyright The IETF Trust 2016-2019, All Rights Reserved +# Copyright The IETF Trust 2016-2020, All Rights Reserved # -*- coding: utf-8 -*- - from __future__ import absolute_import, print_function, unicode_literals import os import re -import magic from pyquery import PyQuery from django.conf import settings @@ -17,6 +15,8 @@ from django.utils.deconstruct import deconstructible import debug # pyflakes:ignore +from ietf.utils.mime import get_mime_type + # Note that this is an instantiation of the regex validator, _not_ the # regex-string validator defined right below validate_no_control_chars = RegexValidator( @@ -55,20 +55,6 @@ class RegexStringValidator(object): validate_regular_expression_string = RegexStringValidator() -def get_mime_type(content): - # try to fixup encoding - if hasattr(magic, "open"): - m = magic.open(magic.MAGIC_MIME) - m.load() - filetype = m.buffer(content) - else: - m = magic.Magic() - m.cookie = magic.magic_open(magic.MAGIC_NONE | magic.MAGIC_MIME | magic.MAGIC_MIME_ENCODING) - magic.magic_load(m.cookie, None) - filetype = m.from_buffer(content) - - return filetype.split('; ', 1) - def validate_file_size(file): if file._size > settings.SECR_MAX_UPLOAD_SIZE: raise ValidationError('Please keep filesize under %s. Requested upload size was %s' % (filesizeformat(settings.SECR_MAX_UPLOAD_SIZE), filesizeformat(file._size))) diff --git a/requirements.txt b/requirements.txt index 75486916e..6f2739653 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,6 +28,7 @@ factory-boy>=2.9.0 google-api-python-client Faker>=0.8.8,!=0.8.9,!=0.8.10 # from factory-boy # Faker 0.8.9,0.8.10 sometimes return string names instead of unicode. hashids>=1.1.0 +html2text>=2019.8.11 html5lib>=1.0.1 httplib2>=0.10.3 # jsonfield 3.x and higher requires Django 2.2 or higher diff --git a/requirements3.txt b/requirements3.txt index bdb7ca411..826cb31e3 100644 --- a/requirements3.txt +++ b/requirements3.txt @@ -29,6 +29,7 @@ factory-boy>=2.9.0 google-api-python-client Faker>=0.8.8,!=0.8.9,!=0.8.10 # from factory-boy # Faker 0.8.9,0.8.10 sometimes return string names instead of unicode. hashids>=1.1.0 +html2text>=2019.8.11 html5lib>=1.0.1 httplib2>=0.10.3 # jsonfield 3.x and higher requires Django 2.2 or higher