From b92ad2f992fb673446b9b4f97f9c2083bdbfc2b8 Mon Sep 17 00:00:00 2001 From: Henrik Levkowetz Date: Tue, 6 Mar 2018 15:55:30 +0000 Subject: [PATCH] Added sanitization of uploaded html content for session agendas and minutes, and did some refactoring of the upload form classes. - Legacy-Id: 14738 --- ietf/meeting/forms.py | 39 ++++++++++++++++++++ ietf/meeting/tests_views.py | 15 ++++++-- ietf/meeting/views.py | 66 ++++++++++------------------------ ietf/secr/proceedings/utils.py | 23 ++++++++++-- ietf/settings.py | 30 ++++++++++------ ietf/utils/html.py | 6 ++++ 6 files changed, 116 insertions(+), 63 deletions(-) diff --git a/ietf/meeting/forms.py b/ietf/meeting/forms.py index 1427d2956..07482c2a1 100644 --- a/ietf/meeting/forms.py +++ b/ietf/meeting/forms.py @@ -3,6 +3,8 @@ import codecs import datetime from django import forms +from django.conf import settings +from django.core.exceptions import ValidationError from django.db.models import Q from django.forms import BaseInlineFormSet @@ -17,6 +19,8 @@ from ietf.meeting.helpers import is_meeting_approved, get_next_agenda_name from ietf.message.models import Message from ietf.person.models import Person from ietf.utils.fields import DatepickerDateField, DurationField +from ietf.utils.validators import ( validate_file_size, validate_mime_type, + validate_file_extension, validate_no_html_frame) # need to insert empty option for use in ChoiceField # countries.insert(0, ('', '-'*9 )) @@ -305,3 +309,38 @@ class InterimCancelForm(forms.Form): super(InterimCancelForm, self).__init__(*args, **kwargs) self.fields['group'].widget.attrs['disabled'] = True self.fields['date'].widget.attrs['disabled'] = True + +class FileUploadForm(forms.Form): + file = forms.FileField(label='File to upload') + + def __init__(self, *args, **kwargs): + doc_type = kwargs.pop('doc_type') + assert doc_type in settings.MEETING_VALID_UPLOAD_EXTENSIONS + self.doc_type = doc_type + self.extensions = settings.MEETING_VALID_UPLOAD_EXTENSIONS[doc_type] + self.mime_types = settings.MEETING_VALID_UPLOAD_MIME_TYPES[doc_type] + super(FileUploadForm, self).__init__(*args, **kwargs) + label = '%s file to upload. ' % (self.doc_type.capitalize(), ) + if self.mime_types: + label += 'Note that you can only upload files with these formats: %s.' % (', '.join(self.mime_types, )) + self.fields['file'].label=label + + def clean_file(self): + file = self.cleaned_data['file'] + validate_file_size(file) + ext = validate_file_extension(file, self.extensions) + mime_type = None + if self.mime_types: + mime_type, encoding = validate_mime_type(file, self.mime_types) + if mime_type != file.content_type: + raise ValidationError('Upload Content-Type (%s) is different from the observed mime-type (%s)' % (file.content_type, mime_type)) + if mime_type in settings.MEETING_VALID_MIME_TYPE_EXTENSIONS: + if not ext in settings.MEETING_VALID_MIME_TYPE_EXTENSIONS[mime_type]: + raise ValidationError('Upload Content-Type (%s) does not match the extension (%s)' % (file.content_type, ext)) + if mime_type in ['text/html', ] or ext in settings.MEETING_VALID_MIME_TYPE_EXTENSIONS['text/html']: + # We'll do html sanitization later, but for frames, we fail here, + # as the sanitized version will most likely be useless. + validate_no_html_frame(file) + return file + + diff --git a/ietf/meeting/tests_views.py b/ietf/meeting/tests_views.py index ffc051295..9b34f78e1 100644 --- a/ietf/meeting/tests_views.py +++ b/ietf/meeting/tests_views.py @@ -1680,12 +1680,23 @@ class MaterialsTests(TestCase): q = PyQuery(r.content) self.assertTrue(q('form .has-error')) + # Test html sanitization + test_file = StringIO('

Title

Some text
') + test_file.name = "some.html" + r = self.client.post(url,dict(file=test_file)) + self.assertEqual(r.status_code, 302) + doc = session.sessionpresentation_set.filter(document__type_id=doctype).first().document + self.assertEqual(doc.rev,'00') + text = doc.text() + self.assertIn('Some text', text) + self.assertNotIn('
', text) + test_file = StringIO(u'This is some text for a test, with the word\nvirtual at the beginning of a line.') test_file.name = "not_really.txt" r = self.client.post(url,dict(file=test_file,apply_to_all=False)) self.assertEqual(r.status_code, 302) doc = session.sessionpresentation_set.filter(document__type_id=doctype).first().document - self.assertEqual(doc.rev,'00') + self.assertEqual(doc.rev,'01') self.assertFalse(session2.sessionpresentation_set.filter(document__type_id=doctype)) r = self.client.get(url) @@ -1697,7 +1708,7 @@ class MaterialsTests(TestCase): r = self.client.post(url,dict(file=test_file,apply_to_all=True)) self.assertEqual(r.status_code, 302) doc = Document.objects.get(pk=doc.pk) - self.assertEqual(doc.rev,'01') + self.assertEqual(doc.rev,'02') self.assertTrue(session2.sessionpresentation_set.filter(document__type_id=doctype)) def test_upload_minutes_agenda_unscheduled(self): diff --git a/ietf/meeting/views.py b/ietf/meeting/views.py index a946334c5..9558579c2 100644 --- a/ietf/meeting/views.py +++ b/ietf/meeting/views.py @@ -63,11 +63,10 @@ from ietf.utils.mail import send_mail_message from ietf.utils.pipe import pipe from ietf.utils.pdf import pdf_pages from ietf.utils.text import xslugify -from ietf.utils.validators import ( validate_file_size, validate_mime_type, - validate_file_extension, validate_no_html_frame, get_mime_type) +from ietf.utils.validators import get_mime_type from .forms import (InterimMeetingModelForm, InterimAnnounceForm, InterimSessionModelForm, - InterimCancelForm, InterimSessionInlineFormSet) + InterimCancelForm, InterimSessionInlineFormSet, FileUploadForm) def get_menu_entries(request): @@ -1117,14 +1116,13 @@ def add_session_drafts(request, session_id, num): 'form': form, }) -class UploadBlueSheetForm(forms.Form): - file = forms.FileField(label='Bluesheet scan to upload') - def clean_file(self): - file = self.cleaned_data['file'] - validate_mime_type(file, settings.MEETING_VALID_BLUESHEET_MIME_TYPES) - validate_file_extension(file, settings.MEETING_VALID_BLUESHEET_EXTENSIONS) - return file +class UploadBlueSheetForm(FileUploadForm): + + def __init__(self, *args, **kwargs): + kwargs['doc_type'] = 'bluesheets' + super(UploadBlueSheetForm, self).__init__(*args, **kwargs ) + @role_required('Area Director', 'Secretariat', 'IRTF Chair', 'WG Chair', 'RG Chair') def upload_session_bluesheets(request, session_id, num): @@ -1193,25 +1191,15 @@ def upload_session_bluesheets(request, session_id, num): }) -# FIXME: This form validation code (based on the secretariat upload code) only looks at filename extensions -# It should look at the contents of the files instead. -class UploadMinutesForm(forms.Form): - file = forms.FileField(label='Minutes file to upload. Note that you can only upload minutes in txt, html, or pdf formats.') +class UploadMinutesForm(FileUploadForm): apply_to_all = forms.BooleanField(label='Apply to all group sessions at this meeting',initial=True,required=False) def __init__(self, show_apply_to_all_checkbox, *args, **kwargs): - super(UploadMinutesForm, self).__init__(*args, **kwargs) + kwargs['doc_type'] = 'minutes' + super(UploadMinutesForm, self).__init__(*args, **kwargs ) if not show_apply_to_all_checkbox: self.fields.pop('apply_to_all') - def clean_file(self): - file = self.cleaned_data['file'] - validate_file_size(file) - ext = validate_file_extension(file, settings.MEETING_VALID_MINUTES_EXTENSIONS) - mime_type, encoding = validate_mime_type(file, settings.MEETING_VALID_MINUTES_MIME_TYPES) - if ext in ['.html', '.htm'] or mime_type in ['text/html', ]: - validate_no_html_frame(file) - return file def upload_session_minutes(request, session_id, num): # num is redundant, but we're dragging it along an artifact of where we are in the current URL structure @@ -1301,26 +1289,15 @@ def upload_session_minutes(request, session_id, num): }) -# FIXME: This form validation code (based on the secretariat upload code) only looks at filename extensions -# It should look at the contents of the files instead. -class UploadAgendaForm(forms.Form): - file = forms.FileField(label='Agenda file to upload. Note that you can only upload agendas in txt or html formats.') +class UploadAgendaForm(FileUploadForm): apply_to_all = forms.BooleanField(label='Apply to all group sessions at this meeting',initial=True,required=False) def __init__(self, show_apply_to_all_checkbox, *args, **kwargs): - super(UploadAgendaForm, self).__init__(*args, **kwargs) + kwargs['doc_type'] = 'agenda' + super(UploadAgendaForm, self).__init__(*args, **kwargs ) if not show_apply_to_all_checkbox: self.fields.pop('apply_to_all') - def clean_file(self): - file = self.cleaned_data['file'] - validate_file_size(file) - ext = validate_file_extension(file, settings.MEETING_VALID_AGENDA_EXTENSIONS) - mime_type, encoding = validate_mime_type(file, settings.MEETING_VALID_AGENDA_MIME_TYPES) - if ext in ['.html', '.htm'] or mime_type in ['text/html', ]: - validate_no_html_frame(file) - return file - def upload_session_agenda(request, session_id, num): # num is redundant, but we're dragging it along an artifact of where we are in the current URL structure session = get_object_or_404(Session,pk=session_id) @@ -1399,7 +1376,7 @@ def upload_session_agenda(request, session_id, num): e = NewRevisionDocEvent.objects.create(doc=doc,by=request.user.person,type='new_revision',desc='New revision available: %s'%doc.rev,rev=doc.rev) doc.save_with_history([e]) # The way this function builds the filename it will never trigger the file delete in handle_file_upload. - handle_upload_file(file, filename, session.meeting, 'agenda') + handle_upload_file(file, filename, session.meeting, 'agenda', request) return redirect('ietf.meeting.views.session_details',num=num,acronym=session.group.acronym) else: form = UploadAgendaForm(show_apply_to_all_checkbox, initial={'apply_to_all':session.type_id=='session'}) @@ -1412,23 +1389,16 @@ def upload_session_agenda(request, session_id, num): }) -# FIXME: This form validation code (based on the secretariat upload code) only looks at filename extensions -# It should look at the contents of the files instead. -class UploadSlidesForm(forms.Form): +class UploadSlidesForm(FileUploadForm): title = forms.CharField(max_length=255) - file = forms.FileField(label='Slides file to upload.') apply_to_all = forms.BooleanField(label='Apply to all group sessions at this meeting',initial=False,required=False) def __init__(self, show_apply_to_all_checkbox, *args, **kwargs): - super(UploadSlidesForm, self).__init__(*args, **kwargs) + kwargs['doc_type'] = 'slides' + super(UploadSlidesForm, self).__init__(*args, **kwargs ) if not show_apply_to_all_checkbox: self.fields.pop('apply_to_all') - def clean_file(self): - file = self.cleaned_data['file'] - validate_file_size(file) - validate_file_extension(file, settings.MEETING_VALID_SLIDES_EXTENSIONS) - return file def upload_session_slides(request, session_id, num, name): # num is redundant, but we're dragging it along an artifact of where we are in the current URL structure diff --git a/ietf/secr/proceedings/utils.py b/ietf/secr/proceedings/utils.py index 799951c7d..a83fa7008 100644 --- a/ietf/secr/proceedings/utils.py +++ b/ietf/secr/proceedings/utils.py @@ -1,9 +1,15 @@ + import glob import os +from django.conf import settings +from django.contrib import messages + import debug # pyflakes:ignore -def handle_upload_file(file,filename,meeting,subdir): +from ietf.utils.html import sanitize + +def handle_upload_file(file,filename,meeting,subdir, request=None): ''' This function takes a file object, a filename and a meeting object and subdir as string. It saves the file to the appropriate directory, get_materials_path() + subdir. @@ -28,8 +34,19 @@ def handle_upload_file(file,filename,meeting,subdir): os.remove(f) destination = open(os.path.join(path,filename), 'wb+') - for chunk in file.chunks(): - destination.write(chunk) + if extension in settings.MEETING_VALID_MIME_TYPE_EXTENSIONS['text/html']: + file.open() + text = file.read() + # Whole file sanitization; add back '' (sanitize will remove it) + clean = u"\n%s\n\n" % sanitize(text) + destination.write(clean.encode('utf8')) + if request and clean != text: + messages.warning(request, "Uploaded html content is sanitized to prevent unsafe content. " + "Your upload %s was changed by the sanitization; please check the " + "resulting content. " % (filename, )) + else: + for chunk in file.chunks(): + destination.write(chunk) destination.close() # unzip zipfile diff --git a/ietf/settings.py b/ietf/settings.py index 6e8788bc9..8b372a62a 100644 --- a/ietf/settings.py +++ b/ietf/settings.py @@ -729,16 +729,26 @@ MEETING_MATERIALS_DEFAULT_SUBMISSION_START_DAYS = 90 MEETING_MATERIALS_DEFAULT_SUBMISSION_CUTOFF_DAYS = 26 MEETING_MATERIALS_DEFAULT_SUBMISSION_CORRECTION_DAYS = 50 -MEETING_VALID_AGENDA_EXTENSIONS = ['.txt','.html','.htm', '.md', ] -MEETING_VALID_AGENDA_MIME_TYPES = ['text/plain', 'text/html', ] -# -MEETING_VALID_MINUTES_EXTENSIONS = ['.txt','.html','.htm', '.md', '.pdf', ] -MEETING_VALID_MINUTES_MIME_TYPES = ['text/plain', 'text/html', 'application/pdf', ] -# -MEETING_VALID_SLIDES_EXTENSIONS = ('.doc','.docx','.pdf','.ppt','.pptx','.txt') # Note the removal of .zip -# -MEETING_VALID_BLUESHEET_EXTENSIONS = ['.pdf', '.txt', ] -MEETING_VALID_BLUESHEET_MIME_TYPES = ['application/pdf', 'text/plain', ] +MEETING_VALID_UPLOAD_EXTENSIONS = { + 'agenda': ['.txt','.html','.htm', '.md', ], + 'minutes': ['.txt','.html','.htm', '.md', '.pdf', ], + 'slides': ['.doc','.docx','.pdf','.ppt','.pptx','.txt', ], # Note the removal of .zip + 'bluesheets': ['.pdf', '.txt', ], +} + +MEETING_VALID_UPLOAD_MIME_TYPES = { + 'agenda': ['text/plain', 'text/html', ], + 'minutes': ['text/plain', 'text/html', 'application/pdf', ], + 'slides': None, + 'bluesheets': ['application/pdf', 'text/plain', ], +} + +MEETING_VALID_MIME_TYPE_EXTENSIONS = { + 'text/plain': ['.txt', '.md', ], + 'text/html': ['.html', '.htm'], + 'application/pdf': ['.pdf'], +} + INTERNET_DRAFT_DAYS_TO_EXPIRE = 185 diff --git a/ietf/utils/html.py b/ietf/utils/html.py index 1afc77626..398bc3203 100644 --- a/ietf/utils/html.py +++ b/ietf/utils/html.py @@ -5,6 +5,8 @@ import html5lib import bleach from html5lib import sanitizer, serializer, tokenizer, treebuilders, treewalkers +import debug # pyflakes:ignore + from django.utils.functional import keep_lazy from django.utils import six @@ -69,6 +71,10 @@ def remove_tags(html, tags): return bleach.clean(html, tags=allowed) remove_tags = keep_lazy(remove_tags, six.text_type) +def sanitize(html, tags=acceptable_elements, extra=[], remove=[], strip=True): + tags = list(set(tags) | set(extra) ^ set(remove)) + return bleach.clean(html, tags=tags, strip=strip) + def clean_html(html): return bleach.clean(html)