Added sanitization of uploaded html content for session agendas and minutes, and did some refactoring of the upload form classes.

- Legacy-Id: 14738
This commit is contained in:
Henrik Levkowetz 2018-03-06 15:55:30 +00:00
parent 27914a0a90
commit b92ad2f992
6 changed files with 116 additions and 63 deletions

View file

@ -3,6 +3,8 @@ import codecs
import datetime
from django import forms
from django.conf import settings
from django.core.exceptions import ValidationError
from django.db.models import Q
from django.forms import BaseInlineFormSet
@ -17,6 +19,8 @@ from ietf.meeting.helpers import is_meeting_approved, get_next_agenda_name
from ietf.message.models import Message
from ietf.person.models import Person
from ietf.utils.fields import DatepickerDateField, DurationField
from ietf.utils.validators import ( validate_file_size, validate_mime_type,
validate_file_extension, validate_no_html_frame)
# need to insert empty option for use in ChoiceField
# countries.insert(0, ('', '-'*9 ))
@ -305,3 +309,38 @@ class InterimCancelForm(forms.Form):
super(InterimCancelForm, self).__init__(*args, **kwargs)
self.fields['group'].widget.attrs['disabled'] = True
self.fields['date'].widget.attrs['disabled'] = True
class FileUploadForm(forms.Form):
    """Base form for uploading one meeting-material file.

    The permitted filename extensions and mime types are looked up per
    document type in ``settings.MEETING_VALID_UPLOAD_EXTENSIONS`` and
    ``settings.MEETING_VALID_UPLOAD_MIME_TYPES``.  The ``doc_type`` keyword
    argument is required and is consumed before the remaining arguments are
    handed to ``forms.Form``.
    """
    file = forms.FileField(label='File to upload')

    def __init__(self, *args, **kwargs):
        """Configure the form for ``doc_type`` and build the file field label.

        Raises KeyError if ``doc_type`` is not supplied, and ValueError if it
        is not a key of ``settings.MEETING_VALID_UPLOAD_EXTENSIONS``.
        """
        doc_type = kwargs.pop('doc_type')
        # Explicit check instead of ``assert``: asserts are stripped when
        # Python runs with -O, which would hide a misconfigured subclass.
        if doc_type not in settings.MEETING_VALID_UPLOAD_EXTENSIONS:
            raise ValueError('Unknown upload doc_type: %r' % (doc_type, ))
        self.doc_type = doc_type
        self.extensions = settings.MEETING_VALID_UPLOAD_EXTENSIONS[doc_type]
        self.mime_types = settings.MEETING_VALID_UPLOAD_MIME_TYPES[doc_type]
        super(FileUploadForm, self).__init__(*args, **kwargs)
        label = '%s file to upload. ' % (self.doc_type.capitalize(), )
        # A falsy mime type setting (e.g. None) means no mime type
        # restriction applies, so there is nothing to list in the label.
        if self.mime_types:
            label += 'Note that you can only upload files with these formats: %s.' % (', '.join(self.mime_types, ))
        self.fields['file'].label = label

    def clean_file(self):
        """Validate the uploaded file and return it unchanged.

        Checks, in order: file size, filename extension, and (only when mime
        types are configured for this doc_type) the observed mime type, which
        must equal the Content-Type sent with the upload and be consistent
        with the extension.  HTML uploads are additionally checked for
        frames.

        Raises ValidationError on a Content-Type or extension mismatch.  The
        ``validate_*`` helpers come from ietf.utils.validators and presumably
        raise ValidationError themselves on failure -- confirm there.
        """
        upload = self.cleaned_data['file']
        validate_file_size(upload)
        ext = validate_file_extension(upload, self.extensions)
        mime_type = None
        if self.mime_types:
            # The encoding returned alongside the mime type is not used here.
            mime_type, encoding = validate_mime_type(upload, self.mime_types)
            if mime_type != upload.content_type:
                raise ValidationError('Upload Content-Type (%s) is different from the observed mime-type (%s)' % (upload.content_type, mime_type))
        # mime_type stays None when no mime types are configured; None is not
        # a key of the mapping, so this cross-check is then skipped.
        if mime_type in settings.MEETING_VALID_MIME_TYPE_EXTENSIONS:
            if ext not in settings.MEETING_VALID_MIME_TYPE_EXTENSIONS[mime_type]:
                raise ValidationError('Upload Content-Type (%s) does not match the extension (%s)' % (upload.content_type, ext))
        if mime_type in ['text/html', ] or ext in settings.MEETING_VALID_MIME_TYPE_EXTENSIONS['text/html']:
            # We'll do html sanitization later, but for frames, we fail here,
            # as the sanitized version will most likely be useless.
            validate_no_html_frame(upload)
        return upload

View file

@ -1680,12 +1680,23 @@ class MaterialsTests(TestCase):
q = PyQuery(r.content)
self.assertTrue(q('form .has-error'))
# Test html sanitization
test_file = StringIO('<html><h1>Title</h1><section>Some text</section></html>')
test_file.name = "some.html"
r = self.client.post(url,dict(file=test_file))
self.assertEqual(r.status_code, 302)
doc = session.sessionpresentation_set.filter(document__type_id=doctype).first().document
self.assertEqual(doc.rev,'00')
text = doc.text()
self.assertIn('Some text', text)
self.assertNotIn('<section>', text)
test_file = StringIO(u'This is some text for a test, with the word\nvirtual at the beginning of a line.')
test_file.name = "not_really.txt"
r = self.client.post(url,dict(file=test_file,apply_to_all=False))
self.assertEqual(r.status_code, 302)
doc = session.sessionpresentation_set.filter(document__type_id=doctype).first().document
self.assertEqual(doc.rev,'00')
self.assertEqual(doc.rev,'01')
self.assertFalse(session2.sessionpresentation_set.filter(document__type_id=doctype))
r = self.client.get(url)
@ -1697,7 +1708,7 @@ class MaterialsTests(TestCase):
r = self.client.post(url,dict(file=test_file,apply_to_all=True))
self.assertEqual(r.status_code, 302)
doc = Document.objects.get(pk=doc.pk)
self.assertEqual(doc.rev,'01')
self.assertEqual(doc.rev,'02')
self.assertTrue(session2.sessionpresentation_set.filter(document__type_id=doctype))
def test_upload_minutes_agenda_unscheduled(self):

View file

@ -63,11 +63,10 @@ from ietf.utils.mail import send_mail_message
from ietf.utils.pipe import pipe
from ietf.utils.pdf import pdf_pages
from ietf.utils.text import xslugify
from ietf.utils.validators import ( validate_file_size, validate_mime_type,
validate_file_extension, validate_no_html_frame, get_mime_type)
from ietf.utils.validators import get_mime_type
from .forms import (InterimMeetingModelForm, InterimAnnounceForm, InterimSessionModelForm,
InterimCancelForm, InterimSessionInlineFormSet)
InterimCancelForm, InterimSessionInlineFormSet, FileUploadForm)
def get_menu_entries(request):
@ -1117,14 +1116,13 @@ def add_session_drafts(request, session_id, num):
'form': form,
})
class UploadBlueSheetForm(forms.Form):
file = forms.FileField(label='Bluesheet scan to upload')
def clean_file(self):
file = self.cleaned_data['file']
validate_mime_type(file, settings.MEETING_VALID_BLUESHEET_MIME_TYPES)
validate_file_extension(file, settings.MEETING_VALID_BLUESHEET_EXTENSIONS)
return file
class UploadBlueSheetForm(FileUploadForm):
    """Bluesheet upload form: a FileUploadForm fixed to doc_type 'bluesheets'."""

    def __init__(self, *args, **kwargs):
        # Any doc_type passed in by the caller is overridden.
        options = dict(kwargs, doc_type='bluesheets')
        super(UploadBlueSheetForm, self).__init__(*args, **options)
@role_required('Area Director', 'Secretariat', 'IRTF Chair', 'WG Chair', 'RG Chair')
def upload_session_bluesheets(request, session_id, num):
@ -1193,25 +1191,15 @@ def upload_session_bluesheets(request, session_id, num):
})
# FIXME: This form validation code (based on the secretariat upload code) only looks at filename extensions
# It should look at the contents of the files instead.
class UploadMinutesForm(forms.Form):
file = forms.FileField(label='Minutes file to upload. Note that you can only upload minutes in txt, html, or pdf formats.')
class UploadMinutesForm(FileUploadForm):
apply_to_all = forms.BooleanField(label='Apply to all group sessions at this meeting',initial=True,required=False)
def __init__(self, show_apply_to_all_checkbox, *args, **kwargs):
super(UploadMinutesForm, self).__init__(*args, **kwargs)
kwargs['doc_type'] = 'minutes'
super(UploadMinutesForm, self).__init__(*args, **kwargs )
if not show_apply_to_all_checkbox:
self.fields.pop('apply_to_all')
def clean_file(self):
file = self.cleaned_data['file']
validate_file_size(file)
ext = validate_file_extension(file, settings.MEETING_VALID_MINUTES_EXTENSIONS)
mime_type, encoding = validate_mime_type(file, settings.MEETING_VALID_MINUTES_MIME_TYPES)
if ext in ['.html', '.htm'] or mime_type in ['text/html', ]:
validate_no_html_frame(file)
return file
def upload_session_minutes(request, session_id, num):
# num is redundant, but we're dragging it along as an artifact of where we are in the current URL structure
@ -1301,26 +1289,15 @@ def upload_session_minutes(request, session_id, num):
})
# FIXME: This form validation code (based on the secretariat upload code) only looks at filename extensions
# It should look at the contents of the files instead.
class UploadAgendaForm(forms.Form):
file = forms.FileField(label='Agenda file to upload. Note that you can only upload agendas in txt or html formats.')
class UploadAgendaForm(FileUploadForm):
apply_to_all = forms.BooleanField(label='Apply to all group sessions at this meeting',initial=True,required=False)
def __init__(self, show_apply_to_all_checkbox, *args, **kwargs):
super(UploadAgendaForm, self).__init__(*args, **kwargs)
kwargs['doc_type'] = 'agenda'
super(UploadAgendaForm, self).__init__(*args, **kwargs )
if not show_apply_to_all_checkbox:
self.fields.pop('apply_to_all')
def clean_file(self):
file = self.cleaned_data['file']
validate_file_size(file)
ext = validate_file_extension(file, settings.MEETING_VALID_AGENDA_EXTENSIONS)
mime_type, encoding = validate_mime_type(file, settings.MEETING_VALID_AGENDA_MIME_TYPES)
if ext in ['.html', '.htm'] or mime_type in ['text/html', ]:
validate_no_html_frame(file)
return file
def upload_session_agenda(request, session_id, num):
# num is redundant, but we're dragging it along as an artifact of where we are in the current URL structure
session = get_object_or_404(Session,pk=session_id)
@ -1399,7 +1376,7 @@ def upload_session_agenda(request, session_id, num):
e = NewRevisionDocEvent.objects.create(doc=doc,by=request.user.person,type='new_revision',desc='New revision available: %s'%doc.rev,rev=doc.rev)
doc.save_with_history([e])
# The way this function builds the filename it will never trigger the file delete in handle_file_upload.
handle_upload_file(file, filename, session.meeting, 'agenda')
handle_upload_file(file, filename, session.meeting, 'agenda', request)
return redirect('ietf.meeting.views.session_details',num=num,acronym=session.group.acronym)
else:
form = UploadAgendaForm(show_apply_to_all_checkbox, initial={'apply_to_all':session.type_id=='session'})
@ -1412,23 +1389,16 @@ def upload_session_agenda(request, session_id, num):
})
# FIXME: This form validation code (based on the secretariat upload code) only looks at filename extensions
# It should look at the contents of the files instead.
class UploadSlidesForm(forms.Form):
class UploadSlidesForm(FileUploadForm):
title = forms.CharField(max_length=255)
file = forms.FileField(label='Slides file to upload.')
apply_to_all = forms.BooleanField(label='Apply to all group sessions at this meeting',initial=False,required=False)
def __init__(self, show_apply_to_all_checkbox, *args, **kwargs):
super(UploadSlidesForm, self).__init__(*args, **kwargs)
kwargs['doc_type'] = 'slides'
super(UploadSlidesForm, self).__init__(*args, **kwargs )
if not show_apply_to_all_checkbox:
self.fields.pop('apply_to_all')
def clean_file(self):
file = self.cleaned_data['file']
validate_file_size(file)
validate_file_extension(file, settings.MEETING_VALID_SLIDES_EXTENSIONS)
return file
def upload_session_slides(request, session_id, num, name):
# num is redundant, but we're dragging it along as an artifact of where we are in the current URL structure

View file

@ -1,9 +1,15 @@
import glob
import os
from django.conf import settings
from django.contrib import messages
import debug # pyflakes:ignore
def handle_upload_file(file,filename,meeting,subdir):
from ietf.utils.html import sanitize
def handle_upload_file(file,filename,meeting,subdir, request=None):
'''
This function takes a file object, a filename and a meeting object and subdir as string.
It saves the file to the appropriate directory, get_materials_path() + subdir.
@ -28,8 +34,19 @@ def handle_upload_file(file,filename,meeting,subdir):
os.remove(f)
destination = open(os.path.join(path,filename), 'wb+')
for chunk in file.chunks():
destination.write(chunk)
if extension in settings.MEETING_VALID_MIME_TYPE_EXTENSIONS['text/html']:
file.open()
text = file.read()
# Whole file sanitization; add back '<html>' (sanitize will remove it)
clean = u"<html>\n%s\n</html>\n" % sanitize(text)
destination.write(clean.encode('utf8'))
if request and clean != text:
messages.warning(request, "Uploaded html content is sanitized to prevent unsafe content. "
"Your upload %s was changed by the sanitization; please check the "
"resulting content. " % (filename, ))
else:
for chunk in file.chunks():
destination.write(chunk)
destination.close()
# unzip zipfile

View file

@ -729,16 +729,26 @@ MEETING_MATERIALS_DEFAULT_SUBMISSION_START_DAYS = 90
MEETING_MATERIALS_DEFAULT_SUBMISSION_CUTOFF_DAYS = 26
MEETING_MATERIALS_DEFAULT_SUBMISSION_CORRECTION_DAYS = 50
MEETING_VALID_AGENDA_EXTENSIONS = ['.txt','.html','.htm', '.md', ]
MEETING_VALID_AGENDA_MIME_TYPES = ['text/plain', 'text/html', ]
#
MEETING_VALID_MINUTES_EXTENSIONS = ['.txt','.html','.htm', '.md', '.pdf', ]
MEETING_VALID_MINUTES_MIME_TYPES = ['text/plain', 'text/html', 'application/pdf', ]
#
MEETING_VALID_SLIDES_EXTENSIONS = ('.doc','.docx','.pdf','.ppt','.pptx','.txt') # Note the removal of .zip
#
MEETING_VALID_BLUESHEET_EXTENSIONS = ['.pdf', '.txt', ]
MEETING_VALID_BLUESHEET_MIME_TYPES = ['application/pdf', 'text/plain', ]
# Filename extensions accepted for each kind of meeting material, keyed by
# document type.  FileUploadForm looks up its doc_type here and rejects
# doc_types that are not keys of this dict.
MEETING_VALID_UPLOAD_EXTENSIONS = {
'agenda': ['.txt','.html','.htm', '.md', ],
'minutes': ['.txt','.html','.htm', '.md', '.pdf', ],
'slides': ['.doc','.docx','.pdf','.ppt','.pptx','.txt', ], # Note the removal of .zip
'bluesheets': ['.pdf', '.txt', ],
}
# Mime types accepted for each document type (same keys as above).
# A value of None makes FileUploadForm skip mime type validation for that
# document type.
MEETING_VALID_UPLOAD_MIME_TYPES = {
'agenda': ['text/plain', 'text/html', ],
'minutes': ['text/plain', 'text/html', 'application/pdf', ],
'slides': None,
'bluesheets': ['application/pdf', 'text/plain', ],
}
# Extensions considered consistent with each mime type.  FileUploadForm uses
# this to cross-check the observed mime type against the filename extension;
# the 'text/html' entry also decides which uploads are treated as html
# (frame check in the form, sanitization in handle_upload_file).
MEETING_VALID_MIME_TYPE_EXTENSIONS = {
'text/plain': ['.txt', '.md', ],
'text/html': ['.html', '.htm'],
'application/pdf': ['.pdf'],
}
INTERNET_DRAFT_DAYS_TO_EXPIRE = 185

View file

@ -5,6 +5,8 @@ import html5lib
import bleach
from html5lib import sanitizer, serializer, tokenizer, treebuilders, treewalkers
import debug # pyflakes:ignore
from django.utils.functional import keep_lazy
from django.utils import six
@ -69,6 +71,10 @@ def remove_tags(html, tags):
return bleach.clean(html, tags=allowed)
remove_tags = keep_lazy(remove_tags, six.text_type)
def sanitize(html, tags=acceptable_elements, extra=(), remove=(), strip=True):
    """Return ``html`` cleaned by ``bleach.clean`` with a computed tag whitelist.

    html   -- the markup to clean.
    tags   -- base collection of permitted tag names.
    extra  -- tag names to permit in addition to ``tags``.
    remove -- tag names to disallow even if they occur in ``tags`` or ``extra``.
    strip  -- passed through unchanged as ``bleach.clean``'s ``strip`` argument.
    """
    # The parentheses matter: '^' binds tighter than '|', so the earlier
    # unparenthesised ``tags | extra ^ remove`` evaluated as
    # ``tags | (extra ^ remove)`` and *added* the ``remove`` tags to the
    # whitelist instead of taking them out.
    allowed = (set(tags) | set(extra)) - set(remove)
    return bleach.clean(html, tags=list(allowed), strip=strip)
def clean_html(html):
return bleach.clean(html)