refactor: Drop submission validation via libmagic (#6500)

* refactor: Update parsers/base.py for Python3

* style: Black

* refactor: Remove mime type check from FileParser

* refactor: Validate that submission is UTF-8

The mime check distinguished us-ascii from UTF-8,
but as far as I can tell the code relying on it
treated both as equally valid.

* feat: Clear error when file is not valid XML

* chore: Remove unused import

* test: Update tests to match changes

* fix: Count bytes starting from 1

* test: Add tests of FileParser validations

* fix: Fix / simplify regexp

* test: Test error caused by bad XML submission

---------

Co-authored-by: Robert Sparks <rjsparks@nostrum.com>
This commit is contained in:
Jennifer Richards 2023-10-23 12:00:04 -03:00 committed by GitHub
parent a6a39a04dc
commit dc14308700
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 175 additions and 74 deletions

View file

@ -42,7 +42,7 @@ from ietf.utils import log
from ietf.utils.draft import PlaintextDraft from ietf.utils.draft import PlaintextDraft
from ietf.utils.text import normalize_text from ietf.utils.text import normalize_text
from ietf.utils.timezone import date_today from ietf.utils.timezone import date_today
from ietf.utils.xmldraft import XMLDraft, XMLParseError from ietf.utils.xmldraft import InvalidXMLError, XMLDraft, XMLParseError
class SubmissionBaseUploadForm(forms.Form): class SubmissionBaseUploadForm(forms.Form):
@ -168,6 +168,11 @@ class SubmissionBaseUploadForm(forms.Form):
try: try:
xml_draft = XMLDraft(tfn) xml_draft = XMLDraft(tfn)
except InvalidXMLError:
raise forms.ValidationError(
"The uploaded file is not valid XML. Please make sure you are uploading the correct file.",
code="invalid_xml_error",
)
except XMLParseError as e: except XMLParseError as e:
msgs = format_messages('xml', e, e.parser_msgs()) msgs = format_messages('xml', e, e.parser_msgs())
raise forms.ValidationError(msgs, code="xml_parse_error") raise forms.ValidationError(msgs, code="xml_parse_error")
@ -537,8 +542,7 @@ class DeprecatedSubmissionBaseUploadForm(SubmissionBaseUploadForm):
bytes = txt_file.read() bytes = txt_file.read()
txt_file.seek(0) txt_file.seek(0)
try: try:
text = bytes.decode(self.file_info['txt'].charset) text = bytes.decode(PlainParser.encoding)
#
self.parsed_draft = PlaintextDraft(text, txt_file.name) self.parsed_draft = PlaintextDraft(text, txt_file.name)
if self.filename == None: if self.filename == None:
self.filename = self.parsed_draft.filename self.filename = self.parsed_draft.filename
@ -649,7 +653,7 @@ class SubmissionManualUploadForm(SubmissionBaseUploadForm):
txt_file.seek(0) txt_file.seek(0)
bytes = txt_file.read() bytes = txt_file.read()
try: try:
text = bytes.decode(self.file_info["txt"].charset) text = bytes.decode(PlainParser.encoding)
parsed_draft = PlaintextDraft(text, txt_file.name) parsed_draft = PlaintextDraft(text, txt_file.name)
self._extracted_filenames_and_revisions["txt"] = (parsed_draft.filename, parsed_draft.revision) self._extracted_filenames_and_revisions["txt"] = (parsed_draft.filename, parsed_draft.revision)
except (UnicodeDecodeError, LookupError) as e: except (UnicodeDecodeError, LookupError) as e:

View file

@ -1,20 +1,19 @@
# Copyright The IETF Trust 2011-2020, All Rights Reserved # Copyright The IETF Trust 2011-2023, All Rights Reserved
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import re import re
import debug # pyflakes:ignore import debug # pyflakes:ignore
from typing import List, Optional # pyflakes:ignore from typing import List, Optional # pyflakes:ignore
from django.conf import settings from django.conf import settings
from django.template.defaultfilters import filesizeformat from django.template.defaultfilters import filesizeformat
from ietf.utils.mime import get_mime_type
from ietf.utils.timezone import date_today from ietf.utils.timezone import date_today
class MetaData(object): class MetaData:
rev = None rev = None
name = None name = None
group = None group = None
@ -25,7 +24,8 @@ class MetaData(object):
document_date = None document_date = None
authors = None authors = None
class ParseInfo(object):
class ParseInfo:
"""Collect errors from a parse""" """Collect errors from a parse"""
def __init__(self): def __init__(self):
@ -46,9 +46,9 @@ class ParseInfo(object):
self.warnings[warning_type] = warn_list + [warning_str] self.warnings[warning_type] = warn_list + [warning_str]
class FileParser(object): class FileParser:
ext = None # type: Optional[str] ext: Optional[str] = None
mimetypes = [] # type: List[str] encoding = "utf8"
def __init__(self, fd): def __init__(self, fd):
self.fd = fd self.fd = fd
@ -58,35 +58,49 @@ class FileParser(object):
# no other file parsing is recommended # no other file parsing is recommended
def critical_parse(self): def critical_parse(self):
self.parse_invalid_chars_in_filename() self.parse_invalid_chars_in_filename()
self.parse_max_size(); self.parse_max_size()
self.parse_filename_extension() self.parse_filename_extension()
self.parse_file_type() self.validate_encoding()
self.parsed_info.metadata.submission_date = date_today() self.parsed_info.metadata.submission_date = date_today()
return self.parsed_info return self.parsed_info
def parse_invalid_chars_in_filename(self): def parse_invalid_chars_in_filename(self):
name = self.fd.name name = self.fd.name
regexp = re.compile(r'&|\|\/|;|\*|\s|\$') regexp = re.compile(r"&|\\|/|;|\*|\s|\$")
chars = regexp.findall(name) chars = regexp.findall(name)
if chars: if chars:
self.parsed_info.add_error('Invalid characters were found in the name of the file which was just submitted: %s' % ', '.join(set(chars))) self.parsed_info.add_error(
"Invalid characters were found in the name of the file which was just submitted: %s"
% ", ".join(set(chars))
)
def parse_max_size(self): def parse_max_size(self):
max_size = settings.IDSUBMIT_MAX_DRAFT_SIZE[self.ext] max_size = settings.IDSUBMIT_MAX_DRAFT_SIZE[self.ext]
if self.fd.size > max_size: if self.fd.size > max_size:
self.parsed_info.add_error('File size is larger than the permitted maximum of %s' % filesizeformat(max_size)) self.parsed_info.add_error(
"File size is larger than the permitted maximum of %s"
% filesizeformat(max_size)
)
self.parsed_info.metadata.file_size = self.fd.size self.parsed_info.metadata.file_size = self.fd.size
def parse_filename_extension(self): def parse_filename_extension(self):
if not self.fd.name.lower().endswith('.'+self.ext): if not self.fd.name.lower().endswith("." + self.ext):
self.parsed_info.add_error('Expected the %s file to have extension ".%s", found the name "%s"' % (self.ext.upper(), self.ext, self.fd.name)) self.parsed_info.add_error(
'Expected the %s file to have extension ".%s", found the name "%s"'
% (self.ext.upper(), self.ext, self.fd.name)
)
def parse_file_type(self): def validate_encoding(self):
self.fd.file.seek(0) self.fd.seek(0)
content = self.fd.file.read(64*1024) bytes = self.fd.read()
mimetype, charset = get_mime_type(content) try:
if not mimetype in self.mimetypes: bytes.decode(self.encoding)
self.parsed_info.add_error('Expected an %s file of type "%s", found one of type "%s"' % (self.ext.upper(), '" or "'.join(self.mimetypes), mimetype)) except UnicodeDecodeError as err:
self.parsed_info.mimetype = mimetype invalid_bytes = bytes[err.start : err.end]
self.parsed_info.charset = charset self.parsed_info.add_error(
"Invalid {} byte(s) starting at byte {}: [{}]".format(
err.encoding,
err.start + 1,
", ".join(f"0x{b:x}" for b in invalid_bytes)
)
)

View file

@ -10,7 +10,6 @@ from ietf.submit.parsers.base import FileParser
class PlainParser(FileParser): class PlainParser(FileParser):
ext = 'txt' ext = 'txt'
mimetypes = ['text/plain', ]
def __init__(self, fd): def __init__(self, fd):
super(PlainParser, self).__init__(fd) super(PlainParser, self).__init__(fd)
@ -19,54 +18,41 @@ class PlainParser(FileParser):
# no other file parsing is recommended # no other file parsing is recommended
def critical_parse(self): def critical_parse(self):
super(PlainParser, self).critical_parse() super(PlainParser, self).critical_parse()
self.check_file_charset()
self.parse_name() self.parse_name()
return self.parsed_info return self.parsed_info
def check_file_charset(self):
charset = self.parsed_info.charset
if not charset in ['us-ascii', 'utf-8',]:
self.parsed_info.add_error('A plain text ASCII document is required. '
'Found an unexpected encoding: "%s". '
'You probably have one or more non-ascii characters in your file.' % charset
)
if self.fd.charset and charset != self.fd.charset:
self.parsed_info.add_error("Unexpected charset mismatch: upload: %s, libmagic: %s" % (self.fd.charset, charset))
def parse_name(self): def parse_name(self):
self.fd.file.seek(0) self.fd.file.seek(0)
draftre = re.compile(r'(draft-\S+)') draftre = re.compile(r'(draft-\S+)')
revisionre = re.compile(r'.*-(\d+)$') revisionre = re.compile(r'.*-(\d+)$')
limit = 80 limit = 80
if self.parsed_info.charset in ['us-ascii', 'utf-8']: while limit:
while limit: limit -= 1
limit -= 1 try:
try: line = self.fd.readline().decode(self.encoding)
line = self.fd.readline().decode(self.parsed_info.charset) except UnicodeDecodeError:
except UnicodeDecodeError:
return
match = draftre.search(line)
if not match:
continue
name = match.group(1)
name = re.sub(r'^[^\w]+', '', name)
name = re.sub(r'[^\w]+$', '', name)
name = re.sub(r'\.txt$', '', name)
extra_chars = re.sub(r'[0-9a-z\-]', '', name)
if extra_chars:
if len(extra_chars) == 1:
self.parsed_info.add_error(('The document name on the first page, "%s", contains a disallowed character with byte code: %s ' % (name, ord(extra_chars[0]))) +
'(see https://www.ietf.org/id-info/guidelines.html#naming for details).')
else:
self.parsed_info.add_error(('The document name on the first page, "%s", contains disallowed characters with byte codes: %s ' % (name, (', '.join([ str(ord(c)) for c in extra_chars] )))) +
'(see https://www.ietf.org/id-info/guidelines.html#naming for details).')
match_revision = revisionre.match(name)
if match_revision:
self.parsed_info.metadata.rev = match_revision.group(1)
else:
self.parsed_info.add_error('The name found on the first page of the document does not contain a revision: "%s"' % (name,))
name = re.sub(r'-\d+$', '', name)
self.parsed_info.metadata.name = name
return return
match = draftre.search(line)
if not match:
continue
name = match.group(1)
name = re.sub(r'^[^\w]+', '', name)
name = re.sub(r'[^\w]+$', '', name)
name = re.sub(r'\.txt$', '', name)
extra_chars = re.sub(r'[0-9a-z\-]', '', name)
if extra_chars:
if len(extra_chars) == 1:
self.parsed_info.add_error(('The document name on the first page, "%s", contains a disallowed character with byte code: %s ' % (name, ord(extra_chars[0]))) +
'(see https://www.ietf.org/id-info/guidelines.html#naming for details).')
else:
self.parsed_info.add_error(('The document name on the first page, "%s", contains disallowed characters with byte codes: %s ' % (name, (', '.join([ str(ord(c)) for c in extra_chars] )))) +
'(see https://www.ietf.org/id-info/guidelines.html#naming for details).')
match_revision = revisionre.match(name)
if match_revision:
self.parsed_info.metadata.rev = match_revision.group(1)
else:
self.parsed_info.add_error('The name found on the first page of the document does not contain a revision: "%s"' % (name,))
name = re.sub(r'-\d+$', '', name)
self.parsed_info.metadata.name = name
return
self.parsed_info.add_error('The first page of the document does not contain a legitimate name that starts with draft-*') self.parsed_info.add_error('The first page of the document does not contain a legitimate name that starts with draft-*')

View file

@ -0,0 +1,78 @@
# Copyright The IETF Trust 2023, All Rights Reserved
from django.core.files.uploadedfile import SimpleUploadedFile
from django.test.utils import override_settings
from ietf.utils.test_utils import TestCase
from .base import FileParser
@override_settings(IDSUBMIT_MAX_DRAFT_SIZE={"txt": 100})
class FileParserTests(TestCase):
class MyParser(FileParser):
"""Test parser class (FileParser is not usable on its own)"""
ext = "txt"
def test_invalid_encoding(self):
fp = self.MyParser(
SimpleUploadedFile(
name="valid-name.txt",
content=b"This is not valid utf-8 -> \xfe",
content_type="text/plain",
)
)
parse_info = fp.critical_parse()
self.assertEqual(len(parse_info.errors), 1)
self.assertIn("Invalid utf-8 byte(s)", parse_info.errors[0])
self.assertIn("0xfe", parse_info.errors[0])
def test_file_too_big(self):
fp = self.MyParser(
SimpleUploadedFile(
name="valid-name.txt",
content=b"1" + b"ten chars!" * 10, # exceeds max size of 100
content_type="text/plain",
)
)
parse_info = fp.critical_parse()
self.assertEqual(len(parse_info.errors), 1)
self.assertIn("File size is larger", parse_info.errors[0])
def test_wrong_extension(self):
fp = self.MyParser(
SimpleUploadedFile(
name="invalid-ext.xml",
content=b"This is fine",
content_type="text/plain",
)
)
parse_info = fp.critical_parse()
self.assertEqual(len(parse_info.errors), 1)
self.assertIn("Expected the TXT file to have extension", parse_info.errors[0])
def test_invalid_char_in_filename(self):
# Can't test "/" in the filename because SimpleUploadedFile treats it as a path separator
for invalid_char in r"&\;*$ ":
uploaded_file = SimpleUploadedFile(
name="invalid" + invalid_char + "name.txt",
content=b"This is fine",
content_type="text/plain",
)
fp = self.MyParser(uploaded_file)
parse_info = fp.critical_parse()
self.assertEqual(
len(parse_info.errors), 1, f"{invalid_char} should trigger 1 error"
)
self.assertIn(
"Invalid characters were found",
parse_info.errors[0],
f"{invalid_char} is an invalid char",
)
self.assertIn(
f": {invalid_char}",
parse_info.errors[0],
f"{invalid_char} is the invalid char",
)

View file

@ -3,7 +3,6 @@ from ietf.submit.parsers.base import FileParser
class XMLParser(FileParser): class XMLParser(FileParser):
ext = 'xml' ext = 'xml'
mimetypes = ['application/xml', 'text/xml', ]
# If some error is found after this method invocation # If some error is found after this method invocation
# no other file parsing is recommended # no other file parsing is recommended

View file

@ -1700,7 +1700,6 @@ class SubmitTests(BaseSubmitTestCase):
r, q, m = self.submit_bad_file("some name", ["txt"]) r, q, m = self.submit_bad_file("some name", ["txt"])
self.assertIn('Invalid characters were found in the name', m) self.assertIn('Invalid characters were found in the name', m)
self.assertIn('Expected the TXT file to have extension ".txt"', m) self.assertIn('Expected the TXT file to have extension ".txt"', m)
self.assertIn('Expected an TXT file of type "text/plain"', m)
self.assertIn('document does not contain a legitimate name', m) self.assertIn('document does not contain a legitimate name', m)
def test_submit_bad_doc_name(self): def test_submit_bad_doc_name(self):
@ -1718,7 +1717,6 @@ class SubmitTests(BaseSubmitTestCase):
r, q, m = self.submit_bad_file("some name", ["xml"]) r, q, m = self.submit_bad_file("some name", ["xml"])
self.assertIn('Invalid characters were found in the name', m) self.assertIn('Invalid characters were found in the name', m)
self.assertIn('Expected the XML file to have extension ".xml"', m) self.assertIn('Expected the XML file to have extension ".xml"', m)
self.assertIn('Expected an XML file of type "application/xml"', m)
def test_submit_file_in_archive(self): def test_submit_file_in_archive(self):
name = "draft-authorname-testing-file-exists" name = "draft-authorname-testing-file-exists"
@ -3123,6 +3121,20 @@ class SubmissionUploadFormTests(BaseSubmitTestCase):
) )
self.assertFalse(form.is_valid()) self.assertFalse(form.is_valid())
def test_invalid_xml(self):
"""Test error message for invalid XML"""
not_xml = SimpleUploadedFile(
name="not-xml.xml",
content=b"this is not xml at all",
content_type="application/xml",
)
form = SubmissionBaseUploadForm(RequestFactory().post('/some/url'), files={"xml": not_xml})
self.assertFalse(form.is_valid())
self.assertFormError(
form,
"xml",
"The uploaded file is not valid XML. Please make sure you are uploading the correct file.",
)
class AsyncSubmissionTests(BaseSubmitTestCase): class AsyncSubmissionTests(BaseSubmitTestCase):
"""Tests of async submission-related tasks""" """Tests of async submission-related tasks"""

View file

@ -8,6 +8,7 @@ import xml2rfc
import debug # pyflakes: ignore import debug # pyflakes: ignore
from contextlib import ExitStack from contextlib import ExitStack
from lxml.etree import XMLSyntaxError
from xml2rfc.util.date import augment_date, extract_date from xml2rfc.util.date import augment_date, extract_date
from ietf.utils.timezone import date_today from ietf.utils.timezone import date_today
@ -54,6 +55,8 @@ class XMLDraft(Draft):
parser = xml2rfc.XmlRfcParser(filename, quiet=True) parser = xml2rfc.XmlRfcParser(filename, quiet=True)
try: try:
tree = parser.parse() tree = parser.parse()
except XMLSyntaxError:
raise InvalidXMLError()
except Exception as e: except Exception as e:
raise XMLParseError(parser_out.getvalue(), parser_err.getvalue()) from e raise XMLParseError(parser_out.getvalue(), parser_err.getvalue()) from e
@ -235,3 +238,8 @@ class XMLParseError(Exception):
def parser_msgs(self): def parser_msgs(self):
return self._out.splitlines() + self._err.splitlines() return self._out.splitlines() + self._err.splitlines()
class InvalidXMLError(Exception):
"""File is not valid XML"""
pass