refactor: Drop submission validation via libmagic (#6500)

* refactor: Update parsers/base.py for Python3

* style: Black

* refactor: Remove mime type check from FileParser

* refactor: Validate that submission is UTF-8

The mime check distinguished us-ascii from UTF-8,
but as far as I can tell the code relying on it
treated both as equally valid.

* feat: Clear error when file is not valid XML

* chore: Remove unused import

* test: Update tests to match changes

* fix: Count bytes starting from 1

* test: Add tests of FileParser validations

* fix: Fix / simplify regexp

* test: Test error caused by bad XML submission

---------

Co-authored-by: Robert Sparks <rjsparks@nostrum.com>
This commit is contained in:
Jennifer Richards 2023-10-23 12:00:04 -03:00 committed by GitHub
parent a6a39a04dc
commit dc14308700
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 175 additions and 74 deletions

View file

@ -42,7 +42,7 @@ from ietf.utils import log
from ietf.utils.draft import PlaintextDraft
from ietf.utils.text import normalize_text
from ietf.utils.timezone import date_today
from ietf.utils.xmldraft import XMLDraft, XMLParseError
from ietf.utils.xmldraft import InvalidXMLError, XMLDraft, XMLParseError
class SubmissionBaseUploadForm(forms.Form):
@ -168,6 +168,11 @@ class SubmissionBaseUploadForm(forms.Form):
try:
xml_draft = XMLDraft(tfn)
except InvalidXMLError:
raise forms.ValidationError(
"The uploaded file is not valid XML. Please make sure you are uploading the correct file.",
code="invalid_xml_error",
)
except XMLParseError as e:
msgs = format_messages('xml', e, e.parser_msgs())
raise forms.ValidationError(msgs, code="xml_parse_error")
@ -537,8 +542,7 @@ class DeprecatedSubmissionBaseUploadForm(SubmissionBaseUploadForm):
bytes = txt_file.read()
txt_file.seek(0)
try:
text = bytes.decode(self.file_info['txt'].charset)
#
text = bytes.decode(PlainParser.encoding)
self.parsed_draft = PlaintextDraft(text, txt_file.name)
if self.filename == None:
self.filename = self.parsed_draft.filename
@ -649,7 +653,7 @@ class SubmissionManualUploadForm(SubmissionBaseUploadForm):
txt_file.seek(0)
bytes = txt_file.read()
try:
text = bytes.decode(self.file_info["txt"].charset)
text = bytes.decode(PlainParser.encoding)
parsed_draft = PlaintextDraft(text, txt_file.name)
self._extracted_filenames_and_revisions["txt"] = (parsed_draft.filename, parsed_draft.revision)
except (UnicodeDecodeError, LookupError) as e:

View file

@ -1,4 +1,4 @@
# Copyright The IETF Trust 2011-2020, All Rights Reserved
# Copyright The IETF Trust 2011-2023, All Rights Reserved
# -*- coding: utf-8 -*-
@ -10,11 +10,10 @@ from typing import List, Optional # pyflakes:ignore
from django.conf import settings
from django.template.defaultfilters import filesizeformat
from ietf.utils.mime import get_mime_type
from ietf.utils.timezone import date_today
class MetaData(object):
class MetaData:
rev = None
name = None
group = None
@ -25,7 +24,8 @@ class MetaData(object):
document_date = None
authors = None
class ParseInfo(object):
class ParseInfo:
"""Collect errors from a parse"""
def __init__(self):
@ -46,9 +46,9 @@ class ParseInfo(object):
self.warnings[warning_type] = warn_list + [warning_str]
class FileParser(object):
ext = None # type: Optional[str]
mimetypes = [] # type: List[str]
class FileParser:
ext: Optional[str] = None
encoding = "utf8"
def __init__(self, fd):
self.fd = fd
@ -58,35 +58,49 @@ class FileParser(object):
# no other file parsing is recommended
def critical_parse(self):
self.parse_invalid_chars_in_filename()
self.parse_max_size();
self.parse_max_size()
self.parse_filename_extension()
self.parse_file_type()
self.validate_encoding()
self.parsed_info.metadata.submission_date = date_today()
return self.parsed_info
def parse_invalid_chars_in_filename(self):
name = self.fd.name
regexp = re.compile(r'&|\|\/|;|\*|\s|\$')
regexp = re.compile(r"&|\\|/|;|\*|\s|\$")
chars = regexp.findall(name)
if chars:
self.parsed_info.add_error('Invalid characters were found in the name of the file which was just submitted: %s' % ', '.join(set(chars)))
self.parsed_info.add_error(
"Invalid characters were found in the name of the file which was just submitted: %s"
% ", ".join(set(chars))
)
def parse_max_size(self):
max_size = settings.IDSUBMIT_MAX_DRAFT_SIZE[self.ext]
if self.fd.size > max_size:
self.parsed_info.add_error('File size is larger than the permitted maximum of %s' % filesizeformat(max_size))
self.parsed_info.add_error(
"File size is larger than the permitted maximum of %s"
% filesizeformat(max_size)
)
self.parsed_info.metadata.file_size = self.fd.size
def parse_filename_extension(self):
if not self.fd.name.lower().endswith('.'+self.ext):
self.parsed_info.add_error('Expected the %s file to have extension ".%s", found the name "%s"' % (self.ext.upper(), self.ext, self.fd.name))
def parse_file_type(self):
self.fd.file.seek(0)
content = self.fd.file.read(64*1024)
mimetype, charset = get_mime_type(content)
if not mimetype in self.mimetypes:
self.parsed_info.add_error('Expected an %s file of type "%s", found one of type "%s"' % (self.ext.upper(), '" or "'.join(self.mimetypes), mimetype))
self.parsed_info.mimetype = mimetype
self.parsed_info.charset = charset
if not self.fd.name.lower().endswith("." + self.ext):
self.parsed_info.add_error(
'Expected the %s file to have extension ".%s", found the name "%s"'
% (self.ext.upper(), self.ext, self.fd.name)
)
def validate_encoding(self):
self.fd.seek(0)
bytes = self.fd.read()
try:
bytes.decode(self.encoding)
except UnicodeDecodeError as err:
invalid_bytes = bytes[err.start : err.end]
self.parsed_info.add_error(
"Invalid {} byte(s) starting at byte {}: [{}]".format(
err.encoding,
err.start + 1,
", ".join(f"0x{b:x}" for b in invalid_bytes)
)
)

View file

@ -10,7 +10,6 @@ from ietf.submit.parsers.base import FileParser
class PlainParser(FileParser):
ext = 'txt'
mimetypes = ['text/plain', ]
def __init__(self, fd):
super(PlainParser, self).__init__(fd)
@ -19,31 +18,18 @@ class PlainParser(FileParser):
# no other file parsing is recommended
def critical_parse(self):
super(PlainParser, self).critical_parse()
self.check_file_charset()
self.parse_name()
return self.parsed_info
def check_file_charset(self):
charset = self.parsed_info.charset
if not charset in ['us-ascii', 'utf-8',]:
self.parsed_info.add_error('A plain text ASCII document is required. '
'Found an unexpected encoding: "%s". '
'You probably have one or more non-ascii characters in your file.' % charset
)
if self.fd.charset and charset != self.fd.charset:
self.parsed_info.add_error("Unexpected charset mismatch: upload: %s, libmagic: %s" % (self.fd.charset, charset))
def parse_name(self):
self.fd.file.seek(0)
draftre = re.compile(r'(draft-\S+)')
revisionre = re.compile(r'.*-(\d+)$')
limit = 80
if self.parsed_info.charset in ['us-ascii', 'utf-8']:
while limit:
limit -= 1
try:
line = self.fd.readline().decode(self.parsed_info.charset)
line = self.fd.readline().decode(self.encoding)
except UnicodeDecodeError:
return
match = draftre.search(line)

View file

@ -0,0 +1,78 @@
# Copyright The IETF Trust 2023, All Rights Reserved
from django.core.files.uploadedfile import SimpleUploadedFile
from django.test.utils import override_settings
from ietf.utils.test_utils import TestCase
from .base import FileParser
@override_settings(IDSUBMIT_MAX_DRAFT_SIZE={"txt": 100})
class FileParserTests(TestCase):
class MyParser(FileParser):
"""Test parser class (FileParser is not usable on its own)"""
ext = "txt"
def test_invalid_encoding(self):
fp = self.MyParser(
SimpleUploadedFile(
name="valid-name.txt",
content=b"This is not valid utf-8 -> \xfe",
content_type="text/plain",
)
)
parse_info = fp.critical_parse()
self.assertEqual(len(parse_info.errors), 1)
self.assertIn("Invalid utf-8 byte(s)", parse_info.errors[0])
self.assertIn("0xfe", parse_info.errors[0])
def test_file_too_big(self):
fp = self.MyParser(
SimpleUploadedFile(
name="valid-name.txt",
content=b"1" + b"ten chars!" * 10, # exceeds max size of 100
content_type="text/plain",
)
)
parse_info = fp.critical_parse()
self.assertEqual(len(parse_info.errors), 1)
self.assertIn("File size is larger", parse_info.errors[0])
def test_wrong_extension(self):
fp = self.MyParser(
SimpleUploadedFile(
name="invalid-ext.xml",
content=b"This is fine",
content_type="text/plain",
)
)
parse_info = fp.critical_parse()
self.assertEqual(len(parse_info.errors), 1)
self.assertIn("Expected the TXT file to have extension", parse_info.errors[0])
def test_invalid_char_in_filename(self):
# Can't test "/" in the filename because SimpleUploadedFile treats it as a path separator
for invalid_char in r"&\;*$ ":
uploaded_file = SimpleUploadedFile(
name="invalid" + invalid_char + "name.txt",
content=b"This is fine",
content_type="text/plain",
)
fp = self.MyParser(uploaded_file)
parse_info = fp.critical_parse()
self.assertEqual(
len(parse_info.errors), 1, f"{invalid_char} should trigger 1 error"
)
self.assertIn(
"Invalid characters were found",
parse_info.errors[0],
f"{invalid_char} is an invalid char",
)
self.assertIn(
f": {invalid_char}",
parse_info.errors[0],
f"{invalid_char} is the invalid char",
)

View file

@ -3,7 +3,6 @@ from ietf.submit.parsers.base import FileParser
class XMLParser(FileParser):
ext = 'xml'
mimetypes = ['application/xml', 'text/xml', ]
# If some error is found after this method invocation
# no other file parsing is recommended

View file

@ -1700,7 +1700,6 @@ class SubmitTests(BaseSubmitTestCase):
r, q, m = self.submit_bad_file("some name", ["txt"])
self.assertIn('Invalid characters were found in the name', m)
self.assertIn('Expected the TXT file to have extension ".txt"', m)
self.assertIn('Expected an TXT file of type "text/plain"', m)
self.assertIn('document does not contain a legitimate name', m)
def test_submit_bad_doc_name(self):
@ -1718,7 +1717,6 @@ class SubmitTests(BaseSubmitTestCase):
r, q, m = self.submit_bad_file("some name", ["xml"])
self.assertIn('Invalid characters were found in the name', m)
self.assertIn('Expected the XML file to have extension ".xml"', m)
self.assertIn('Expected an XML file of type "application/xml"', m)
def test_submit_file_in_archive(self):
name = "draft-authorname-testing-file-exists"
@ -3123,6 +3121,20 @@ class SubmissionUploadFormTests(BaseSubmitTestCase):
)
self.assertFalse(form.is_valid())
def test_invalid_xml(self):
"""Test error message for invalid XML"""
not_xml = SimpleUploadedFile(
name="not-xml.xml",
content=b"this is not xml at all",
content_type="application/xml",
)
form = SubmissionBaseUploadForm(RequestFactory().post('/some/url'), files={"xml": not_xml})
self.assertFalse(form.is_valid())
self.assertFormError(
form,
"xml",
"The uploaded file is not valid XML. Please make sure you are uploading the correct file.",
)
class AsyncSubmissionTests(BaseSubmitTestCase):
"""Tests of async submission-related tasks"""

View file

@ -8,6 +8,7 @@ import xml2rfc
import debug # pyflakes: ignore
from contextlib import ExitStack
from lxml.etree import XMLSyntaxError
from xml2rfc.util.date import augment_date, extract_date
from ietf.utils.timezone import date_today
@ -54,6 +55,8 @@ class XMLDraft(Draft):
parser = xml2rfc.XmlRfcParser(filename, quiet=True)
try:
tree = parser.parse()
except XMLSyntaxError:
raise InvalidXMLError()
except Exception as e:
raise XMLParseError(parser_out.getvalue(), parser_err.getvalue()) from e
@ -235,3 +238,8 @@ class XMLParseError(Exception):
def parser_msgs(self):
return self._out.splitlines() + self._err.splitlines()
class InvalidXMLError(Exception):
"""File is not valid XML"""
pass