refactor: Drop submission validation via libmagic (#6500)
* refactor: Update parsers/base.py for Python3 * style: Black * refactor: Remove mime type check from FileParser * refactor: Validate that submission is UTF-8 The mime check distinguished us-ascii from UTF-8, but as far as I can tell the code relying on it treated both as equally valid. * feat: Clear error when file is not valid XML * chore: Remove unused import * test: Update tests to match changes * fix: Count bytes starting from 1 * test: Add tests of FileParser validations * fix: Fix / simplify regexp * test: Test error caused by bad XML submission --------- Co-authored-by: Robert Sparks <rjsparks@nostrum.com>
This commit is contained in:
parent
a6a39a04dc
commit
dc14308700
|
@ -42,7 +42,7 @@ from ietf.utils import log
|
|||
from ietf.utils.draft import PlaintextDraft
|
||||
from ietf.utils.text import normalize_text
|
||||
from ietf.utils.timezone import date_today
|
||||
from ietf.utils.xmldraft import XMLDraft, XMLParseError
|
||||
from ietf.utils.xmldraft import InvalidXMLError, XMLDraft, XMLParseError
|
||||
|
||||
|
||||
class SubmissionBaseUploadForm(forms.Form):
|
||||
|
@ -168,6 +168,11 @@ class SubmissionBaseUploadForm(forms.Form):
|
|||
|
||||
try:
|
||||
xml_draft = XMLDraft(tfn)
|
||||
except InvalidXMLError:
|
||||
raise forms.ValidationError(
|
||||
"The uploaded file is not valid XML. Please make sure you are uploading the correct file.",
|
||||
code="invalid_xml_error",
|
||||
)
|
||||
except XMLParseError as e:
|
||||
msgs = format_messages('xml', e, e.parser_msgs())
|
||||
raise forms.ValidationError(msgs, code="xml_parse_error")
|
||||
|
@ -537,8 +542,7 @@ class DeprecatedSubmissionBaseUploadForm(SubmissionBaseUploadForm):
|
|||
bytes = txt_file.read()
|
||||
txt_file.seek(0)
|
||||
try:
|
||||
text = bytes.decode(self.file_info['txt'].charset)
|
||||
#
|
||||
text = bytes.decode(PlainParser.encoding)
|
||||
self.parsed_draft = PlaintextDraft(text, txt_file.name)
|
||||
if self.filename == None:
|
||||
self.filename = self.parsed_draft.filename
|
||||
|
@ -649,7 +653,7 @@ class SubmissionManualUploadForm(SubmissionBaseUploadForm):
|
|||
txt_file.seek(0)
|
||||
bytes = txt_file.read()
|
||||
try:
|
||||
text = bytes.decode(self.file_info["txt"].charset)
|
||||
text = bytes.decode(PlainParser.encoding)
|
||||
parsed_draft = PlaintextDraft(text, txt_file.name)
|
||||
self._extracted_filenames_and_revisions["txt"] = (parsed_draft.filename, parsed_draft.revision)
|
||||
except (UnicodeDecodeError, LookupError) as e:
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
# Copyright The IETF Trust 2011-2020, All Rights Reserved
|
||||
# Copyright The IETF Trust 2011-2023, All Rights Reserved
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
|
||||
|
@ -10,11 +10,10 @@ from typing import List, Optional # pyflakes:ignore
|
|||
from django.conf import settings
|
||||
from django.template.defaultfilters import filesizeformat
|
||||
|
||||
from ietf.utils.mime import get_mime_type
|
||||
from ietf.utils.timezone import date_today
|
||||
|
||||
|
||||
class MetaData(object):
|
||||
class MetaData:
|
||||
rev = None
|
||||
name = None
|
||||
group = None
|
||||
|
@ -25,7 +24,8 @@ class MetaData(object):
|
|||
document_date = None
|
||||
authors = None
|
||||
|
||||
class ParseInfo(object):
|
||||
|
||||
class ParseInfo:
|
||||
"""Collect errors from a parse"""
|
||||
|
||||
def __init__(self):
|
||||
|
@ -46,9 +46,9 @@ class ParseInfo(object):
|
|||
self.warnings[warning_type] = warn_list + [warning_str]
|
||||
|
||||
|
||||
class FileParser(object):
|
||||
ext = None # type: Optional[str]
|
||||
mimetypes = [] # type: List[str]
|
||||
class FileParser:
|
||||
ext: Optional[str] = None
|
||||
encoding = "utf8"
|
||||
|
||||
def __init__(self, fd):
|
||||
self.fd = fd
|
||||
|
@ -58,35 +58,49 @@ class FileParser(object):
|
|||
# no other file parsing is recommended
|
||||
def critical_parse(self):
|
||||
self.parse_invalid_chars_in_filename()
|
||||
self.parse_max_size();
|
||||
self.parse_max_size()
|
||||
self.parse_filename_extension()
|
||||
self.parse_file_type()
|
||||
self.validate_encoding()
|
||||
self.parsed_info.metadata.submission_date = date_today()
|
||||
return self.parsed_info
|
||||
|
||||
def parse_invalid_chars_in_filename(self):
|
||||
name = self.fd.name
|
||||
regexp = re.compile(r'&|\|\/|;|\*|\s|\$')
|
||||
regexp = re.compile(r"&|\\|/|;|\*|\s|\$")
|
||||
chars = regexp.findall(name)
|
||||
if chars:
|
||||
self.parsed_info.add_error('Invalid characters were found in the name of the file which was just submitted: %s' % ', '.join(set(chars)))
|
||||
self.parsed_info.add_error(
|
||||
"Invalid characters were found in the name of the file which was just submitted: %s"
|
||||
% ", ".join(set(chars))
|
||||
)
|
||||
|
||||
def parse_max_size(self):
|
||||
max_size = settings.IDSUBMIT_MAX_DRAFT_SIZE[self.ext]
|
||||
if self.fd.size > max_size:
|
||||
self.parsed_info.add_error('File size is larger than the permitted maximum of %s' % filesizeformat(max_size))
|
||||
self.parsed_info.add_error(
|
||||
"File size is larger than the permitted maximum of %s"
|
||||
% filesizeformat(max_size)
|
||||
)
|
||||
self.parsed_info.metadata.file_size = self.fd.size
|
||||
|
||||
def parse_filename_extension(self):
|
||||
if not self.fd.name.lower().endswith('.'+self.ext):
|
||||
self.parsed_info.add_error('Expected the %s file to have extension ".%s", found the name "%s"' % (self.ext.upper(), self.ext, self.fd.name))
|
||||
|
||||
def parse_file_type(self):
|
||||
self.fd.file.seek(0)
|
||||
content = self.fd.file.read(64*1024)
|
||||
mimetype, charset = get_mime_type(content)
|
||||
if not mimetype in self.mimetypes:
|
||||
self.parsed_info.add_error('Expected an %s file of type "%s", found one of type "%s"' % (self.ext.upper(), '" or "'.join(self.mimetypes), mimetype))
|
||||
self.parsed_info.mimetype = mimetype
|
||||
self.parsed_info.charset = charset
|
||||
if not self.fd.name.lower().endswith("." + self.ext):
|
||||
self.parsed_info.add_error(
|
||||
'Expected the %s file to have extension ".%s", found the name "%s"'
|
||||
% (self.ext.upper(), self.ext, self.fd.name)
|
||||
)
|
||||
|
||||
def validate_encoding(self):
|
||||
self.fd.seek(0)
|
||||
bytes = self.fd.read()
|
||||
try:
|
||||
bytes.decode(self.encoding)
|
||||
except UnicodeDecodeError as err:
|
||||
invalid_bytes = bytes[err.start : err.end]
|
||||
self.parsed_info.add_error(
|
||||
"Invalid {} byte(s) starting at byte {}: [{}]".format(
|
||||
err.encoding,
|
||||
err.start + 1,
|
||||
", ".join(f"0x{b:x}" for b in invalid_bytes)
|
||||
)
|
||||
)
|
||||
|
|
|
@ -10,7 +10,6 @@ from ietf.submit.parsers.base import FileParser
|
|||
|
||||
class PlainParser(FileParser):
|
||||
ext = 'txt'
|
||||
mimetypes = ['text/plain', ]
|
||||
|
||||
def __init__(self, fd):
|
||||
super(PlainParser, self).__init__(fd)
|
||||
|
@ -19,31 +18,18 @@ class PlainParser(FileParser):
|
|||
# no other file parsing is recommended
|
||||
def critical_parse(self):
|
||||
super(PlainParser, self).critical_parse()
|
||||
self.check_file_charset()
|
||||
self.parse_name()
|
||||
return self.parsed_info
|
||||
|
||||
def check_file_charset(self):
|
||||
charset = self.parsed_info.charset
|
||||
if not charset in ['us-ascii', 'utf-8',]:
|
||||
self.parsed_info.add_error('A plain text ASCII document is required. '
|
||||
'Found an unexpected encoding: "%s". '
|
||||
'You probably have one or more non-ascii characters in your file.' % charset
|
||||
)
|
||||
if self.fd.charset and charset != self.fd.charset:
|
||||
self.parsed_info.add_error("Unexpected charset mismatch: upload: %s, libmagic: %s" % (self.fd.charset, charset))
|
||||
|
||||
|
||||
def parse_name(self):
|
||||
self.fd.file.seek(0)
|
||||
draftre = re.compile(r'(draft-\S+)')
|
||||
revisionre = re.compile(r'.*-(\d+)$')
|
||||
limit = 80
|
||||
if self.parsed_info.charset in ['us-ascii', 'utf-8']:
|
||||
while limit:
|
||||
limit -= 1
|
||||
try:
|
||||
line = self.fd.readline().decode(self.parsed_info.charset)
|
||||
line = self.fd.readline().decode(self.encoding)
|
||||
except UnicodeDecodeError:
|
||||
return
|
||||
match = draftre.search(line)
|
||||
|
|
78
ietf/submit/parsers/tests.py
Normal file
78
ietf/submit/parsers/tests.py
Normal file
|
@ -0,0 +1,78 @@
|
|||
# Copyright The IETF Trust 2023, All Rights Reserved
|
||||
|
||||
|
||||
from django.core.files.uploadedfile import SimpleUploadedFile
|
||||
from django.test.utils import override_settings
|
||||
|
||||
from ietf.utils.test_utils import TestCase
|
||||
|
||||
from .base import FileParser
|
||||
|
||||
|
||||
@override_settings(IDSUBMIT_MAX_DRAFT_SIZE={"txt": 100})
|
||||
class FileParserTests(TestCase):
|
||||
class MyParser(FileParser):
|
||||
"""Test parser class (FileParser is not usable on its own)"""
|
||||
|
||||
ext = "txt"
|
||||
|
||||
def test_invalid_encoding(self):
|
||||
fp = self.MyParser(
|
||||
SimpleUploadedFile(
|
||||
name="valid-name.txt",
|
||||
content=b"This is not valid utf-8 -> \xfe",
|
||||
content_type="text/plain",
|
||||
)
|
||||
)
|
||||
parse_info = fp.critical_parse()
|
||||
self.assertEqual(len(parse_info.errors), 1)
|
||||
self.assertIn("Invalid utf-8 byte(s)", parse_info.errors[0])
|
||||
self.assertIn("0xfe", parse_info.errors[0])
|
||||
|
||||
def test_file_too_big(self):
|
||||
fp = self.MyParser(
|
||||
SimpleUploadedFile(
|
||||
name="valid-name.txt",
|
||||
content=b"1" + b"ten chars!" * 10, # exceeds max size of 100
|
||||
content_type="text/plain",
|
||||
)
|
||||
)
|
||||
parse_info = fp.critical_parse()
|
||||
self.assertEqual(len(parse_info.errors), 1)
|
||||
self.assertIn("File size is larger", parse_info.errors[0])
|
||||
|
||||
def test_wrong_extension(self):
|
||||
fp = self.MyParser(
|
||||
SimpleUploadedFile(
|
||||
name="invalid-ext.xml",
|
||||
content=b"This is fine",
|
||||
content_type="text/plain",
|
||||
)
|
||||
)
|
||||
parse_info = fp.critical_parse()
|
||||
self.assertEqual(len(parse_info.errors), 1)
|
||||
self.assertIn("Expected the TXT file to have extension", parse_info.errors[0])
|
||||
|
||||
def test_invalid_char_in_filename(self):
|
||||
# Can't test "/" in the filename because SimpleUploadedFile treats it as a path separator
|
||||
for invalid_char in r"&\;*$ ":
|
||||
uploaded_file = SimpleUploadedFile(
|
||||
name="invalid" + invalid_char + "name.txt",
|
||||
content=b"This is fine",
|
||||
content_type="text/plain",
|
||||
)
|
||||
fp = self.MyParser(uploaded_file)
|
||||
parse_info = fp.critical_parse()
|
||||
self.assertEqual(
|
||||
len(parse_info.errors), 1, f"{invalid_char} should trigger 1 error"
|
||||
)
|
||||
self.assertIn(
|
||||
"Invalid characters were found",
|
||||
parse_info.errors[0],
|
||||
f"{invalid_char} is an invalid char",
|
||||
)
|
||||
self.assertIn(
|
||||
f": {invalid_char}",
|
||||
parse_info.errors[0],
|
||||
f"{invalid_char} is the invalid char",
|
||||
)
|
|
@ -3,7 +3,6 @@ from ietf.submit.parsers.base import FileParser
|
|||
|
||||
class XMLParser(FileParser):
|
||||
ext = 'xml'
|
||||
mimetypes = ['application/xml', 'text/xml', ]
|
||||
|
||||
# If some error is found after this method invocation
|
||||
# no other file parsing is recommended
|
||||
|
|
|
@ -1700,7 +1700,6 @@ class SubmitTests(BaseSubmitTestCase):
|
|||
r, q, m = self.submit_bad_file("some name", ["txt"])
|
||||
self.assertIn('Invalid characters were found in the name', m)
|
||||
self.assertIn('Expected the TXT file to have extension ".txt"', m)
|
||||
self.assertIn('Expected an TXT file of type "text/plain"', m)
|
||||
self.assertIn('document does not contain a legitimate name', m)
|
||||
|
||||
def test_submit_bad_doc_name(self):
|
||||
|
@ -1718,7 +1717,6 @@ class SubmitTests(BaseSubmitTestCase):
|
|||
r, q, m = self.submit_bad_file("some name", ["xml"])
|
||||
self.assertIn('Invalid characters were found in the name', m)
|
||||
self.assertIn('Expected the XML file to have extension ".xml"', m)
|
||||
self.assertIn('Expected an XML file of type "application/xml"', m)
|
||||
|
||||
def test_submit_file_in_archive(self):
|
||||
name = "draft-authorname-testing-file-exists"
|
||||
|
@ -3123,6 +3121,20 @@ class SubmissionUploadFormTests(BaseSubmitTestCase):
|
|||
)
|
||||
self.assertFalse(form.is_valid())
|
||||
|
||||
def test_invalid_xml(self):
|
||||
"""Test error message for invalid XML"""
|
||||
not_xml = SimpleUploadedFile(
|
||||
name="not-xml.xml",
|
||||
content=b"this is not xml at all",
|
||||
content_type="application/xml",
|
||||
)
|
||||
form = SubmissionBaseUploadForm(RequestFactory().post('/some/url'), files={"xml": not_xml})
|
||||
self.assertFalse(form.is_valid())
|
||||
self.assertFormError(
|
||||
form,
|
||||
"xml",
|
||||
"The uploaded file is not valid XML. Please make sure you are uploading the correct file.",
|
||||
)
|
||||
|
||||
class AsyncSubmissionTests(BaseSubmitTestCase):
|
||||
"""Tests of async submission-related tasks"""
|
||||
|
|
|
@ -8,6 +8,7 @@ import xml2rfc
|
|||
import debug # pyflakes: ignore
|
||||
|
||||
from contextlib import ExitStack
|
||||
from lxml.etree import XMLSyntaxError
|
||||
from xml2rfc.util.date import augment_date, extract_date
|
||||
from ietf.utils.timezone import date_today
|
||||
|
||||
|
@ -54,6 +55,8 @@ class XMLDraft(Draft):
|
|||
parser = xml2rfc.XmlRfcParser(filename, quiet=True)
|
||||
try:
|
||||
tree = parser.parse()
|
||||
except XMLSyntaxError:
|
||||
raise InvalidXMLError()
|
||||
except Exception as e:
|
||||
raise XMLParseError(parser_out.getvalue(), parser_err.getvalue()) from e
|
||||
|
||||
|
@ -235,3 +238,8 @@ class XMLParseError(Exception):
|
|||
|
||||
def parser_msgs(self):
|
||||
return self._out.splitlines() + self._err.splitlines()
|
||||
|
||||
|
||||
class InvalidXMLError(Exception):
|
||||
"""File is not valid XML"""
|
||||
pass
|
||||
|
|
Loading…
Reference in a new issue