From dc143087002a103e3c96e6c7a0dcf15498d80044 Mon Sep 17 00:00:00 2001 From: Jennifer Richards Date: Mon, 23 Oct 2023 12:00:04 -0300 Subject: [PATCH] refactor: Drop submission validation via libmagic (#6500) * refactor: Update parsers/base.py for Python3 * style: Black * refactor: Remove mime type check from FileParser * refactor: Validate that submission is UTF-8 The mime check distinguished us-ascii from UTF-8, but as far as I can tell the code relying on it treated both as equally valid. * feat: Clear error when file is not valid XML * chore: Remove unused import * test: Update tests to match changes * fix: Count bytes starting from 1 * test: Add tests of FileParser validations * fix: Fix / simplify regexp * test: Test error caused by bad XML submission --------- Co-authored-by: Robert Sparks --- ietf/submit/forms.py | 12 +++-- ietf/submit/parsers/base.py | 64 ++++++++++++++--------- ietf/submit/parsers/plain_parser.py | 70 +++++++++++--------------- ietf/submit/parsers/tests.py | 78 +++++++++++++++++++++++++++++ ietf/submit/parsers/xml_parser.py | 1 - ietf/submit/tests.py | 16 +++++- ietf/utils/xmldraft.py | 8 +++ 7 files changed, 175 insertions(+), 74 deletions(-) create mode 100644 ietf/submit/parsers/tests.py diff --git a/ietf/submit/forms.py b/ietf/submit/forms.py index 3950cdc04..0b48dae2a 100644 --- a/ietf/submit/forms.py +++ b/ietf/submit/forms.py @@ -42,7 +42,7 @@ from ietf.utils import log from ietf.utils.draft import PlaintextDraft from ietf.utils.text import normalize_text from ietf.utils.timezone import date_today -from ietf.utils.xmldraft import XMLDraft, XMLParseError +from ietf.utils.xmldraft import InvalidXMLError, XMLDraft, XMLParseError class SubmissionBaseUploadForm(forms.Form): @@ -168,6 +168,11 @@ class SubmissionBaseUploadForm(forms.Form): try: xml_draft = XMLDraft(tfn) + except InvalidXMLError: + raise forms.ValidationError( + "The uploaded file is not valid XML. Please make sure you are uploading the correct file.", + code="invalid_xml_error", + ) except XMLParseError as e: msgs = format_messages('xml', e, e.parser_msgs()) raise forms.ValidationError(msgs, code="xml_parse_error") @@ -537,8 +542,7 @@ class DeprecatedSubmissionBaseUploadForm(SubmissionBaseUploadForm): bytes = txt_file.read() txt_file.seek(0) try: - text = bytes.decode(self.file_info['txt'].charset) - # + text = bytes.decode(PlainParser.encoding) self.parsed_draft = PlaintextDraft(text, txt_file.name) if self.filename == None: self.filename = self.parsed_draft.filename @@ -649,7 +653,7 @@ class SubmissionManualUploadForm(SubmissionBaseUploadForm): txt_file.seek(0) bytes = txt_file.read() try: - text = bytes.decode(self.file_info["txt"].charset) + text = bytes.decode(PlainParser.encoding) parsed_draft = PlaintextDraft(text, txt_file.name) self._extracted_filenames_and_revisions["txt"] = (parsed_draft.filename, parsed_draft.revision) except (UnicodeDecodeError, LookupError) as e: diff --git a/ietf/submit/parsers/base.py b/ietf/submit/parsers/base.py index 679798cc4..d38ab1d7b 100644 --- a/ietf/submit/parsers/base.py +++ b/ietf/submit/parsers/base.py @@ -1,20 +1,19 @@ -# Copyright The IETF Trust 2011-2020, All Rights Reserved +# Copyright The IETF Trust 2011-2023, All Rights Reserved # -*- coding: utf-8 -*- import re -import debug # pyflakes:ignore +import debug # pyflakes:ignore -from typing import List, Optional # pyflakes:ignore +from typing import List, Optional # pyflakes:ignore from django.conf import settings from django.template.defaultfilters import filesizeformat -from ietf.utils.mime import get_mime_type from ietf.utils.timezone import date_today -class MetaData(object): +class MetaData: rev = None name = None group = None @@ -25,7 +24,8 @@ class MetaData(object): document_date = None authors = None -class ParseInfo(object): + +class ParseInfo: """Collect errors from a parse""" def __init__(self): @@ -46,9 +46,9 @@ class ParseInfo(object): self.warnings[warning_type] = warn_list + [warning_str] -class FileParser(object): - ext = None # type: Optional[str] - mimetypes = [] # type: List[str] +class FileParser: + ext: Optional[str] = None + encoding = "utf8" def __init__(self, fd): self.fd = fd @@ -58,35 +58,49 @@ class FileParser(object): # no other file parsing is recommended def critical_parse(self): self.parse_invalid_chars_in_filename() - self.parse_max_size(); + self.parse_max_size() self.parse_filename_extension() - self.parse_file_type() + self.validate_encoding() self.parsed_info.metadata.submission_date = date_today() return self.parsed_info def parse_invalid_chars_in_filename(self): name = self.fd.name - regexp = re.compile(r'&|\|\/|;|\*|\s|\$') + regexp = re.compile(r"&|\\|/|;|\*|\s|\$") chars = regexp.findall(name) if chars: - self.parsed_info.add_error('Invalid characters were found in the name of the file which was just submitted: %s' % ', '.join(set(chars))) + self.parsed_info.add_error( + "Invalid characters were found in the name of the file which was just submitted: %s" + % ", ".join(set(chars)) + ) def parse_max_size(self): max_size = settings.IDSUBMIT_MAX_DRAFT_SIZE[self.ext] if self.fd.size > max_size: - self.parsed_info.add_error('File size is larger than the permitted maximum of %s' % filesizeformat(max_size)) + self.parsed_info.add_error( + "File size is larger than the permitted maximum of %s" + % filesizeformat(max_size) + ) self.parsed_info.metadata.file_size = self.fd.size def parse_filename_extension(self): - if not self.fd.name.lower().endswith('.'+self.ext): - self.parsed_info.add_error('Expected the %s file to have extension ".%s", found the name "%s"' % (self.ext.upper(), self.ext, self.fd.name)) + if not self.fd.name.lower().endswith("." + self.ext): + self.parsed_info.add_error( + 'Expected the %s file to have extension ".%s", found the name "%s"' + % (self.ext.upper(), self.ext, self.fd.name) + ) - def parse_file_type(self): - self.fd.file.seek(0) - content = self.fd.file.read(64*1024) - mimetype, charset = get_mime_type(content) - if not mimetype in self.mimetypes: - self.parsed_info.add_error('Expected an %s file of type "%s", found one of type "%s"' % (self.ext.upper(), '" or "'.join(self.mimetypes), mimetype)) - self.parsed_info.mimetype = mimetype - self.parsed_info.charset = charset - \ No newline at end of file + def validate_encoding(self): + self.fd.seek(0) + bytes = self.fd.read() + try: + bytes.decode(self.encoding) + except UnicodeDecodeError as err: + invalid_bytes = bytes[err.start : err.end] + self.parsed_info.add_error( + "Invalid {} byte(s) starting at byte {}: [{}]".format( + err.encoding, + err.start + 1, + ", ".join(f"0x{b:x}" for b in invalid_bytes) + ) + ) diff --git a/ietf/submit/parsers/plain_parser.py b/ietf/submit/parsers/plain_parser.py index a3338e35a..091179d99 100644 --- a/ietf/submit/parsers/plain_parser.py +++ b/ietf/submit/parsers/plain_parser.py @@ -10,7 +10,6 @@ from ietf.submit.parsers.base import FileParser class PlainParser(FileParser): ext = 'txt' - mimetypes = ['text/plain', ] def __init__(self, fd): super(PlainParser, self).__init__(fd) @@ -19,54 +18,41 @@ class PlainParser(FileParser): # no other file parsing is recommended def critical_parse(self): super(PlainParser, self).critical_parse() - self.check_file_charset() self.parse_name() return self.parsed_info - def check_file_charset(self): - charset = self.parsed_info.charset - if not charset in ['us-ascii', 'utf-8',]: - self.parsed_info.add_error('A plain text ASCII document is required. ' - 'Found an unexpected encoding: "%s". ' - 'You probably have one or more non-ascii characters in your file.' % charset - ) - if self.fd.charset and charset != self.fd.charset: - self.parsed_info.add_error("Unexpected charset mismatch: upload: %s, libmagic: %s" % (self.fd.charset, charset)) - - def parse_name(self): self.fd.file.seek(0) draftre = re.compile(r'(draft-\S+)') revisionre = re.compile(r'.*-(\d+)$') limit = 80 - if self.parsed_info.charset in ['us-ascii', 'utf-8']: - while limit: - limit -= 1 - try: - line = self.fd.readline().decode(self.parsed_info.charset) - except UnicodeDecodeError: - return - match = draftre.search(line) - if not match: - continue - name = match.group(1) - name = re.sub(r'^[^\w]+', '', name) - name = re.sub(r'[^\w]+$', '', name) - name = re.sub(r'\.txt$', '', name) - extra_chars = re.sub(r'[0-9a-z\-]', '', name) - if extra_chars: - if len(extra_chars) == 1: - self.parsed_info.add_error(('The document name on the first page, "%s", contains a disallowed character with byte code: %s ' % (name, ord(extra_chars[0]))) + - '(see https://www.ietf.org/id-info/guidelines.html#naming for details).') - else: - self.parsed_info.add_error(('The document name on the first page, "%s", contains disallowed characters with byte codes: %s ' % (name, (', '.join([ str(ord(c)) for c in extra_chars] )))) + - '(see https://www.ietf.org/id-info/guidelines.html#naming for details).') - match_revision = revisionre.match(name) - if match_revision: - self.parsed_info.metadata.rev = match_revision.group(1) - else: - self.parsed_info.add_error('The name found on the first page of the document does not contain a revision: "%s"' % (name,)) - name = re.sub(r'-\d+$', '', name) - self.parsed_info.metadata.name = name + while limit: + limit -= 1 + try: + line = self.fd.readline().decode(self.encoding) + except UnicodeDecodeError: return + match = draftre.search(line) + if not match: + continue + name = match.group(1) + name = re.sub(r'^[^\w]+', '', name) + name = re.sub(r'[^\w]+$', '', name) + name = re.sub(r'\.txt$', '', name) + extra_chars = re.sub(r'[0-9a-z\-]', '', name) + if extra_chars: + if len(extra_chars) == 1: + self.parsed_info.add_error(('The document name on the first page, "%s", contains a disallowed character with byte code: %s ' % (name, ord(extra_chars[0]))) + + '(see https://www.ietf.org/id-info/guidelines.html#naming for details).') + else: + self.parsed_info.add_error(('The document name on the first page, "%s", contains disallowed characters with byte codes: %s ' % (name, (', '.join([ str(ord(c)) for c in extra_chars] )))) + + '(see https://www.ietf.org/id-info/guidelines.html#naming for details).') + match_revision = revisionre.match(name) + if match_revision: + self.parsed_info.metadata.rev = match_revision.group(1) + else: + self.parsed_info.add_error('The name found on the first page of the document does not contain a revision: "%s"' % (name,)) + name = re.sub(r'-\d+$', '', name) + self.parsed_info.metadata.name = name + return self.parsed_info.add_error('The first page of the document does not contain a legitimate name that starts with draft-*') diff --git a/ietf/submit/parsers/tests.py b/ietf/submit/parsers/tests.py new file mode 100644 index 000000000..9ab529a2f --- /dev/null +++ b/ietf/submit/parsers/tests.py @@ -0,0 +1,78 @@ +# Copyright The IETF Trust 2023, All Rights Reserved + + +from django.core.files.uploadedfile import SimpleUploadedFile +from django.test.utils import override_settings + +from ietf.utils.test_utils import TestCase + +from .base import FileParser + + +@override_settings(IDSUBMIT_MAX_DRAFT_SIZE={"txt": 100}) +class FileParserTests(TestCase): + class MyParser(FileParser): + """Test parser class (FileParser is not usable on its own)""" + + ext = "txt" + + def test_invalid_encoding(self): + fp = self.MyParser( + SimpleUploadedFile( + name="valid-name.txt", + content=b"This is not valid utf-8 -> \xfe", + content_type="text/plain", + ) + ) + parse_info = fp.critical_parse() + self.assertEqual(len(parse_info.errors), 1) + self.assertIn("Invalid utf-8 byte(s)", parse_info.errors[0]) + self.assertIn("0xfe", parse_info.errors[0]) + + def test_file_too_big(self): + fp = self.MyParser( + SimpleUploadedFile( + name="valid-name.txt", + content=b"1" + b"ten chars!" * 10, # exceeds max size of 100 + content_type="text/plain", + ) + ) + parse_info = fp.critical_parse() + self.assertEqual(len(parse_info.errors), 1) + self.assertIn("File size is larger", parse_info.errors[0]) + + def test_wrong_extension(self): + fp = self.MyParser( + SimpleUploadedFile( + name="invalid-ext.xml", + content=b"This is fine", + content_type="text/plain", + ) + ) + parse_info = fp.critical_parse() + self.assertEqual(len(parse_info.errors), 1) + self.assertIn("Expected the TXT file to have extension", parse_info.errors[0]) + + def test_invalid_char_in_filename(self): + # Can't test "/" in the filename because SimpleUploadedFile treats it as a path separator + for invalid_char in r"&\;*$ ": + uploaded_file = SimpleUploadedFile( + name="invalid" + invalid_char + "name.txt", + content=b"This is fine", + content_type="text/plain", + ) + fp = self.MyParser(uploaded_file) + parse_info = fp.critical_parse() + self.assertEqual( + len(parse_info.errors), 1, f"{invalid_char} should trigger 1 error" + ) + self.assertIn( + "Invalid characters were found", + parse_info.errors[0], + f"{invalid_char} is an invalid char", + ) + self.assertIn( + f": {invalid_char}", + parse_info.errors[0], + f"{invalid_char} is the invalid char", + ) diff --git a/ietf/submit/parsers/xml_parser.py b/ietf/submit/parsers/xml_parser.py index 674646e10..806857992 100644 --- a/ietf/submit/parsers/xml_parser.py +++ b/ietf/submit/parsers/xml_parser.py @@ -3,7 +3,6 @@ from ietf.submit.parsers.base import FileParser class XMLParser(FileParser): ext = 'xml' - mimetypes = ['application/xml', 'text/xml', ] # If some error is found after this method invocation # no other file parsing is recommended diff --git a/ietf/submit/tests.py b/ietf/submit/tests.py index 0d5626edc..572d7bda9 100644 --- a/ietf/submit/tests.py +++ b/ietf/submit/tests.py @@ -1700,7 +1700,6 @@ class SubmitTests(BaseSubmitTestCase): r, q, m = self.submit_bad_file("some name", ["txt"]) self.assertIn('Invalid characters were found in the name', m) self.assertIn('Expected the TXT file to have extension ".txt"', m) - self.assertIn('Expected an TXT file of type "text/plain"', m) self.assertIn('document does not contain a legitimate name', m) def test_submit_bad_doc_name(self): @@ -1718,7 +1717,6 @@ class SubmitTests(BaseSubmitTestCase): r, q, m = self.submit_bad_file("some name", ["xml"]) self.assertIn('Invalid characters were found in the name', m) self.assertIn('Expected the XML file to have extension ".xml"', m) - self.assertIn('Expected an XML file of type "application/xml"', m) def test_submit_file_in_archive(self): name = "draft-authorname-testing-file-exists" @@ -3123,6 +3121,20 @@ class SubmissionUploadFormTests(BaseSubmitTestCase): ) self.assertFalse(form.is_valid()) + def test_invalid_xml(self): + """Test error message for invalid XML""" + not_xml = SimpleUploadedFile( + name="not-xml.xml", + content=b"this is not xml at all", + content_type="application/xml", + ) + form = SubmissionBaseUploadForm(RequestFactory().post('/some/url'), files={"xml": not_xml}) + self.assertFalse(form.is_valid()) + self.assertFormError( + form, + "xml", + "The uploaded file is not valid XML. Please make sure you are uploading the correct file.", + ) class AsyncSubmissionTests(BaseSubmitTestCase): """Tests of async submission-related tasks""" diff --git a/ietf/utils/xmldraft.py b/ietf/utils/xmldraft.py index 52b899488..5a0abb613 100644 --- a/ietf/utils/xmldraft.py +++ b/ietf/utils/xmldraft.py @@ -8,6 +8,7 @@ import xml2rfc import debug # pyflakes: ignore from contextlib import ExitStack +from lxml.etree import XMLSyntaxError from xml2rfc.util.date import augment_date, extract_date from ietf.utils.timezone import date_today @@ -54,6 +55,8 @@ class XMLDraft(Draft): parser = xml2rfc.XmlRfcParser(filename, quiet=True) try: tree = parser.parse() + except XMLSyntaxError: + raise InvalidXMLError() except Exception as e: raise XMLParseError(parser_out.getvalue(), parser_err.getvalue()) from e @@ -235,3 +238,8 @@ class XMLParseError(Exception): def parser_msgs(self): return self._out.splitlines() + self._err.splitlines() + + +class InvalidXMLError(Exception): + """File is not valid XML""" + pass