refactor: Drop submission validation via libmagic (#6500)

* refactor: Update parsers/base.py for Python3 * style: Black * refactor: Remove mime type check from FileParser * refactor: Validate that submission is UTF-8 The mime check distinguished us-ascii from UTF-8, but as far as I can tell the code relying on it treated both as equally valid. * feat: Clear error when file is not valid XML * chore: Remove unused import * test: Update tests to match changes * fix: Count bytes starting from 1 * test: Add tests of FileParser validations * fix: Fix / simplify regexp * test: Test error caused by bad XML submission --------- Co-authored-by: Robert Sparks <rjsparks@nostrum.com>
2023-10-23 12:00:04 -03:00 · 2023-10-23 12:00:04 -03:00 · dc14308700
parent a6a39a04dc
commit dc14308700
7 changed files with 175 additions and 74 deletions
--- a/ietf/submit/forms.py
+++ b/ietf/submit/forms.py
@ -42,7 +42,7 @@ from ietf.utils import log
 from ietf.utils.draft import PlaintextDraft
 from ietf.utils.text import normalize_text
 from ietf.utils.timezone import date_today
-from ietf.utils.xmldraft import XMLDraft, XMLParseError
+from ietf.utils.xmldraft import InvalidXMLError, XMLDraft, XMLParseError
 class SubmissionBaseUploadForm(forms.Form):
@ -168,6 +168,11 @@ class SubmissionBaseUploadForm(forms.Form):
                try:
                    xml_draft = XMLDraft(tfn)
                except InvalidXMLError:
                    raise forms.ValidationError(
                        "The uploaded file is not valid XML. Please make sure you are uploading the correct file.",
                        code="invalid_xml_error",
                    )
                except XMLParseError as e:
                    msgs = format_messages('xml', e, e.parser_msgs())
                    raise forms.ValidationError(msgs, code="xml_parse_error")
@ -537,8 +542,7 @@ class DeprecatedSubmissionBaseUploadForm(SubmissionBaseUploadForm):
            bytes = txt_file.read()
            txt_file.seek(0)
            try:
-                text = bytes.decode(self.file_info['txt'].charset)
+                text = bytes.decode(PlainParser.encoding)
            #
                self.parsed_draft = PlaintextDraft(text, txt_file.name)
                if self.filename == None:
                    self.filename = self.parsed_draft.filename
@ -649,7 +653,7 @@ class SubmissionManualUploadForm(SubmissionBaseUploadForm):
            txt_file.seek(0)
            bytes = txt_file.read()
            try:
-                text = bytes.decode(self.file_info["txt"].charset)
+                text = bytes.decode(PlainParser.encoding)
                parsed_draft = PlaintextDraft(text, txt_file.name)
                self._extracted_filenames_and_revisions["txt"] = (parsed_draft.filename, parsed_draft.revision)
            except (UnicodeDecodeError, LookupError) as e:
--- a/ietf/submit/parsers/base.py
+++ b/ietf/submit/parsers/base.py
@ -1,20 +1,19 @@
-# Copyright The IETF Trust 2011-2020, All Rights Reserved
+# Copyright The IETF Trust 2011-2023, All Rights Reserved
 # -*- coding: utf-8 -*-
 import re
-import debug                            # pyflakes:ignore
+import debug  # pyflakes:ignore
-from typing import List, Optional   # pyflakes:ignore
+from typing import List, Optional  # pyflakes:ignore
 from django.conf import settings
 from django.template.defaultfilters import filesizeformat
 from ietf.utils.mime import get_mime_type
 from ietf.utils.timezone import date_today
-class MetaData(object):
+class MetaData:
    rev = None
    name = None
    group = None
@ -25,7 +24,8 @@ class MetaData(object):
    document_date = None
    authors = None
-class ParseInfo(object):
+
 class ParseInfo:
    """Collect errors from a parse"""
    def __init__(self):
@ -46,9 +46,9 @@ class ParseInfo(object):
        self.warnings[warning_type] = warn_list + [warning_str]
-class FileParser(object):
+class FileParser:
-    ext = None                          # type: Optional[str]
+    ext: Optional[str] = None
-    mimetypes = []                      # type: List[str]
+    encoding = "utf8"
    def __init__(self, fd):
        self.fd = fd
@ -58,35 +58,49 @@ class FileParser(object):
    # no other file parsing is recommended
    def critical_parse(self):
        self.parse_invalid_chars_in_filename()
-        self.parse_max_size();
+        self.parse_max_size()
        self.parse_filename_extension()
-        self.parse_file_type()
+        self.validate_encoding()
        self.parsed_info.metadata.submission_date = date_today()
        return self.parsed_info
    def parse_invalid_chars_in_filename(self):
        name = self.fd.name
-        regexp = re.compile(r'&|\|\/|;|\*|\s|\$')
+        regexp = re.compile(r"&|\\|/|;|\*|\s|\$")
        chars = regexp.findall(name)
        if chars:
-            self.parsed_info.add_error('Invalid characters were found in the name of the file which was just submitted: %s' % ', '.join(set(chars)))
+            self.parsed_info.add_error(
                "Invalid characters were found in the name of the file which was just submitted: %s"
                % ", ".join(set(chars))
            )
    def parse_max_size(self):
        max_size = settings.IDSUBMIT_MAX_DRAFT_SIZE[self.ext]
        if self.fd.size > max_size:
-            self.parsed_info.add_error('File size is larger than the permitted maximum of %s' % filesizeformat(max_size))
+            self.parsed_info.add_error(
                "File size is larger than the permitted maximum of %s"
                % filesizeformat(max_size)
            )
        self.parsed_info.metadata.file_size = self.fd.size
    def parse_filename_extension(self):
-        if not self.fd.name.lower().endswith('.'+self.ext):
+        if not self.fd.name.lower().endswith("." + self.ext):
-            self.parsed_info.add_error('Expected the %s file to have extension ".%s", found the name "%s"' % (self.ext.upper(), self.ext, self.fd.name))
+            self.parsed_info.add_error(
                'Expected the %s file to have extension ".%s", found the name "%s"'
                % (self.ext.upper(), self.ext, self.fd.name)
            )
-    def parse_file_type(self):
+    def validate_encoding(self):
-        self.fd.file.seek(0)
+        self.fd.seek(0)
-        content = self.fd.file.read(64*1024)
+        bytes = self.fd.read()
-        mimetype, charset = get_mime_type(content)
+        try:
-        if not mimetype in self.mimetypes:
+            bytes.decode(self.encoding)
-            self.parsed_info.add_error('Expected an %s file of type "%s", found one of type "%s"' % (self.ext.upper(), '" or "'.join(self.mimetypes), mimetype))
+        except UnicodeDecodeError as err:
-        self.parsed_info.mimetype = mimetype
+            invalid_bytes = bytes[err.start : err.end]
-        self.parsed_info.charset = charset
+            self.parsed_info.add_error(
-        
+                "Invalid {} byte(s) starting at byte {}: [{}]".format(
                    err.encoding,
                    err.start + 1,
                    ", ".join(f"0x{b:x}" for b in invalid_bytes)
                )
            )
--- a/ietf/submit/parsers/plain_parser.py
+++ b/ietf/submit/parsers/plain_parser.py
@ -10,7 +10,6 @@ from ietf.submit.parsers.base import FileParser
 class PlainParser(FileParser):
    ext = 'txt'
    mimetypes = ['text/plain', ]
    def __init__(self, fd):
        super(PlainParser, self).__init__(fd)
@ -19,54 +18,41 @@ class PlainParser(FileParser):
    # no other file parsing is recommended
    def critical_parse(self):
        super(PlainParser, self).critical_parse()
        self.check_file_charset()
        self.parse_name()
        return self.parsed_info
    def check_file_charset(self):
        charset = self.parsed_info.charset
        if not charset in ['us-ascii', 'utf-8',]:
            self.parsed_info.add_error('A plain text ASCII document is required.  '
                'Found an unexpected encoding: "%s".  '
                'You probably have one or more non-ascii characters in your file.'  % charset
            )
        if self.fd.charset and charset != self.fd.charset:
            self.parsed_info.add_error("Unexpected charset mismatch: upload: %s, libmagic: %s" % (self.fd.charset, charset))
    def parse_name(self):
        self.fd.file.seek(0)
        draftre = re.compile(r'(draft-\S+)')
        revisionre = re.compile(r'.*-(\d+)$')
        limit = 80
-        if self.parsed_info.charset in ['us-ascii', 'utf-8']:
+        while limit:
-            while limit:
+            limit -= 1
-                limit -= 1
+            try:
-                try:
+                line = self.fd.readline().decode(self.encoding)
-                    line = self.fd.readline().decode(self.parsed_info.charset)
+            except UnicodeDecodeError:
                except UnicodeDecodeError:
                    return
                match = draftre.search(line)
                if not match:
                    continue
                name = match.group(1)
                name = re.sub(r'^[^\w]+', '', name)
                name = re.sub(r'[^\w]+$', '', name)
                name = re.sub(r'\.txt$', '', name)
                extra_chars = re.sub(r'[0-9a-z\-]', '', name)
                if extra_chars:
                    if len(extra_chars) == 1:
                        self.parsed_info.add_error(('The document name on the first page, "%s", contains a disallowed character with byte code: %s ' % (name, ord(extra_chars[0]))) +
                                                    '(see https://www.ietf.org/id-info/guidelines.html#naming for details).')
                    else:
                        self.parsed_info.add_error(('The document name on the first page, "%s", contains disallowed characters with byte codes: %s ' % (name, (', '.join([ str(ord(c)) for c in extra_chars] )))) +
                                                    '(see https://www.ietf.org/id-info/guidelines.html#naming for details).')
                match_revision = revisionre.match(name)
                if match_revision:
                    self.parsed_info.metadata.rev = match_revision.group(1)
                else:
                    self.parsed_info.add_error('The name found on the first page of the document does not contain a revision: "%s"' % (name,))
                name = re.sub(r'-\d+$', '', name)
                self.parsed_info.metadata.name = name
                return
            match = draftre.search(line)
            if not match:
                continue
            name = match.group(1)
            name = re.sub(r'^[^\w]+', '', name)
            name = re.sub(r'[^\w]+$', '', name)
            name = re.sub(r'\.txt$', '', name)
            extra_chars = re.sub(r'[0-9a-z\-]', '', name)
            if extra_chars:
                if len(extra_chars) == 1:
                    self.parsed_info.add_error(('The document name on the first page, "%s", contains a disallowed character with byte code: %s ' % (name, ord(extra_chars[0]))) +
                                                '(see https://www.ietf.org/id-info/guidelines.html#naming for details).')
                else:
                    self.parsed_info.add_error(('The document name on the first page, "%s", contains disallowed characters with byte codes: %s ' % (name, (', '.join([ str(ord(c)) for c in extra_chars] )))) +
                                                '(see https://www.ietf.org/id-info/guidelines.html#naming for details).')
            match_revision = revisionre.match(name)
            if match_revision:
                self.parsed_info.metadata.rev = match_revision.group(1)
            else:
                self.parsed_info.add_error('The name found on the first page of the document does not contain a revision: "%s"' % (name,))
            name = re.sub(r'-\d+$', '', name)
            self.parsed_info.metadata.name = name
            return
        self.parsed_info.add_error('The first page of the document does not contain a legitimate name that starts with draft-*')
--- a/ietf/submit/parsers/tests.py
+++ b/ietf/submit/parsers/tests.py
@ -0,0 +1,78 @@
 # Copyright The IETF Trust 2023, All Rights Reserved
 from django.core.files.uploadedfile import SimpleUploadedFile
 from django.test.utils import override_settings
 from ietf.utils.test_utils import TestCase
 from .base import FileParser
@override_settings(IDSUBMIT_MAX_DRAFT_SIZE={"txt": 100})
 class FileParserTests(TestCase):
    class MyParser(FileParser):
        """Test parser class (FileParser is not usable on its own)"""
        ext = "txt"
    def test_invalid_encoding(self):
        fp = self.MyParser(
            SimpleUploadedFile(
                name="valid-name.txt",
                content=b"This is not valid utf-8 -> \xfe",
                content_type="text/plain",
            )
        )
        parse_info = fp.critical_parse()
        self.assertEqual(len(parse_info.errors), 1)
        self.assertIn("Invalid utf-8 byte(s)", parse_info.errors[0])
        self.assertIn("0xfe", parse_info.errors[0])
    def test_file_too_big(self):
        fp = self.MyParser(
            SimpleUploadedFile(
                name="valid-name.txt",
                content=b"1" + b"ten chars!" * 10,  # exceeds max size of 100
                content_type="text/plain",
            )
        )
        parse_info = fp.critical_parse()
        self.assertEqual(len(parse_info.errors), 1)
        self.assertIn("File size is larger", parse_info.errors[0])
    def test_wrong_extension(self):
        fp = self.MyParser(
            SimpleUploadedFile(
                name="invalid-ext.xml",
                content=b"This is fine",
                content_type="text/plain",
            )
        )
        parse_info = fp.critical_parse()
        self.assertEqual(len(parse_info.errors), 1)
        self.assertIn("Expected the TXT file to have extension", parse_info.errors[0])
    def test_invalid_char_in_filename(self):
        # Can't test "/" in the filename because SimpleUploadedFile treats it as a path separator
        for invalid_char in r"&\;*$ ":
            uploaded_file = SimpleUploadedFile(
                name="invalid" + invalid_char + "name.txt",
                content=b"This is fine",
                content_type="text/plain",
            )
            fp = self.MyParser(uploaded_file)
            parse_info = fp.critical_parse()
            self.assertEqual(
                len(parse_info.errors), 1, f"{invalid_char} should trigger 1 error"
            )
            self.assertIn(
                "Invalid characters were found",
                parse_info.errors[0],
                f"{invalid_char} is an invalid char",
            )
            self.assertIn(
                f": {invalid_char}",
                parse_info.errors[0],
                f"{invalid_char} is the invalid char",
            )
--- a/ietf/submit/parsers/xml_parser.py
+++ b/ietf/submit/parsers/xml_parser.py
@ -3,7 +3,6 @@ from ietf.submit.parsers.base import FileParser
 class XMLParser(FileParser):
    ext = 'xml'
    mimetypes = ['application/xml', 'text/xml', ]
    # If some error is found after this method invocation
    # no other file parsing is recommended
--- a/ietf/submit/tests.py
+++ b/ietf/submit/tests.py
@ -1700,7 +1700,6 @@ class SubmitTests(BaseSubmitTestCase):
        r, q, m = self.submit_bad_file("some name", ["txt"])
        self.assertIn('Invalid characters were found in the name', m)
        self.assertIn('Expected the TXT file to have extension ".txt"', m)
        self.assertIn('Expected an TXT file of type "text/plain"', m)
        self.assertIn('document does not contain a legitimate name', m)
    def test_submit_bad_doc_name(self):
@ -1718,7 +1717,6 @@ class SubmitTests(BaseSubmitTestCase):
        r, q, m = self.submit_bad_file("some name", ["xml"])
        self.assertIn('Invalid characters were found in the name', m)
        self.assertIn('Expected the XML file to have extension ".xml"', m)
        self.assertIn('Expected an XML file of type "application/xml"', m)
    def test_submit_file_in_archive(self):
        name = "draft-authorname-testing-file-exists"
@ -3123,6 +3121,20 @@ class SubmissionUploadFormTests(BaseSubmitTestCase):
        )
        self.assertFalse(form.is_valid())
    def test_invalid_xml(self):
        """Test error message for invalid XML"""
        not_xml = SimpleUploadedFile(
            name="not-xml.xml",
            content=b"this is not xml at all",
            content_type="application/xml",
        )
        form = SubmissionBaseUploadForm(RequestFactory().post('/some/url'), files={"xml": not_xml})
        self.assertFalse(form.is_valid())
        self.assertFormError(
            form,
            "xml",
            "The uploaded file is not valid XML. Please make sure you are uploading the correct file.",
        )
 class AsyncSubmissionTests(BaseSubmitTestCase):
    """Tests of async submission-related tasks"""
--- a/ietf/utils/xmldraft.py
+++ b/ietf/utils/xmldraft.py
@ -8,6 +8,7 @@ import xml2rfc
 import debug  # pyflakes: ignore
 from contextlib import ExitStack
 from lxml.etree import XMLSyntaxError
 from xml2rfc.util.date import augment_date, extract_date
 from ietf.utils.timezone import date_today
@ -54,6 +55,8 @@ class XMLDraft(Draft):
            parser = xml2rfc.XmlRfcParser(filename, quiet=True)
            try:
                tree = parser.parse()
            except XMLSyntaxError:
                raise InvalidXMLError()
            except Exception as e:
                raise XMLParseError(parser_out.getvalue(), parser_err.getvalue()) from e
@ -235,3 +238,8 @@ class XMLParseError(Exception):
    def parser_msgs(self):
        return self._out.splitlines() + self._err.splitlines()
 class InvalidXMLError(Exception):
    """File is not valid XML"""
    pass