Two levels of parsing. Fixes #584

- Legacy-Id: 2819
This commit is contained in:
Emilio A. Sánchez López 2011-02-08 08:26:12 +00:00
parent c0e1084a20
commit c0f0d2c237
8 changed files with 72 additions and 43 deletions

View file

@ -1,21 +1,8 @@
import datetime
from email.utils import parseaddr
from django import forms
from django.conf import settings
from django.db.models import Q
from django.forms.util import ErrorList
from django.forms.fields import email_re
from django.template.loader import render_to_string
from ietf.liaisons.accounts import (can_add_outgoing_liaison, can_add_incoming_liaison,
get_person_for_user, is_ietf_liaison_manager)
from ietf.liaisons.models import LiaisonDetail, Uploads, OutgoingLiaisonApproval, SDOs
from ietf.liaisons.utils import IETFHM
from ietf.liaisons.widgets import (FromWidget, ReadOnlyWidget, ButtonWidget,
ShowAttachmentsWidget, RelatedLiaisonWidget)
from ietf.proceedings.models import Meeting
from ietf.submit.parsers.plain_parser import PlainParser
from ietf.submit.parsers.pdf_parser import PDFParser
@ -85,22 +72,22 @@ class UploadForm(forms.Form):
yield fieldset_dict
def clean_txt(self):
    """Validate the uploaded plain-text file with ``PlainParser``.

    Returns the uploaded file unchanged so that it remains available in
    ``cleaned_data`` (Django stores whatever this method returns).

    Raises ``forms.ValidationError`` carrying the parser's error list when
    any parsing check reported a problem.
    """
    txt_file = self.cleaned_data['txt']
    # Parse once, through the single ``parse()`` entry point.
    parsed_info = PlainParser(txt_file).parse()
    if parsed_info.errors:
        raise forms.ValidationError(parsed_info.errors)
    return txt_file
def clean_pdf(self):
    """Validate the uploaded PDF file with ``PDFParser``.

    Returns the uploaded file unchanged so that it remains available in
    ``cleaned_data`` (Django stores whatever this method returns).

    Raises ``forms.ValidationError`` carrying the parser's error list when
    any parsing check reported a problem.
    """
    pdf_file = self.cleaned_data['pdf']
    # Parse once, through the single ``parse()`` entry point.
    parsed_info = PDFParser(pdf_file).parse()
    if parsed_info.errors:
        raise forms.ValidationError(parsed_info.errors)
    return pdf_file
def clean_ps(self):
    """Validate the uploaded PostScript file with ``PSParser``.

    Returns the uploaded file unchanged so that it remains available in
    ``cleaned_data`` (Django stores whatever this method returns).

    Raises ``forms.ValidationError`` carrying the parser's error list when
    any parsing check reported a problem.
    """
    ps_file = self.cleaned_data['ps']
    # Parse once, through the single ``parse()`` entry point.
    parsed_info = PSParser(ps_file).parse()
    if parsed_info.errors:
        raise forms.ValidationError(parsed_info.errors)
    return ps_file
def clean_xml(self):
    """Validate the uploaded XML file with ``XMLParser``.

    Returns the uploaded file unchanged so that it remains available in
    ``cleaned_data`` (Django stores whatever this method returns).

    Raises ``forms.ValidationError`` carrying the parser's error list when
    any parsing check reported a problem.
    """
    xml_file = self.cleaned_data['xml']
    # Parse once, through the single ``parse()`` entry point.
    parsed_info = XMLParser(xml_file).parse()
    if parsed_info.errors:
        raise forms.ValidationError(parsed_info.errors)
    return xml_file

View file

@ -5,11 +5,18 @@ import re
CUTOFF_HOUR = 17
class MetaDataDraft(object):
    """Plain holder for the metadata extracted from a submitted draft.

    Every attribute defaults to ``None`` at class level, so reading one that
    no parser has filled in yet yields ``None`` instead of raising
    ``AttributeError``.
    """

    # Revision taken from the draft name found in the document.
    revision = None
    # Draft name with the revision suffix removed.
    filename = None
    group = None
    # Working group assigned by PlainParser.parse_critical_003_wg, which
    # writes to ``wg`` rather than ``group``.
    wg = None
class ParseInfo(object):
def __init__(self):
self.errors = []
self.warnings = {}
self.metadraft = MetaDataDraft()
def add_error(self, error_str):
self.errors.append(error_str)
@ -25,7 +32,7 @@ class FileParser(object):
self.fd = fd
self.parsed_info = ParseInfo()
def parse(self):
    """Run the two-level parse and return the accumulated parse info.

    Checks are discovered by name. Methods whose name starts with
    ``parse_critical_`` run first, then those starting with
    ``parse_normal_``. ``dir()`` yields names in alphabetical order, so a
    numeric infix such as ``parse_critical_000_`` fixes the order within a
    phase.

    The normal phase is skipped when the critical phase reported any error.
    Normal checks may add errors as well.

    Returns ``self.parsed_info`` in every case, including when there is no
    file to parse and when no check reported an error.
    """
    if not self.fd:
        return self.parsed_info
    for prefix in ('parse_critical_', 'parse_normal_'):
        for attr in dir(self):
            if not attr.startswith(prefix):
                continue
            method = getattr(self, attr, None)
            if callable(method):
                method()
        # Do not continue with the next phase once an error was recorded.
        if self.parsed_info.errors:
            break
    return self.parsed_info
def parse_critical_invalid_chars_in_filename(self):
def parse_critical_000_invalid_chars_in_filename(self):
name = self.fd.name
regexp = re.compile(r'&|\|\/|;|\*|\s|\$')
chars = regexp.findall(name)

View file

@ -1,7 +1,8 @@
from ietf.submit.parsers.base import FileParser
class PDFParser(FileParser):
    """File parser for the PDF version of a submitted document."""

    def parse_critical_filename_extension(self):
        """Record an error unless the uploaded file name ends in ``.pdf``."""
        has_pdf_suffix = self.fd.name.endswith('.pdf')
        if has_pdf_suffix:
            return
        self.parsed_info.add_error('Format of this document must be PDF')

View file

@ -1,36 +1,70 @@
import re

from ietf.idtracker.models import IETFWG, InternetDraft
from ietf.submit.error_manager import MainErrorManager
from ietf.submit.parsers.base import FileParser
# Largest accepted value of the uploaded file's ``size`` attribute; anything
# bigger is rejected with the EXCEEDED_SIZE error (presumably bytes -- confirm).
MAX_PLAIN_FILE_SIZE = 6000000
# Primary key of the IETFWG row assigned to a new draft whose name does not
# start with 'draft-ietf-'.
NONE_WG_PK = 1027
class PlainParser(FileParser):
    """Parser for the plain-text version of a submitted draft.

    Each check is a ``parse_critical_*`` method that the base class looks up
    by name. The numeric infix (``001``, ``002``, ``003``) orders the checks
    that depend on each other: the working group lookup needs the draft name
    extracted by the filename check.
    """

    def parse_critical_max_size(self):
        """Add an error when the upload is larger than MAX_PLAIN_FILE_SIZE."""
        if self.fd.size > MAX_PLAIN_FILE_SIZE:
            self.parsed_info.add_error(MainErrorManager.get_error_str('EXCEEDED_SIZE'))

    def parse_critical_001_file_charset(self):
        """Add an error unless libmagic's MIME description mentions 'ascii'."""
        import magic
        self.fd.file.seek(0)
        m = magic.open(magic.MAGIC_MIME)
        m.load()
        filetype = m.buffer(self.fd.file.read())
        if 'ascii' not in filetype:
            self.parsed_info.add_error('A plain text document must be submitted.')

    def parse_critical_002_filename(self):
        """Extract the draft name and revision from the top of the document.

        Scans at most the first 80 lines for a ``draft-...`` token. On the
        first hit it stores the revision digits in ``metadraft.revision``
        (left untouched when the name has no numeric suffix) and the name
        without that suffix in ``metadraft.filename``. An error is added
        when the name holds characters other than ``0-9``, ``a-z`` and
        ``-``, or when no draft name is found at all.
        """
        self.fd.file.seek(0)
        draftre = re.compile(r'(draft-\S+)')
        revisionre = re.compile(r'.*-(\d+)$')
        for _ in range(80):
            line = self.fd.readline()
            match = draftre.search(line)
            if not match:
                continue
            filename = match.group(0)
            # Trim surrounding punctuation and an optional '.txt' extension.
            filename = re.sub(r'^[^\w]+', '', filename)
            filename = re.sub(r'[^\w]+$', '', filename)
            filename = re.sub(r'\.txt$', '', filename)
            extra_chars = re.sub(r'[0-9a-z\-]', '', filename)
            if extra_chars:
                # Sorted so the message is stable across runs.
                self.parsed_info.add_error(
                    'Filename contains non alpha-numeric character: %s'
                    % ', '.join(sorted(set(extra_chars))))
            match_revision = revisionre.match(filename)
            if match_revision:
                # group(1) is the digits only; group(0) would be the whole name.
                self.parsed_info.metadraft.revision = match_revision.group(1)
            filename = re.sub(r'-\d+$', '', filename)
            self.parsed_info.metadraft.filename = filename
            return
        self.parsed_info.add_error(MainErrorManager.get_error_str('INVALID_FILENAME'))

    def parse_critical_003_wg(self):
        """Resolve the working group for the draft and store it in ``metadraft.wg``.

        An already known draft keeps the group of its InternetDraft record.
        A new ``draft-ietf-<acronym>-...`` name is matched against the
        working group acronyms, and an unknown acronym adds an error. Any
        other new draft gets the IETFWG row whose primary key is NONE_WG_PK.
        """
        filename = self.parsed_info.metadraft.filename
        if not filename:
            # No draft name was extracted, so there is nothing to look up.
            return
        try:
            existing_draft = InternetDraft.objects.get(filename=filename)
            self.parsed_info.metadraft.wg = existing_draft.group
        except InternetDraft.DoesNotExist:
            if filename.startswith('draft-ietf-'):
                # Extra check for WG that contains dashes
                for group in IETFWG.objects.filter(group_acronym__acronym__contains='-'):
                    if filename.startswith('draft-ietf-%s-' % group.group_acronym.acronym):
                        self.parsed_info.metadraft.wg = group
                        return
                group_acronym = filename.split('-')[2]
                try:
                    self.parsed_info.metadraft.wg = IETFWG.objects.get(group_acronym__acronym=group_acronym)
                except IETFWG.DoesNotExist:
                    self.parsed_info.add_error('Invalid WG ID: %s' % group_acronym)
            else:
                self.parsed_info.metadraft.wg = IETFWG.objects.get(pk=NONE_WG_PK)

View file

@ -1,7 +1,8 @@
from ietf.submit.parsers.base import FileParser
class PSParser(FileParser):
    """File parser for the PostScript version of a submitted document."""

    def parse_critical_filename_extension(self):
        """Record an error unless the uploaded file name ends in ``.ps``."""
        has_ps_suffix = self.fd.name.endswith('.ps')
        if has_ps_suffix:
            return
        self.parsed_info.add_error('Format of this document must be PS')

View file

@ -1,7 +1,8 @@
from ietf.submit.parsers.base import FileParser
class XMLParser(FileParser):
    """File parser for the XML version of a submitted document."""

    def parse_critical_filename_extension(self):
        """Record an error unless the uploaded file name ends in ``.xml``."""
        has_xml_suffix = self.fd.name.endswith('.xml')
        if has_xml_suffix:
            return
        self.parsed_info.add_error('Format of this document must be XML')

View file

@ -1,6 +1,4 @@
from django.conf.urls.defaults import patterns, url
from django.db.models import Q
from ietf.liaisons.models import LiaisonDetail
urlpatterns = patterns('ietf.submit.views',

View file

@ -1,16 +1,6 @@
# Copyright The IETF Trust 2007, All Rights Reserved
import datetime
from email.utils import parseaddr
from django.conf import settings
from django.core.urlresolvers import reverse
from django.db.models import Q
from django.forms.fields import email_re
from django.http import HttpResponse, HttpResponseRedirect
from django.shortcuts import render_to_response, get_object_or_404
from django.shortcuts import render_to_response
from django.template import RequestContext
from django.utils import simplejson
from django.views.generic.list_detail import object_list, object_detail
from ietf.submit.forms import UploadForm