# Reconstructed from the patch "merging author parsing in plain parser.
# Closes #585." (commit 020e7f89ff7c0ce2455cbb753310719625ac6650,
# Legacy-Id: 2820).  That patch adds this method to PlainParser in
# ietf/submit/parsers/plain_parser.py; in ietf/submit/parsers/base.py it also
# adds ``authors = None`` to MetaDataDraft and a trailing
# ``return self.parsed_info`` inside FileParser.
def parse_critical_authors(self):
    """
    Extract the author list from the plain-text draft held in self.fd.file.

    comes from http://svn.tools.ietf.org/svn/tools/ietfdb/branch/idsubmit/ietf/utils/draft.py

    The draft is first stripped of page headers and footers.  Candidate
    author names are then collected from right-aligned entries in the first
    15 stripped lines, and each candidate is looked up again further down
    the document (scanning upwards from the end) to recover the full name
    and, when one follows it, an e-mail address.

    On success the sorted list of "Name" / "Name <email>" strings is stored
    in self.parsed_info.metadraft.authors.  When nothing is found the message
    "Draft authors could not be found." is appended to
    self.parsed_info.errors.  The (possibly empty) list is returned either
    way.
    """

    def begpage(pages, page, newpage, line=None):
        # A page is only closed once it holds more than five lines; shorter
        # fragments keep accumulating.  `newpage` is threaded through the
        # return value because assigning it here would only bind a local.
        if page and len(page) > 5:
            pages += ["\n".join(page)]
            page = []
            newpage = True
        if line:
            page += [line]
        return pages, page, newpage

    def endpage(pages, page, newpage, line):
        # The footer line still belongs to the page that is being closed.
        if line:
            page += [line]
        return begpage(pages, page, newpage)

    # Header/footer rules, tried in order: (pattern, handler, keep the line).
    header_rules = [
        (r"\[?[Pp]age [0-9ivx]+\]?[ \t\f]*$", endpage, True),
        (r"\f", begpage, False),
        (r"^ *Internet.Draft.+[12][0-9][0-9][0-9] *$", begpage, True),
        (r"^ *Draft.+[12][0-9][0-9][0-9] *$", begpage, True),
        (r"^RFC[ -]?[0-9]+.*( +)[12][0-9][0-9][0-9]$", begpage, True),
        (r"^draft-[-a-z0-9_.]+.*[0-9][0-9][0-9][0-9]$", endpage, True),
        (r".{60,}(Jan|Feb|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|Sep|Oct|Nov|Dec) (19[89][0-9]|20[0-9][0-9]) *$", begpage, True),
    ]

    def _stripheaders(rawlines):
        """Return (stripped_lines, pages) with page headers/footers removed."""
        stripped = []
        pages = []
        page = []
        newpage = False
        sentence = False
        haveblank = False

        for line in rawlines:
            line = line.rstrip()

            handler = None
            keep_line = False
            for pattern, action, keep in header_rules:
                if re.search(pattern, line, re.I):
                    handler, keep_line = action, keep
                    break
            else:
                # Right after a page break a line holding only the draft
                # name is a running header as well.
                if newpage and re.search(r"^ *draft-[-a-z0-9_.]+ *$", line, re.I):
                    handler, keep_line = begpage, True
            if handler is not None:
                pages, page, newpage = handler(
                    pages, page, newpage, line if keep_line else None)
                continue

            if re.search(r"^[^ \t]+", line):
                sentence = True
            if re.search(r"[^ \t]", line):
                if newpage:
                    # Across a page break, only separate paragraphs when the
                    # text before the break had finished a sentence.
                    if sentence:
                        stripped += [""]
                else:
                    if haveblank:
                        stripped += [""]
                haveblank = False
                sentence = False
                newpage = False
            if re.search(r"[.:]$", line):
                sentence = True
            if re.search(r"^[ \t]*$", line):
                # Blank lines are emitted lazily (above), so runs of blank
                # lines collapse into a single empty entry.
                haveblank = True
                page += [line]
                continue
            page += [line]
            stripped += [line]
        pages, page, newpage = begpage(pages, page, newpage)
        return stripped, pages

    self.fd.file.seek(0)
    raw_lines = self.fd.file.read().split("\n")
    draft_lines, _pages = _stripheaders(raw_lines)

    longform = {
        "Beth": "Elizabeth",
        "Bill": "William",
        "Bob": "Robert",
        "Dick": "Richard",
        "Fred": "Alfred",
        "Jerry": "Gerald",
        "Liz": "Elizabeth",
        "Lynn": "Carolyn",
        "Ned": "Edward",
        "Ted": "Edward",
    }
    aux = {
        "honor": r"(?:Dr\.?|Prof(?:\.?|essor)|Sir|Lady|Dame)",
        "prefix": r"([Dd]e|Hadi|van|van de|van der|Ver|von)",
        "suffix": r"(jr|II|2nd|III|3rd|IV|4th)",
        "first": r"([A-Z][-A-Za-z]*)((\.?[- ]{1,2}[A-Za-z]+)*)",
        "last": r"([-A-Za-z']{2,})",
    }
    authformats = [
        r" {6}(%(first)s[ \.]{1,3}((%(prefix)s )?%(last)s)( %(suffix)s)?)([, ]?(.+\.?|\(.+\.?|\)))?$" % aux,
        r" {6}(((%(prefix)s )?%(last)s)( %(suffix)s)?, %(first)s)([, ]([Ee]d\.?|\([Ee]d\.?\)))?$" % aux,
        r" {6}(%(last)s)$" % aux,
    ]

    authors = []
    companies = []

    # Collect first-page author information first
    have_blankline = False
    have_draftline = False
    prev_blankline = False
    for line in draft_lines[:15]:
        leading_space = len(re.findall(r"^ *", line)[0])
        line_len = len(line.rstrip())
        trailing_space = 72 - line_len if line_len <= 72 else 0
        # Truncate long lines at the first space past column 80:
        trunc_space = line.find(" ", 80)
        if line_len > 80 and trunc_space > -1:
            line = line[:trunc_space]
        if line_len > 60:
            # Look for centered title, break if found:
            if leading_space > 5 and abs(leading_space - trailing_space) < 5:
                break
            # Every format is tried; a line may contribute more than once.
            for authformat in authformats:
                match = re.search(authformat, line)
                if match:
                    authors += [match.group(1)]
        if line.strip() == "":
            if prev_blankline:
                break
            have_blankline = True
            prev_blankline = True
        else:
            prev_blankline = False
        if "draft-" in line:
            have_draftline = True
        if have_blankline and have_draftline:
            break

    found_pos = []

    def _locate(authpat):
        """Scan upwards from the end of the draft for `authpat`.

        Returns (start, col, hit).  `start`/`col` are the last line index and
        column index that matched (0 / None when nothing did).  `hit` is
        (beg, end, name) for the first position not already in found_pos,
        or None when no such position exists.
        """
        start = 0
        col = None
        for j in range(len(draft_lines) - 1, 15, -1):
            line = draft_lines[j].strip()
            forms = [line] + [line.replace(short, longform[short])
                              for short in longform if short in line]
            for form in forms:
                if not re.search(authpat, form):
                    continue
                start = j
                # Split side-by-side address blocks at the gutter.  Splitting
                # on single spaces would tear "First Last" apart so that no
                # column could ever match.
                # NOTE(review): a gutter of four or more spaces is assumed;
                # confirm against the upstream draft.py.
                columns = re.split(r"( {4,})", form)
                # Find which column:
                cols = [c for c in range(len(columns))
                        if re.search(authpat + r"$", columns[c].strip())]
                if not cols:
                    continue
                col = cols[0]
                if (start, col) in found_pos:
                    # Already claimed by an earlier author; keep looking.
                    continue
                beg = len("".join(columns[:col]))
                if col == len(columns) or col == len(columns) - 1:
                    end = None
                else:
                    end = beg + len("".join(columns[col:col + 2]))
                name = re.search(authpat, columns[col].strip()).group(1)
                return start, col, (beg, end, name)
        return start, col, None

    # Column slice used when reading the lines below an author's name.  It
    # deliberately persists across authors; until a position has been found
    # it selects the whole line.
    beg = 0
    end = None

    # Index loop on purpose: later entries of `authors` are blanked out
    # (set to None) while earlier ones are being processed.
    for i in range(len(authors)):
        author = authors[i]
        if author is None:
            continue
        if "," in author:
            last, first = author.split(",", 1)
            author = "%s %s" % (first.strip(), last.strip())
        if " " not in author:
            if "." in author:
                first, last = author.rsplit(".", 1)
                first += "."
            else:
                author = "[A-Z].+ " + author
                first, last = author.rsplit(" ", 1)
        else:
            first, last = author.rsplit(" ", 1)

        start = 0
        col = None
        for candidate in ("%s %s" % (first, last), "%s %s" % (last, first)):
            # Pattern for full author information search, based on first page author name:
            authpat = candidate
            # Permit expansion of first name
            authpat = re.sub(r"\. ", ".* ", authpat)
            authpat = re.sub(r"\.$", ".*", authpat)
            # Permit insertion of middle name or initial.  A plain string
            # replace is used because "\S" is not a valid escape in an
            # re.sub() replacement template.
            authpat = authpat.replace(" ", r"\S*( +[^ ]+)* +")
            # Permit expansion of double-name initials
            authpat = re.sub("-", ".*?-", authpat)
            # Some chinese names are shown with double-letter(latin) abbreviated given names, rather than
            # a single-letter(latin) abbreviation.  A callable replacement is
            # used because "\w" is not a valid template escape either.
            authpat = re.sub(r"^([A-Z])[A-Z]+\.\*",
                             lambda m: m.group(1) + r"[-\w]+", authpat)
            authpat = r"^(?:%s ?)?(%s)( *\(.*\)|,( [A-Z][-A-Za-z0-9]*)?)?" % (aux["honor"], authpat)

            # Find start of author info for this author (if any).
            start, col, hit = _locate(authpat)
            if hit is not None:
                found_pos.append((start, col))
                beg, end, name = hit
                authors[i] = None if name in companies else name
            if start and col is not None:
                break
        if not authors[i]:
            continue

        if start and col is not None:
            count = 0
            blanklines = 0
            for line in draft_lines[start + 1:]:
                # Break on the third blank line
                if not line:
                    blanklines += 1
                    if blanklines >= 3:
                        break
                    continue
                count += 1

                authmatch = [
                    a for a in authors[i + 1:]
                    if a and a not in companies
                    and re.search(r"(^|\W)" + re.sub(r"\.? ", ".* ", a) + r"(\W|$)",
                                  line.strip())
                ]
                if authmatch:
                    if count == 1 or (count == 2 and not blanklines):
                        # First line after an author -- this is a company
                        companies += authmatch
                        companies += [line.strip()]  # XXX fix this for columnized author list
                        companies = list(set(companies))
                        for k in range(i + 1, len(authors)):
                            if authors[k] in companies:
                                authors[k] = None
                    elif "@" not in line:
                        # Reached another author's entry.
                        break

                column = line[beg:end].strip()
                column = re.sub(r" *\(at\) *", "@", column)
                column = re.sub(r" *\(dot\) *", ".", column)

                emailmatch = re.search(r"[-A-Za-z0-9_.+]+@[-A-Za-z0-9_.]+", column)
                if emailmatch and "@" not in authors[i]:
                    email = emailmatch.group(0).lower()
                    authors[i] = "%s <%s>" % (authors[i], email)
        else:
            # Never seen again after the first page: not treated as an author.
            authors[i] = None

    authors = [re.sub(r" +", " ", a) for a in authors if a is not None]
    if authors:
        authors.sort()
        self.parsed_info.metadraft.authors = authors
    else:
        self.parsed_info.errors.append("Draft authors could not be found.")

    return authors