parent c0f0d2c237
commit 020e7f89ff
@@ -9,6 +9,7 @@ class MetaDataDraft(object):
    revision = None
    filename = None
    group = None
    authors = None


class ParseInfo(object):
@@ -51,6 +52,7 @@ class FileParser(object):
            method()
            if self.parsed_info.errors:
                return self.parsed_info
        return self.parsed_info

    def parse_critical_000_invalid_chars_in_filename(self):
        name = self.fd.name
@@ -68,3 +68,268 @@ class PlainParser(FileParser):
            self.parsed_info.add_error('Invalid WG ID: %s' % group_acronym)
        else:
            self.parsed_info.metadraft.wg = IETFWG.objects.get(pk=NONE_WG_PK)

    def parse_critical_authors(self):
        """
        comes from http://svn.tools.ietf.org/svn/tools/ietfdb/branch/idsubmit/ietf/utils/draft.py
        """

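        # Strip RFC/Internet-Draft page headers and footers so the remaining
        # body lines can be scanned for author names and addresses.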
        def _stripheaders(rawlines):
            stripped = []
            pages = []
            page = []
            line = ""
            debug = False
            newpage = False
            sentence = False
            haveblank = False

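            # endpage()/begpage() flush the accumulated page buffer into the
            # pages list and start a new page.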
            def endpage(pages, page, line):
                if line:
                    page += [ line ]
                return begpage(pages, page)
            def begpage(pages, page, line=None):
                if page and len(page) > 5:
                    pages += [ "\n".join(page) ]
                    page = []
                    newpage = True
                if line:
                    page += [ line ]
                return pages, page

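            # Recognize page breaks, running headers and footers by pattern,
            # and keep only the body text in 'stripped'.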
            for line in rawlines:
                line = line.rstrip()
                if re.search("\[?[Pp]age [0-9ivx]+\]?[ \t\f]*$", line, re.I):
                    pages, page = endpage(pages, page, line)
                    continue
                if re.search("\f", line, re.I):
                    pages, page = begpage(pages, page)
                    continue
                if re.search("^ *Internet.Draft.+[12][0-9][0-9][0-9] *$", line, re.I):
                    pages, page = begpage(pages, page, line)
                    continue
                if re.search("^ *Draft.+[12][0-9][0-9][0-9] *$", line, re.I):
                    pages, page = begpage(pages, page, line)
                    continue
                if re.search("^RFC[ -]?[0-9]+.*( +)[12][0-9][0-9][0-9]$", line, re.I):
                    pages, page = begpage(pages, page, line)
                    continue
                if re.search("^draft-[-a-z0-9_.]+.*[0-9][0-9][0-9][0-9]$", line, re.I):
                    pages, page = endpage(pages, page, line)
                    continue
                if re.search(".{60,}(Jan|Feb|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|Sep|Oct|Nov|Dec) (19[89][0-9]|20[0-9][0-9]) *$", line, re.I):
                    pages, page = begpage(pages, page, line)
                    continue
                if newpage and re.search("^ *draft-[-a-z0-9_.]+ *$", line, re.I):
                    pages, page = begpage(pages, page, line)
                    continue
                if re.search("^[^ \t]+", line):
                    sentence = True
                if re.search("[^ \t]", line):
                    if newpage:
                        if sentence:
                            stripped += [""]
                    else:
                        if haveblank:
                            stripped += [""]
                    haveblank = False
                    sentence = False
                    newpage = False
                if re.search("[.:]$", line):
                    sentence = True
                if re.search("^[ \t]*$", line):
                    haveblank = True
                    page += [ line ]
                    continue
                page += [ line ]
                stripped += [ line ]
            pages, page = begpage(pages, page)
            return stripped, pages

        self.fd.file.seek(0)
        raw_lines = self.fd.file.read().split("\n")
        draft_lines, draft_pages = _stripheaders(raw_lines)

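        # Map common nicknames to full given names, so a nickname on the first
        # page can still match the full name in the Authors' Addresses section.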
        longform = {
            "Beth": "Elizabeth",
            "Bill": "William",
            "Bob": "Robert",
            "Dick": "Richard",
            "Fred": "Alfred",
            "Jerry": "Gerald",
            "Liz": "Elizabeth",
            "Lynn": "Carolyn",
            "Ned": "Edward",
            "Ted": "Edward",
        }
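        # Regex building blocks used to assemble the author-line patterns below.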
        aux = {
            "honor" : r"(?:Dr\.?|Prof(?:\.?|essor)|Sir|Lady|Dame)",
            "prefix": r"([Dd]e|Hadi|van|van de|van der|Ver|von)",
            "suffix": r"(jr|II|2nd|III|3rd|IV|4th)",
            "first" : r"([A-Z][-A-Za-z]*)((\.?[- ]{1,2}[A-Za-z]+)*)",
            "last"  : r"([-A-Za-z']{2,})",
        }
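        # Accepted first-page author line formats: "First Last", "Last, First",
        # and a bare last name.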
        authformats = [
            r" {6}(%(first)s[ \.]{1,3}((%(prefix)s )?%(last)s)( %(suffix)s)?)([, ]?(.+\.?|\(.+\.?|\)))?$" % aux,
            r" {6}(((%(prefix)s )?%(last)s)( %(suffix)s)?, %(first)s)([, ]([Ee]d\.?|\([Ee]d\.?\)))?$" % aux,
            r" {6}(%(last)s)$" % aux,
        ]

        authors = []
        companies = []

        # Collect first-page author information first
        have_blankline = False
        have_draftline = False
        prev_blankline = False
        for line in draft_lines[:15]:
            leading_space = len(re.findall("^ *", line)[0])
            line_len = len(line.rstrip())
            trailing_space = line_len <= 72 and 72 - line_len or 0
            # Truncate long lines at the first space past column 80:
            trunc_space = line.find(" ", 80)
            if line_len > 80 and trunc_space > -1:
                line = line[:trunc_space]
            if line_len > 60:
                # Look for centered title, break if found:
                if (leading_space > 5 and abs(leading_space - trailing_space) < 5):
                    break
            for authformat in authformats:
                match = re.search(authformat, line)
                if match:
                    author = match.group(1)
                    authors += [ author ]
            if line.strip() == "":
                if prev_blankline:
                    break
                have_blankline = True
                prev_blankline = True
            else:
                prev_blankline = False
            if "draft-" in line:
                have_draftline = True
            if have_blankline and have_draftline:
                break

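        # Second pass: for each author found on the first page, scan the draft
        # from the end towards the front for the matching entry in the
        # Authors' Addresses section.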
        found_pos = []
        for i in range(len(authors)):
            author = authors[i]
            if author == None:
                continue
            if "," in author:
                last, first = author.split(",", 1)
                author = "%s %s" % (first.strip(), last.strip())
            if not " " in author:
                if "." in author:
                    first, last = author.rsplit(".", 1)
                    first += "."
                else:
                    author = "[A-Z].+ " + author
                    first, last = author.rsplit(" ", 1)
            else:
                first, last = author.rsplit(" ", 1)

            for author in [ "%s %s" % (first, last), "%s %s" % (last, first), ]:
                # Pattern for full author information search, based on first page author name:
                authpat = author
                # Permit expansion of first name
                authpat = re.sub("\. ", ".* ", authpat)
                authpat = re.sub("\.$", ".*", authpat)
                # Permit insertion of middle name or initial
                authpat = re.sub(" ", "\S*( +[^ ]+)* +", authpat)
                # Permit expansion of double-name initials
                authpat = re.sub("-", ".*?-", authpat)
                # Some Chinese names are shown with double-letter (latin) abbreviated given names,
                # rather than a single-letter (latin) abbreviation:
                authpat = re.sub("^([A-Z])[A-Z]+\.\*", r"\1[-\w]+", authpat)
                authpat = "^(?:%s ?)?(%s)( *\(.*\)|,( [A-Z][-A-Za-z0-9]*)?)?" % (aux["honor"], authpat)
                start = 0
                col = None

                # Find start of author info for this author (if any).
                # Scan from the end of the file, looking for a match to authpat
                try:
                    for j in range(len(draft_lines)-1, 15, -1):
                        line = draft_lines[j].strip()
                        forms = [ line ] + [ line.replace(short, longform[short]) for short in longform if short in line ]
                        for line in forms:
                            if re.search(authpat, line):
                                start = j
                                columns = re.split("( +)", line)
                                # Find which column:
                                cols = [ c for c in range(len(columns)) if re.search(authpat+r"$", columns[c].strip()) ]
                                if cols:
                                    col = cols[0]
                                    if not (start, col) in found_pos:
                                        found_pos += [ (start, col) ]
                                        beg = len("".join(columns[:col]))
                                        if col == len(columns) or col == len(columns)-1:
                                            end = None
                                        else:
                                            end = beg + len("".join(columns[col:col+2]))
                                        author = re.search(authpat, columns[col].strip()).group(1)
                                        if author in companies:
                                            authors[i] = None
                                        else:
                                            authors[i] = author

                                        raise StopIteration("Found Author")
                except StopIteration:
                    pass
                if start and col != None:
                    break
            if not authors[i]:
                continue

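            # With the author's line and column located, scan the lines below
            # it (in the same column) for company names and an email address.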
            if start and col != None:
                done = False
                count = 0
                keyword = False
                blanklines = 0
                for line in draft_lines[start+1:]:
                    # Break on the third blank line
                    if not line:
                        blanklines += 1
                        if blanklines >= 3:
                            break
                        else:
                            continue
                    else:
                        count += 1
                    authmatch = [ a for a in authors[i+1:] if a and not a in companies and re.search((r"(^|\W)"+re.sub("\.? ", ".* ", a)+"(\W|$)"), line.strip()) ]
                    if authmatch:
                        if count == 1 or (count == 2 and not blanklines):
                            # First line after an author -- this is a company
                            companies += authmatch
                            companies += [ line.strip() ] # XXX fix this for columnized author list
                            companies = list(set(companies))
                            for k in range(i+1, len(authors)):
                                if authors[k] in companies:
                                    authors[k] = None
                        elif not "@" in line:
                            break
                        else:
                            pass

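                    # Take this author's column of the address block and undo
                    # simple email obfuscation before looking for an address.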
                    try:
                        column = line[beg:end].strip()
                    except:
                        column = line
                    column = re.sub(" *\(at\) *", "@", column)
                    column = re.sub(" *\(dot\) *", ".", column)

                    emailmatch = re.search("[-A-Za-z0-9_.+]+@[-A-Za-z0-9_.]+", column)
                    if emailmatch and not "@" in authors[i]:
                        email = emailmatch.group(0).lower()
                        authors[i] = "%s <%s>" % (authors[i], email)
            else:
                authors[i] = None

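        # Normalize whitespace, drop entries that never matched, and record the
        # result (or an error) on the parsed draft metadata.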
        authors = [ re.sub(r" +", " ", a) for a in authors if a != None ]
        if authors:
            authors.sort()
            self.parsed_info.metadraft.authors = authors
        else:
            self.parsed_info.errors.append("Draft authors could not be found.")

        return authors