datatracker/ietf/utils/draft.py
Henrik Levkowetz 0c49999fc9 Updated utils/draft.py and modified the submit app code accordingly.
New features (keep in mind that utils/draft.py can be run standalone
to do extraction of draft author data, too):

  * The handling of author info formatted in columns could cause problems
    for an author named, for instance, A. Author with the company
    'Al Author and Associates', breaking email addresses wider than
    'Al Author and'.  Tweaked the recognition of column data to require
    multiple (not just one) spaces around 'and'.

  * Added support for extraction of author affiliation.

  * Tweaked the meaning of -t, --timestamp and added --notimestamp, and
    made emitting leading timestamps based on the draft file time the
    default.

  * Added support for running author extraction on RFCs, by not bailing
    out when no draft name is found but RFC information is available.

  * Added support for additional date formats and author name formats.

  * Improved creation date extraction -- previously, the first supported
    date format which was recognized on the first page of the draft would
    be used, rather than the first date in a supported format.  This could
    cause errors if the Status of Memo section or Abstract contained a
    date occurring at the start of a line.

  * Tweaked the honorific regex to make things work better for the case
    when the full name in the author's address section includes a first
    name which isn't part of the first-page abbreviated name.  Fixes
    problems with draft-chiappa-lisp-introduction and similar.

  * Added a special case for people who provide their email address as
    'foo&cisco.com' instead of 'foo@cisco.com'.  Bah.

  * Added an alternative, more human-readable key-value-pair attribute
    output mode with a '-a' switch.

  * Tweaked the first-name regex to capture cases where the first name
    is indicated with an alternate first letter: 'Y(J) Stein'.  Fixes
    problems with draft-anavi-tdmoip and similar.
 - Legacy-Id: 4612
2012-07-11 12:51:33 +00:00


#!/usr/bin/python
# -*- python -*-
"""
NAME
%(program)s - Extract meta-information from an IETF draft.
SYNOPSIS
%(program)s [OPTIONS] DRAFTLIST_FILE
DESCRIPTION
Extract information about authors' names and email addresses,
intended status and number of pages from Internet Drafts.
The information is emitted in the form of a line containing
xml-style attributes, prefixed with the name of the draft.
%(options)s
AUTHOR
Written by Henrik Levkowetz, <henrik@levkowetz.com>
COPYRIGHT
Copyright 2008 Henrik Levkowetz
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. There is NO WARRANTY; not even the
implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details.
"""
import datetime
import getopt
import gzip
import os
import os.path
import re
import stat
import sys
import time
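# Example of the default output (illustrative only; the draft name and field
# values below are hypothetical).  Each draft yields one line of xml-style
# attributes prefixed with the document name, e.g.:
#
#   draft-example-foo docauthors='A. Author <author@example.com>' docpages='12' ...
#
# With the '-a'/'--attribs' switch the same fields are instead written as one
# "key: value" pair per line.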
version = "0.26"
program = os.path.basename(sys.argv[0])
progdir = os.path.dirname(sys.argv[0])
# ----------------------------------------------------------------------
# Data
# ----------------------------------------------------------------------
opt_debug = False
opt_timestamp = False
opt_trace = False
opt_authorinfo = False
opt_getauthors = False
opt_attributes = False
# Don't forget to add the option variable to the globals list in _main below
# The following is an alias list for short forms which starts with a
# different letter than the long form.
longform = {
"Beth": "Elizabeth",
"Bill": "William",
"Bob": "Robert",
"Dick": "Richard",
"Fred": "Alfred",
"Jerry": "Gerald",
"Liz": "Elizabeth",
"Lynn": "Carolyn",
"Ned": "Edward",
"Ted":"Edward",
}
longform = dict([ (short+" ", longform[short]+" ") for short in longform ])
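# The trailing space on both keys and values restricts the later
# str.replace() to whole first names followed by a space (e.g. "Ted " ->
# "Edward "), so a short form embedded in another word is left untouched.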
month_names = [ 'jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec' ]
# ----------------------------------------------------------------------
# Functions
# ----------------------------------------------------------------------
def _debug(string):
if opt_debug:
sys.stderr.write("%s\n" % (string))
# ----------------------------------------------------------------------
def _note(string):
sys.stdout.write("%s: %s\n" % (program, string))
# ----------------------------------------------------------------------
def _warn(string):
sys.stderr.write("%s: Warning: %s\n" % (program, string))
# ----------------------------------------------------------------------
def _err(string):
sys.stderr.write("%s: Error: %s\n" % (program, string))
sys.exit(1)
# ----------------------------------------------------------------------
def _gettext(file):
file = open(file)
text = file.read()
file.close()
text = re.sub(".\x08", "", text) # Get rid of ink-ribbon backspace-emphasis
text = text.replace("\r\n", "\n") # Convert DOS to unix
text = text.replace("\r", "\n") # Convert MAC to unix
text = text.expandtabs()
text = text.strip()
return text
def acronym_match(s, l):
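# True if s is exactly the sequence of capital letters in l,
# e.g. "IBM" matches "International Business Machines".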
acronym = re.sub("[^A-Z]", "", l)
#_debug(" s:%s; l:%s => %s; %s" % (s, l, acronym, s==acronym))
return s == acronym
# ----------------------------------------------------------------------
class Draft():
def __init__(self, text, source):
self.source = source
self.rawtext = text
text = re.sub(".\x08", "", text) # Get rid of ink-ribbon backspace-emphasis
text = text.replace("\r\n", "\n") # Convert DOS to unix
text = text.replace("\r", "\n") # Convert MAC to unix
text = text.strip()
self.text = text
self.errors = {}
self.rawlines = self.text.split("\n")
self.lines, self.pages = self._stripheaders()
# Some things (such as the filename) has to be on the first page. If
# we didn't get back a set of pages, only one single page with the
# whole document, then we need to do an enforced page split in order
# to limit later searches to the first page.
if len(self.pages) <= 1:
self.pages = []
for pagestart in range(0, len(self.lines), 58):
self.pages += [ "\n".join(self.lines[pagestart:pagestart+54]) ]
try:
self.filename, self.revision = self._parse_draftname()
except ValueError, e:
_warn("While processing '%s': %s" % (self.source, e))
try:
path, base = self.source.rsplit("/", 1)
except ValueError:
path, base = "", self.source
if base.startswith("draft-"):
name, ext = base.split(".", 1)
revmatch = re.search("\d\d$", name)
if revmatch:
self.filename = name[:-3]
self.revision = name[-2:]
else:
raise ValueError(str(e)+"\n"+self.source)
else:
raise ValueError(str(e)+"\n"+self.source)
self._authors = None
self._authors_with_firm = None
self._author_info = None
self._abstract = None
self._pagecount = None
self._status = None
self._creation_date = None
self._title = None
# ------------------------------------------------------------------
def _parse_draftname(self):
draftname_regex = r"(draft-[a-z0-9-]*)-(\d\d)(\w|\.txt|\n|$)"
draftname_match = re.search(draftname_regex, self.pages[0])
rfcnum_regex = r"(Re[qg]uests? [Ff]or Commm?ents?:? +|Request for Comments: RFC |RFC-|RFC )((# ?)?[0-9]+)( |,|\n|$)"
rfcnum_match = re.search(rfcnum_regex, self.pages[0])
if draftname_match:
return (draftname_match.group(1), draftname_match.group(2) )
elif rfcnum_match:
return ("rfc"+rfcnum_match.group(2), None )
else:
self.errors["draftname"] = "Could not find the draft name and revision on the first page."
raise ValueError, self.errors["draftname"] + "\n'"+self.text.strip()[:240] + "'..."
return ("", "")
# ----------------------------------------------------------------------
def _stripheaders(self):
stripped = []
pages = []
page = []
line = ""
debug = False
newpage = False
sentence = False
blankcount = 0
linecount = 0
# two functions with side effects
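# endpage() appends the pending line (if any) to the current page and then
# flushes it; begpage() moves the accumulated page (when it holds more than a
# handful of lines) into pages and starts a new page, optionally seeded with
# the given line.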
def endpage(pages, page, newpage, line):
if line:
page += [ line ]
return begpage(pages, page, newpage)
def begpage(pages, page, newpage, line=None):
if page and len(page) > 5:
pages += [ "\n".join(page) ]
page = []
newpage = True
if line:
page += [ line ]
return pages, page, newpage
for line in self.rawlines:
linecount += 1
line = line.rstrip()
if re.search("\[?[Pp]age [0-9ivx]+\]?[ \t\f]*$", line, re.I):
pages, page, newpage = endpage(pages, page, newpage, line)
continue
if re.search("\f", line, re.I):
pages, page, newpage = begpage(pages, page, newpage)
continue
if re.search("^ *Internet.Draft.+[12][0-9][0-9][0-9] *$", line, re.I):
pages, page, newpage = begpage(pages, page, newpage, line)
continue
# if re.search("^ *Internet.Draft +", line, re.I):
# newpage = True
# continue
if re.search("^ *Draft.+[12][0-9][0-9][0-9] *$", line, re.I):
pages, page, newpage = begpage(pages, page, newpage, line)
continue
if re.search("^RFC[ -]?[0-9]+.*( +)[12][0-9][0-9][0-9]$", line, re.I):
pages, page, newpage = begpage(pages, page, newpage, line)
continue
if re.search("^draft-[-a-z0-9_.]+.*[0-9][0-9][0-9][0-9]$", line, re.I):
pages, page, newpage = endpage(pages, page, newpage, line)
continue
if linecount > 15 and re.search(".{58,}(Jan|Feb|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|Sep|Oct|Nov|Dec) (19[89][0-9]|20[0-9][0-9]) *$", line, re.I):
pages, page, newpage = begpage(pages, page, newpage, line)
continue
if newpage and re.search("^ *draft-[-a-z0-9_.]+ *$", line, re.I):
pages, page, newpage = begpage(pages, page, newpage, line)
continue
if re.search("^[^ \t]+", line):
sentence = True
if re.search("[^ \t]", line):
if newpage:
if sentence:
stripped += [""]
else:
if blankcount:
stripped += [""]*blankcount
blankcount = 0
sentence = False
newpage = False
if re.search("[.:]$", line):
sentence = True
if re.search("^[ \t]*$", line):
blankcount += 1
page += [ line ]
continue
page += [ line ]
stripped += [ line ]
pages, page, newpage = begpage(pages, page, newpage)
return stripped, pages
# ----------------------------------------------------------------------
def get_pagecount(self):
if self._pagecount == None:
self._pagecount = len(re.findall("\[[Pp]age [0-9ixldv]+\]", self.text)) or len(self.lines)/58
return self._pagecount
# ----------------------------------------------------------------------
def get_status(self):
if self._status == None:
for line in self.lines[:10]:
status_match = re.search("^\s*Intended [Ss]tatus:\s*(.*?) ", line)
if status_match:
self._status = status_match.group(1)
break
return self._status
# ------------------------------------------------------------------
def get_creation_date(self):
if self._creation_date:
return self._creation_date
date_regexes = [
r'^(?P<month>\w+)\s+(?P<day>\d{1,2}),?\s+(?P<year>\d{4})',
r'^(?P<day>\d{1,2}),?\s+(?P<month>\w+)\s+(?P<year>\d{4})',
r'^(?P<day>\d{1,2})-(?P<month>\w+)-(?P<year>\d{4})',
r'^(?P<month>\w+)\s+(?P<year>\d{4})',
r'\s{3,}(?P<month>\w+)\s+(?P<day>\d{1,2}),?\s+(?P<year>\d{4})',
r'\s{3,}(?P<day>\d{1,2}),?\s+(?P<month>\w+)\s+(?P<year>\d{4})',
r'\s{3,}(?P<day>\d{1,2})-(?P<month>\w+)-(?P<year>\d{4})',
# 'October 2008' - default day to today's.
r'\s{3,}(?P<month>\w+)\s+(?P<year>\d{4})',
]
dates = []
text = self.pages[0]
for regex in date_regexes:
match = re.search(regex, text, re.MULTILINE)
if match:
start = match.start()
if not "expires" in text[start-10:start].lower():
dates += [(start, match)]
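# Sort the candidate dates by position on the first page, so the earliest
# date wins regardless of which format's regex happened to match first.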
dates.sort()
for start, match in dates:
md = match.groupdict()
mon = md['month'][0:3].lower()
day = int( md.get( 'day', 0 ) )
year = int( md['year'] )
try:
month = month_names.index( mon ) + 1
today = datetime.date.today()
if day==0:
# if the date was given with only month and year, use
# today's date if month and year is today's month and
# year, otherwise pick the middle of the month.
# Don't use today's day for month and year in the past
if month==today.month and year==today.year:
day = today.day
else:
day = 15
self._creation_date = datetime.date(year, month, day)
return self._creation_date
except ValueError:
# month abbreviation not in month_names
# or month or day out of range
pass
self.errors['creation_date'] = 'Creation Date field is empty or the creation date is not in a proper format.'
return self._creation_date
# ------------------------------------------------------------------
def get_abstract(self):
if self._abstract:
return self._abstract
abstract_re = re.compile('^(\s*)abstract', re.I)
header_re = re.compile("^(\s*)([0-9]+\.? |Appendix|Status of|Table of|Full Copyright|Copyright|Intellectual Property|Acknowled|Author|Index|Disclaimer).*", re.I)
begin = False
abstract = []
abstract_indent = 0
look_for_header = False
for line in self.lines:
if not begin and abstract_re.match(line):
begin=True
abstract_indent = len(abstract_re.match(line).group(0))
continue
if begin:
if not line and not abstract:
continue
if not line:
look_for_header=True
abstract.append(line)
continue
if look_for_header and header_re.match(line):
break
look_for_header = False
abstract.append(line)
abstract = '\n'.join(abstract)
abstract = self._clean_abstract(abstract)
self._abstract = self._check_abstract_indent(abstract, abstract_indent)
return self._abstract
def _check_abstract_indent(self, abstract, indent):
indentation_re = re.compile('^(\s)*')
indent_lines = []
for line in abstract.split('\n'):
if line:
indent = len(indentation_re.match(line).group(0))
indent_lines.append(indent)
percents = {}
total = float(len(indent_lines))
formated = False
for indent in set(indent_lines):
count = indent_lines.count(indent)/total
percents[indent] = count
if count > 0.9:
formated = True
if not formated:
return abstract
new_abstract = []
for line in abstract.split('\n'):
if line:
indent = len(indentation_re.match(line).group(0))
if percents[indent] < 0.9:
break
new_abstract.append(line)
return '\n'.join(new_abstract)
def _clean_abstract(self, text):
text = re.sub("(?s)(Conventions [Uu]sed in this [Dd]ocument|Requirements [Ll]anguage)?[\n ]*The key words \"MUST\", \"MUST NOT\",.*$", "", text)
# Get rid of status/copyright boilerplate
text = re.sub("(?s)\nStatus of [tT]his Memo\n.*$", "", text)
# wrap long lines without messing up formatting of Ok paragraphs:
while re.match("([^\n]{72,}?) +", text):
text = re.sub("([^\n]{72,}?) +([^\n ]*)(\n|$)", "\\1\n\\2 ", text)
return text
# ------------------------------------------------------------------
def get_authors(self):
"""Returns a list of strings with author name and email within angle brackets"""
if self._authors == None:
self.extract_authors()
return self._authors
def get_authors_with_firm(self):
"""Returns a list of strings with author name and email within angle brackets"""
if self._authors_with_firm == None:
self.extract_authors()
return self._authors_with_firm
def get_author_list(self):
"""Returns a list of tuples, with each tuple containing (given_names,
surname, email, company). Email will be None if unknown.
"""
if self._author_info == None:
self.extract_authors()
return self._author_info
def extract_authors(self):
"""Extract author information from draft text.
"""
aux = {
"honor" : r"(?:[A-Z]\.|Dr\.?|Prof(?:\.?|essor)|Sir|Lady|Dame|Sri)",
"prefix": r"([Dd]e|Hadi|van|van de|van der|Ver|von|[Ee]l)",
"suffix": r"(jr.?|Jr.?|II|2nd|III|3rd|IV|4th)",
"first" : r"([A-Z][-A-Za-z]*)(( ?\([A-Z][-A-Za-z]*\))?(\.?[- ]{1,2}[A-Za-z]+)*)",
"last" : r"([-A-Za-z']{2,})",
"months": r"(January|February|March|April|May|June|July|August|September|October|November|December)",
"mabbr" : r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\.?",
}
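# The fragments above are interpolated (via %(name)s substitution) into the
# author, company and date patterns below.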
authcompanyformats = [
r" {6}(?P<author>(%(first)s[ \.]{1,3})+((%(prefix)s )?%(last)s)( %(suffix)s)?), (?P<company>[^.]+\.?)$" % aux,
r" {6}(?P<author>(%(first)s[ \.]{1,3})+((%(prefix)s )?%(last)s)( %(suffix)s)?) *\((?P<company>[^.]+\.?)\)$" % aux,
]
authformats = [
r" {6}((%(first)s[ \.]{1,3})+((%(prefix)s )?%(last)s)( %(suffix)s)?)(, ([^.]+\.?|\([^.]+\.?|\)))?,?$" % aux,
r" {6}(((%(prefix)s )?%(last)s)( %(suffix)s)?, %(first)s)?$" % aux,
r" {6}(%(last)s)$" % aux,
]
multiauthformats = [
(
r" {6}(%(first)s[ \.]{1,3}((%(prefix)s )?%(last)s)( %(suffix)s)?)(, ?%(first)s[ \.]{1,3}((%(prefix)s )?%(last)s)( %(suffix)s)?)+$" % aux,
r"(%(first)s[ \.]{1,3}((%(prefix)s )?%(last)s)( %(suffix)s)?)" % aux
),
]
editorformats = [
r"(?:, | )([Ee]d\.?|\([Ee]d\.?\)|[Ee]ditor)$",
]
companyformats = [
r" {6}(([A-Za-z'][-A-Za-z0-9.& ']+)(,? ?Inc\.?))$",
r" {6}(([A-Za-z'][-A-Za-z0-9.& ']+)(,? ?Ltd\.?))$",
r" {6}(([A-Za-z'][-A-Za-z0-9.& ']+)(/([A-Za-z'][-A-Za-z0-9.& ']+))+)$",
r" {6}([a-z0-9.-]+)$",
r" {6}(([A-Za-z'][-A-Za-z0-9.&']+)( [A-Za-z'][-A-Za-z0-9.&']+)*)$",
r" {6}(([A-Za-z'][-A-Za-z0-9.']+)( & [A-Za-z'][-A-Za-z0-9.']+)*)$",
r" {6}\((.+)\)$",
r" {6}(\w+\s?\(.+\))$",
]
dateformat = r"(((%(months)s|%(mabbr)s) \d+, |\d+ (%(months)s|%(mabbr)s),? |\d+/\d+/)\d\d\d\d|\d\d\d\d-\d\d-\d\d)$" % aux
address_section = r"^ *([0-9]+\.)? *(Author|Editor)('s|s'|s|\(s\)) (Address|Addresses|Information)"
ignore = [
"Standards Track", "Current Practice", "Internet Draft", "Working Group",
"Expiration Date",
]
def make_authpat(hon, first, last, suffix):
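# Build a regex that matches this author's full name in the address section:
# abbreviated first names ("A.") are expanded to a wildcard, any middle part
# becomes optional, and honorifics and suffixes are allowed around the name.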
def dotexp(s):
s = re.sub("\. ", ".* ", s)
s = re.sub("\.$", ".*", s)
return s
first = dotexp(first)
last = dotexp(last)
first = re.sub("[()]", " ", first)
if " " in first:
# if there's a middle part, let it be optional
first, middle = first.split(" ", 1)
first = "%s( +%s)?" % (first, middle)
# Some chinese names are shown with double-letter(latin) abbreviated given names, rather than
# a single-letter(latin) abbreviation:
first = re.sub("^([A-Z])[A-Z]+\.\*", r"\1[-\w]+", first)
# permit insertion of middle names between first and last, and
# add possible honorific and suffix information
authpat = "(?:^| and )(?:%(hon)s ?)?(%(first)s\S*( +[^ ]+)* +%(last)s)( *\(.*|,( [A-Z][-A-Za-z0-9]*)?| %(suffix)s| [A-Z][a-z]+)?" % {"hon":hon, "first":first, "last":last, "suffix":suffix,}
return authpat
authors = []
companies = []
author_info = []
companies_seen = []
self._docheader = ""
# Collect first-page author information first
have_blankline = False
have_draftline = False
prev_blankline = False
for line in self.lines[:30]:
self._docheader += line+"\n"
author_on_line = False
_debug( "**" + line)
leading_space = len(re.findall("^ *", line)[0])
line_len = len(line.rstrip())
trailing_space = line_len <= 72 and 72 - line_len or 0
# Truncate long lines at the first space past column 80:
trunc_space = line.find(" ", 80)
if line_len > 80 and trunc_space > -1:
line = line[:trunc_space]
if line_len > 60:
# Look for centered title, break if found:
if (leading_space > 5 and abs(leading_space - trailing_space) < 5):
_debug("Breaking for centered line")
break
if re.search(dateformat, line):
if authors:
_debug("Breaking for dateformat after author name")
break
for editorformat in editorformats:
if re.search(editorformat, line):
line = re.sub(editorformat, "", line)
break
for lineformat, authformat in multiauthformats:
match = re.search(lineformat, line)
if match:
_debug("Multiauth format: '%s'" % lineformat)
author_list = re.findall(authformat, line)
authors += [ a[0] for a in author_list ]
companies += [ None for a in author_list ]
author_on_line = True
#_debug("\nLine: " + line)
#_debug("Format: " + authformat)
for author in author_list:
_debug("Author: '%s'" % author[0])
break
if not author_on_line:
for lineformat in authcompanyformats:
match = re.search(lineformat, line)
if match:
_debug("Line format: '%s'" % lineformat)
author = match.group("author")
company = match.group("company")
authors += [ author, '']
companies += [ None, company ]
#_debug("\nLine: " + line)
#_debug("Format: " + authformat)
_debug("Author: '%s'" % author)
_debug("Company: '%s'" % company)
author_on_line = True
break
if not author_on_line:
for authformat in authformats:
match = re.search(authformat, line)
if match:
_debug("Auth format: '%s'" % authformat)
author = match.group(1)
authors += [ author ]
companies += [ None ]
#_debug("\nLine: " + line)
#_debug("Format: " + authformat)
_debug("Author: '%s'" % author)
author_on_line = True
break
if not author_on_line:
for authformat in companyformats:
match = re.search(authformat, line)
if match:
_debug("Auth format: '%s'" % authformat)
company = match.group(1)
authors += [ "" ]
companies += [ company ]
#_debug("\nLine: " + line)
#_debug("Format: " + authformat)
_debug("Company: '%s'" % company)
break
if authors and not author_on_line:
# Retain information about blank lines in author list
authors += [""]
companies += [ "" ]
if line.strip() == "":
if prev_blankline and authors:
_debug("Breaking for having found consecutive blank lines after author name")
break
if authors:
have_blankline = True
prev_blankline = True
else:
prev_blankline = False
if "draft-" in line:
have_draftline = True
if have_blankline and have_draftline:
_debug("Breaking for having found both blank line and draft-name line")
break
# remove trailing blank entries in the author list:
for i in range(len(authors)-1,-1,-1):
if authors[i] == "" and companies[i] == "":
del authors[i]
del companies[i]
else:
break
_debug("A:companies : %s" % str(companies))
#companies = [ None if a else '' for a in authors ]
#_debug("B:companies : %s" % str(companies))
#find authors' addresses section if it exists
last_line = len(self.lines)-1
address_section_pos = last_line/2
for i in range(last_line/2,last_line):
line = self.lines[i]
if re.search(address_section, line):
address_section_pos = i
break
found_pos = []
for i in range(len(authors)):
_debug("1: authors[%s]: %s" % (i, authors[i]))
_debug(" company[%s]: %s" % (i, companies[i]))
author = authors[i]
if author in [ None, '', ]:
continue
suffix_match = re.search(" %(suffix)s$" % aux, author)
if suffix_match:
suffix = suffix_match.group(1)
author = author[:-len(suffix)].strip()
else:
suffix = None
if "," in author:
last, first = author.split(",",1)
author = "%s %s" % (first.strip(), last.strip())
if not " " in author:
if "." in author:
first, last = author.rsplit(".", 1)
first += "."
else:
author = "[A-Z].+ " + author
first, last = author.rsplit(" ", 1)
else:
first, last = author.rsplit(" ", 1)
prefix_match = re.search(" %(prefix)s$" % aux, first)
if prefix_match:
prefix = prefix_match.group(1)
first = first[:-len(prefix)].strip()
last = prefix+" "+last
_debug("First, Last: '%s' '%s'" % (first, last))
for firstname, surname, casefixname in [ (first,last,last), (last,first,first), (first,last,last.upper()), (last,first,first.upper()), ]:
author = "%s %s" % (firstname, casefixname)
_debug("\nAuthors: "+str(authors))
_debug("Author: "+author)
# Pattern for full author information search, based on first page author name:
authpat = make_authpat(aux['honor'], firstname, casefixname, aux['suffix'])
_debug("Authpat: " + authpat)
start = 0
col = None
# Find start of author info for this author (if any).
# Scan towards the front from the end of the file, looking for a match to authpath
for j in range(last_line, address_section_pos, -1):
line = self.lines[j]
_debug( "Line: " + line)
forms = [ line ] + [ line.replace(short, longform[short]) for short in longform if short in line ]
for form in forms:
try:
if re.search(authpat, form.strip()) and not j in found_pos:
_debug( "Match")
start = j
found_pos += [ start ]
_debug( " ==> start %s, normalized '%s'" % (start, form.strip()))
# The author info could be formatted in multiple columns...
columns = re.split("( +| and )", form)
# _debug( "Columns:" + str(columns))
# Find which column:
# _debug( "Col range:" + str(range(len(columns))))
cols = [ c for c in range(len(columns)) if re.search(authpat+r"( and |, |$)", columns[c].strip()) ]
if cols:
col = cols[0]
if not (start, col) in found_pos:
found_pos += [ (start, col) ]
_debug( "Col: %d" % col)
beg = len("".join(columns[:col]))
_debug( "Beg: %d '%s'" % (beg, "".join(columns[:col])))
_debug( "Len: %d" % len(columns))
if col == len(columns) or col == len(columns)-1:
end = None
_debug( "End1: %s" % end)
else:
end = beg + len("".join(columns[col:col+2]))
_debug( "End2: %d '%s'" % (end, "".join(columns[col:col+2])))
_debug( "Cut: '%s'" % form[beg:end])
author_match = re.search(authpat, columns[col].strip()).group(1)
_debug( "AuthMatch: '%s'" % (author_match,))
if author_match in companies_seen:
companies[i] = authors[i]
authors[i] = None
else:
if casefixname in author_match:
fullname = author_match.replace(casefixname, surname)
else:
fullname = author_match
fullname = re.sub(" +", " ", fullname)
given_names, surname = fullname.rsplit(None, 1)
if " " in given_names:
first, middle = given_names.split(None, 1)
else:
first = given_names
middle = None
names = (first, middle, surname, suffix)
if suffix:
fullname = fullname+" "+suffix
if not " ".join([ n for n in names if n ]) == fullname:
_err("Author tuple doesn't match text in draft: %s, %s" % (authors[i], fullname))
authors[i] = (fullname, first, middle, surname, suffix)
companies[i] = None
#_debug( "Author: %s: %s" % (author_match, authors[author_match]))
break
except AssertionError, e:
sys.stderr.write("filename: "+self.filename+"\n")
sys.stderr.write("authpat: "+authpat+"\n")
raise
if start and col != None:
break
if start and col != None:
break
if not authors[i]:
continue
_debug("2: authors[%s]: %s" % (i, authors[i]))
if start and col != None:
_debug("\n * %s" % (authors[i], ))
done = False
nonblank_count = 0
keyword = False
blanklines = 0
email = None
for line in self.lines[start+1:]:
_debug( " " + line.strip())
# Break on the third blank line
if not line:
blanklines += 1
if blanklines >= 3:
_debug( " - Break on blanklines")
break
else:
continue
else:
nonblank_count += 1
# Maybe break on author name
# _debug("Line: %s"%line.strip())
# for a in authors:
# if a and a not in companies_seen:
# _debug("Search for: %s"%(r"(^|\W)"+re.sub("\.? ", ".* ", a)+"(\W|$)"))
authmatch = [ a for a in authors[i+1:] if a and not a.lower() in companies_seen and (re.search((r"(?i)(^|\W)"+re.sub("\.? ", ".* ", a)+"(\W|$)"), line.strip()) or acronym_match(a, line.strip()) )]
if authmatch:
_debug(" ? Other author or company ? : %s" % authmatch)
_debug(" Line: "+line.strip())
if nonblank_count == 1 or (nonblank_count == 2 and not blanklines):
# First line after an author -- this is a company
companies_seen += [ c.lower() for c in authmatch ]
companies_seen += [ line.strip().lower() ] # XXX fix this for columnized author list
companies_seen = list(set(companies_seen))
_debug(" -- Companies: " + ", ".join(companies_seen))
for k in range(i+1, len(authors)):
if authors[k] and authors[k].lower() in companies_seen:
companies[k] = authors[k]
authors[k] = None
elif blanklines and not "@" in line:
# Break on an author name
_debug( " - Break on other author name")
break
else:
pass
try:
column = line[beg:end].strip()
except:
column = line
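# Normalize obfuscated or mistyped email addresses such as
# "user (at) example (dot) com" or "foo&cisco.com" before searching for an
# address in this column.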
column = re.sub(" *\(at\) *", "@", column)
column = re.sub(" *\(dot\) *", ".", column)
column = re.sub(" +at +", "@", column)
column = re.sub(" +dot +", ".", column)
column = re.sub("&cisco.com", "@cisco.com", column)
# if re.search("^\w+: \w+", column):
# keyword = True
# else:
# if keyword:
# # Break on transition from keyword line to something else
# _debug( " - Break on end of keywords")
# break
#_debug( " Column text :: " + column)
_debug("3: authors[%s]: %s" % (i, authors[i]))
emailmatch = re.search("[-A-Za-z0-9_.+]+@[-A-Za-z0-9_.]+", column)
if emailmatch and not "@" in author:
email = emailmatch.group(0).lower()
break
authors[i] = authors[i] + ( email, )
else:
if not author in ignore:
companies[i] = authors[i]
_debug("Not an author? '%s'" % (author))
authors[i] = None
assert(len(authors) == len(companies))
_debug('Author list: %s' % authors)
_debug('Company list: %s' % companies)
for i in range(len(authors)):
if authors[i]:
_debug('authors[%s]: %s' % (i, authors[i]))
company = ''
for k in range(i+1, len(companies)):
_debug('companies[%s]: %s' % (k, companies[k]))
if companies[k] != None:
company = companies[k]
break
authors[i] = authors[i] + ( company, )
authors = [ a for a in authors if a ]
_debug(" * Final author tuples: %s" % (authors,))
_debug(" * Final company list: %s" % (companies,))
_debug(" * Final companies_seen: %s" % (companies_seen,))
self._author_info = authors
self._authors_with_firm = [ "%s <%s> (%s)"%(full,email,company) for full,first,middle,last,suffix,email,company in authors ]
self._authors = [ "%s <%s>"%(full,email) if email else full for full,first,middle,last,suffix,email,company in authors ]
self._authors.sort()
_debug(" * Final author list: " + ", ".join(self._authors))
_debug("-"*72)
# ------------------------------------------------------------------
def get_title(self):
if self._title:
return self._title
match = re.search('(?:\n\s*\n\s*)((.+\n){0,2}(.+\n*))(\s+<?draft-\S+\s*\n)\s*\n', self.pages[0])
if not match:
match = re.search('(?:\n\s*\n\s*)<?draft-\S+\s*\n*((.+\n){1,3})\s*\n', self.pages[0])
if not match:
match = re.search('(?:\n\s*\n\s*)((.+\n){0,2}(.+\n*))(\s*\n){2}', self.pages[0])
if not match:
match = re.search('(?i)(.+\n|.+\n.+\n)(\s*status of this memo\s*\n)', self.pages[0])
if match:
title = match.group(1)
title = title.strip()
title = re.sub('\s*\n\s*', ' ', title)
title = re.sub(' +', ' ', title)
self._title = title
return self._title
self.errors["title"] = "Could not find the title on the first page."
# ------------------------------------------------------------------
def get_refs(self):
refs = []
normrefs = []
rfcrefs = []
refline = None
for i in range(len(self.lines)-1, 15, -1):
if re.search(r"(?i)^ *[0-9.]+ *(((normative|informative|informational|non-normative) )?references|references\W+(normative|informative))", self.lines[i]):
refline = i
break
if refline:
for i in range(refline, len(self.lines)):
line = self.lines[i].strip()
ref_match = re.search(r"(?i)^\[[a-z0-9.-]+( [a-z0-9.-]+)?\].+", line)
if ref_match:
para = line
while True:
i += 1
if i >= len(self.lines):
break
line = self.lines[i].strip()
if not line:
break
if para[-1] not in ["-", "/"]:
para += " "
para += line
refs += [ para ]
rfc_match = re.search("(?i)rfc ?\d+", para)
if rfc_match:
rfc = rfc_match.group(0).replace(" ","").lower()
rfcrefs += [ rfc ]
normrefs = list(set(normrefs))
normrefs.sort()
rfcrefs = list(set(rfcrefs))
rfcrefs.sort()
refs = list(set(refs))
refs.sort()
return normrefs, rfcrefs, refs
# ----------------------------------------------------------------------
def getmeta(fn):
# Initial values
fields = {}
fields["eventsource"] = "draft"
if " " in fn or not fn.endswith(".txt"):
_warn("Skipping unexpected draft name: '%s'" % (fn))
return {}
if os.path.exists(fn):
filename = fn
fn = os.path.basename(fn)
else:
if fn.lower().startswith('rfc'):
filename = os.path.join("/www/tools.ietf.org/rfc", fn)
elif not "/" in fn:
filename = os.path.join("/www/tools.ietf.org/id", fn)
if not os.path.exists(filename):
fn = filename
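# The requested revision isn't on disk; step back through earlier revisions
# until an existing file is found (or rev -00 is reached).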
while not "-00." in fn:
revmatch = re.search("-(\d\d)\.", fn)
if revmatch:
rev = revmatch.group(1)
prev = "%02d" % (int(rev)-1)
fn = fn.replace("-%s."%rev, "-%s."%prev)
if os.path.exists(fn):
_warn("Using rev %s instead: '%s'" % (prev, filename))
filename = fn
fn = os.path.basename(fn)
break
else:
break
else:
filename = fn
if not os.path.exists(filename):
_warn("Could not find file: '%s'" % (filename))
return
timestamp = time.strftime("%Y-%m-%dT%H:%M:%S+00:00", time.gmtime(os.stat(filename)[stat.ST_MTIME]))
text = _gettext(filename)
draft = Draft(text, filename)
#_debug("\n".join(draft.lines))
fields["eventdate"] = timestamp
if draft.filename:
fields["doctag"] = draft.filename
fields["docrev"] = draft.revision
fields["doctitle"] = draft.get_title()
fields["docpages"] = str(draft.get_pagecount())
fields["docauthors"] = ", ".join(draft.get_authors())
fields["_authorlist"] = draft.get_author_list()
fields["docaffiliations"] = ", ".join(draft.get_authors_with_firm())
if opt_debug:
fields["docheader"] = draft._docheader
normrefs, rfcrefs, refs = draft.get_refs()
fields["docrfcrefs"] = ", ".join(rfcrefs)
fields["doccreationdate"] = str(draft.get_creation_date())
deststatus = draft.get_status()
if deststatus:
fields["docdeststatus"] = deststatus
abstract = draft.get_abstract()
if abstract:
fields["docabstract"] = abstract
return fields
# ----------------------------------------------------------------------
def _output(docname, fields, outfile=sys.stdout):
if opt_getauthors:
# Output an (incomplete!) getauthors-compatible format. Country
# information is always UNKNOWN, and information about security and
# iana sections presence is missing.
for full,first,middle,last,suffix,email,company in fields["_authorlist"]:
if company in company_domain:
company = company_domain[company]
else:
if email and '@' in email:
company = email.split('@')[1]
if company.endswith(".com"):
company = company[:-4]
fields["name"] = full
fields["email"] = email
fields["company"] = company
fields["country"] = "UNKNOWN"
try:
year, month, day = fields["doccreationdate"].split("-")
except ValueError:
year, month, day = "UNKNOWN", "UNKNOWN", "UNKNOWN"
fields["day"] = day
fields["month"] = month_names[int(month)-1] if month != "UNKNOWN" else "UNKNOWN"
fields["year"] = year
print "%(doctag)s:%(name)s:%(company)s:%(email)s:%(country)s:%(docpages)s:%(month)s:%(year)s:%(day)s:" % fields
else:
if opt_attributes:
def outputkey(key, fields):
outfile.write("%-24s: %s\n" % ( key, fields[key].strip().replace("\\", "\\\\" ).replace("'", "\\x27" )))
else:
def outputkey(key, fields):
outfile.write(" %s='%s'" % ( key.lower(), fields[key].strip().replace("\\", "\\\\" ).replace("'", "\\x27" ).replace("\n", "\\n")))
if opt_timestamp:
outfile.write("%s " % (fields["eventdate"]))
outfile.write("%s" % (os.path.basename(docname.strip())))
keys = fields.keys()
keys.sort()
for key in keys:
if fields[key] and not key in ["eventdate", ] and not key.startswith("_"):
outputkey(key, fields)
outfile.write("\n")
# ----------------------------------------------------------------------
def _printmeta(fn, outfile=sys.stdout):
if opt_trace:
t = time.time()
sys.stderr.write("%-58s" % fn[:-4])
fields = getmeta(fn)
if fields:
_output(fields.get("doctag", fn[:-7]), fields, outfile)
if opt_trace:
sys.stderr.write("%5.1f\n" % ((time.time() - t)))
# ----------------------------------------------------------------------
# Main
# ----------------------------------------------------------------------
def _main(outfile=sys.stdout):
global opt_debug, opt_timestamp, opt_trace, opt_authorinfo, opt_getauthors, files, company_domain, opt_attributes
# set default values, if any
# ----------------------------------------------------------------------
# Option processing
# ----------------------------------------------------------------------
options = ""
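# Build the OPTIONS help text by scanning this script's own source for the
# option-handling lines below and their trailing comments.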
for line in re.findall("\n +(if|elif) +opt in \[(.+)\]:\s+#(.+)\n", open(sys.argv[0]).read()):
if not options:
options += "OPTIONS\n"
options += " %-16s %s\n" % (line[1].replace('"', ''), line[2])
options = options.strip()
# with ' < 1:' on the next line, this is a no-op:
if len(sys.argv) < 1:
vars = globals()
vars.update(locals())
print __doc__ % vars
sys.exit(1)
try:
opts, files = getopt.gnu_getopt(sys.argv[1:], "dhatTv", ["debug", "getauthors", "attribs", "attributes", "help", "timestamp", "notimestamp", "trace", "version",])
except Exception, e:
print "%s: %s" % (program, e)
sys.exit(1)
# parse options
for opt, value in opts:
if opt in ["-d", "--debug"]: # Output debug information
opt_debug = True
elif opt in ["-h", "--help"]: # Output this help text, then exit
vars = globals()
vars.update(locals())
print __doc__ % vars
sys.exit(1)
elif opt in ["-v", "--version"]: # Output version information, then exit
print program, version
sys.exit(0)
elif opt in ["--getauthors"]: # Output an (incomplete) getauthors-compatible format
opt_getauthors = True
elif opt in ["-a", "--attribs"]: # Output key-value attribute pairs
opt_attributes = True
elif opt in ["-t", ]: # Toggle leading timestamp information
opt_timestamp = not opt_timestamp
elif opt in ["--timestamp"]: # Emit leading timestamp information
opt_timestamp = True
elif opt in ["--notimestamp"]: # Omit leading timestamp information
opt_timestamp = False
elif opt in ["-T", "--trace"]: # Emit trace information while working
opt_trace = True
company_domain = {}
if opt_getauthors:
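# Load the company-name to abbreviation mapping used by the
# getauthors-compatible output from the getauthors data file.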
gadata = open("/www/tools.ietf.org/tools/getauthors/getauthors.data")
for line in gadata:
if line.startswith("company:"):
try:
kword, name, abbrev = line.strip().split(':')
company_domain[name] = abbrev
except ValueError:
pass
if not files:
files = [ "-" ]
for file in files:
_debug( "Reading drafts from '%s'" % file)
if file == "-":
file = sys.stdin
elif file.endswith(".gz"):
file = gzip.open(file)
else:
file = open(file)
basename = os.path.basename(file.name)
if basename.startswith("draft-"):
draft = basename
_debug( "** Processing '%s'" % draft)
_printmeta(file.name, outfile)
else:
for line in file:
draft = line.strip()
if draft.startswith("#"):
continue
if draft:
_debug( "** Processing '%s'" % draft)
_printmeta(draft, outfile)
if __name__ == "__main__":
try:
_main()
except KeyboardInterrupt:
raise