datatracker/ietf/utils/draft.py
Robert Sparks b926178e62
fix: quicker calculation of status from draft text (#8111)
* fix: quicker calculation of status from draft text

* chore: remove unused import

* fix: only read a small prefix of draft text when needed
2024-10-29 11:18:31 -05:00

1466 lines
63 KiB
Python
Executable file

#!/usr/bin/python
# Copyright The IETF Trust 2009-2022, All Rights Reserved
# -*- coding: utf-8 -*-
# -*- python -*-
"""
NAME
%(program)s - Extract meta-information from an IETF draft.
SYNOPSIS
%(program)s [OPTIONS] DRAFTLIST_FILE
DESCRIPTION
Extract information about authors' names and email addresses,
intended status and number of pages from Internet-Drafts.
The information is emitted in the form of a line containing
xml-style attributes, prefixed with the name of the draft.
%(options)s
AUTHOR
Written by Henrik Levkowetz, <henrik@levkowetz.com>
COPYRIGHT
Copyright 2008 Henrik Levkowetz
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or (at
your option) any later version. There is NO WARRANTY; not even the
implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
PURPOSE. See the GNU General Public License for more details.
"""
import debug # pyflakes: ignore
import datetime
import getopt
import io
import os
import os.path
import re
import stat
import sys
import time
from typing import Dict, List # pyflakes:ignore
from .timezone import date_today
# Script identity: the version string, plus the name and directory the
# program was invoked with (taken from sys.argv[0]).
version = "0.35"
progdir, program = os.path.split(sys.argv[0])
# ----------------------------------------------------------------------
# Data
# ----------------------------------------------------------------------
# Option flags; every one of them defaults to off.
opt_debug = opt_timestamp = opt_trace = opt_authorinfo = opt_attributes = False
# Don't forget to add the option variable to the globals list in _main below
# Alias list for short forms of given names that start with a different
# letter than the long form they stand for.
longform = {
    "Beth": "Elizabeth",
    "Bill": "William",
    "Bob": "Robert",
    "Dick": "Richard",
    "Fred": "Alfred",
    "Jerry": "Gerald",
    "Liz": "Elizabeth",
    "Lynn": "Carolyn",
    "Ned": "Edward",
    "Ted": "Edward",
}
# Re-key the table so that both the short and the long form carry a trailing
# space; the table is used for plain substring replacement on text lines.
longform = {short + " ": full + " " for short, full in longform.items()}
# Lower-case month names, and their three- and four-letter prefixes.
month_names = 'january february march april may june july august september october november december'.split()
month_names_abbrev3 = [name[:3] for name in month_names]
month_names_abbrev4 = [name[:4] for name in month_names]
# ----------------------------------------------------------------------
# Functions
# ----------------------------------------------------------------------
def _debug(string):
    """Write *string* plus a newline to stderr, but only when opt_debug is set."""
    if not opt_debug:
        return
    sys.stderr.write("%s\n" % string)
# ----------------------------------------------------------------------
def _note(string):
    """Write *string* to stdout, prefixed with the program name."""
    message = "%s: %s\n" % (program, string)
    sys.stdout.write(message)
# ----------------------------------------------------------------------
def _warn(string):
    """Write *string* to stderr as a warning, prefixed with the program name."""
    message = "%s: Warning: %s\n" % (program, string)
    sys.stderr.write(message)
# ----------------------------------------------------------------------
def _err(string):
    """Write *string* to stderr as an error, then exit with status 1.

    Never returns: sys.exit() raises SystemExit.
    """
    message = "%s: Error: %s\n" % (program, string)
    sys.stderr.write(message)
    sys.exit(1)
# ----------------------------------------------------------------------
def _gettext(file):
file = io.open(file)
text = file.read()
file.close()
text = re.sub(".\x08", "", text) # Get rid of inkribbon backspace-emphasis
text = text.replace("\r\n", "\n") # Convert DOS to unix
text = text.replace("\r", "\n") # Convert MAC to unix
text = text.expandtabs()
text = text.strip()
return text
def acronym_match(s, l):
    """Return True when *s* equals the ASCII capital letters of *l*, in order.

    Every character of *l* other than A-Z is ignored, so "IETF" matches
    "Internet Engineering Task Force".
    """
    capitals = "".join(ch for ch in l if "A" <= ch <= "Z")
    return capitals == s
def get_status_from_draft_text(text):
    """Return the intended status named in the first 10 lines of *text*.

    The value is whatever follows "Intended status:" up to the first space
    after it (so "Standards Track" yields "Standards"). Returns None when
    none of the first 10 lines carries an "Intended status:" label.
    """
    # 5000 characters is conservatively much more than a full page, and only
    # the first 10 lines are examined, so working on a prefix keeps very
    # large drafts cheap.
    head = text.strip()[:5000]
    head = re.sub(".\x08", "", head)  # Get rid of inkribbon backspace-emphasis
    head = head.replace("\r\n", "\n").replace("\r", "\n")  # DOS and MAC to unix
    status_re = re.compile(r"^\s*Intended [Ss]tatus:\s*(.*?) ")
    for candidate in head.split("\n")[:10]:
        found = status_re.search(candidate)
        if found:
            return found.group(1)
    return None
class Draft:
    """Base class for drafts

    Extracted from PlaintextDraft, formerly named Draft. If part of its
    public interface that is relevant for other draft formats was missed,
    it should be added to this base class.

    Every accessor here raises NotImplementedError; concrete draft formats
    are expected to override them.
    """

    # Reference type tags.
    REF_TYPE_NORMATIVE = 'norm'
    REF_TYPE_INFORMATIVE = 'info'
    REF_TYPE_UNKNOWN = 'unk'

    def get_abstract(self):
        """Abstract accessor; not implemented in the base class."""
        raise NotImplementedError

    def get_author_list(self):
        """Get detailed author list

        Returns a list of dicts with the following keys:
            full_name, first_name, middle_initial, last_name,
            name_suffix, email, country, company
        Values will be None if not available
        """
        raise NotImplementedError

    def get_authors(self):
        """Get simple author list

        Get as list of strings with author name and email within angle brackets
        """
        raise NotImplementedError

    def get_authors_with_firm(self):
        """Get simple list of authors with firm (company) info

        Get as list of strings with author name and email within angle
        brackets and company in parentheses
        """
        raise NotImplementedError

    def get_creation_date(self):
        """Creation date accessor; not implemented in the base class."""
        raise NotImplementedError

    def get_formal_languages(self):
        """Formal languages accessor; not implemented in the base class."""
        raise NotImplementedError

    def get_pagecount(self):
        """Page count accessor; not implemented in the base class."""
        raise NotImplementedError

    def get_refs(self):
        """References accessor; not implemented in the base class."""
        raise NotImplementedError

    def get_status(self):
        """Status accessor; not implemented in the base class."""
        raise NotImplementedError

    def get_title(self):
        """Title accessor; not implemented in the base class."""
        raise NotImplementedError

    def get_wordcount(self):
        """Word count accessor; not implemented in the base class."""
        raise NotImplementedError
# ----------------------------------------------------------------------
class PlaintextDraft(Draft):
def __init__(self, text, source, name_from_source=False):
    """Initialize a Draft instance

    :param text: plaintext draft contents
    :param source: name of file containing the contents
    :param name_from_source: if True, fall back to source to determine draft name not found from text
    """
    super().__init__()
    assert isinstance(text, str)
    self.source = str(source)
    self.rawtext = text
    self.name_from_source = name_from_source
    self.errors = {}

    # Normalize the raw text: drop inkribbon backspace-emphasis, convert DOS
    # and MAC line endings to unix, and trim surrounding whitespace.
    cleaned = re.sub(".\x08", "", text)
    cleaned = cleaned.replace("\r\n", "\n").replace("\r", "\n")
    self.text = cleaned.strip()

    self.rawlines = self.text.split("\n")
    self.lines, self.pages = self._stripheaders()
    # Some things (such as the filename) have to be on the first page. If
    # we didn't get back a set of pages, only one single page with the
    # whole document, then we need to do an enforced page split (56 lines
    # per page) in order to limit later searches to the first page.
    if len(self.pages) <= 1:
        self.pages = [
            "\n".join(self.lines[offset:offset + 56])
            for offset in range(0, len(self.lines), 56)
        ]
    self.filename, self.revision = self._parse_draftname()

    # Lazily computed values; the accessors in this class fill them in.
    self._authors = self._authors_with_firm = self._author_info = None
    self._abstract = self._pagecount = self._status = None
    self._creation_date = self._title = None
@classmethod
def from_file(cls, source, *args, **kwargs):
with open(source, 'r', encoding='utf8') as f:
return cls(text=f.read(), source=source, *args, **kwargs)
# ------------------------------------------------------------------
def _parse_draftname(self):
draftname_regex = r"(draft-[a-z0-9-]*)-(\d\d)(\w|\.txt|\n|$)"
draftname_match = re.search(draftname_regex, self.pages[0])
if not draftname_match and self.name_from_source:
draftname_match = re.search(draftname_regex, self.source)
rfcnum_regex = r"(Re[qg]uests? [Ff]or Commm?ents?:? +|Request for Comments: RFC |RFC-|RFC )((# ?)?[0-9]+)( |,|\n|$)"
rfcnum_match = re.search(rfcnum_regex, self.pages[0])
if not rfcnum_match and self.name_from_source:
rfcnum_match = re.search(rfcnum_regex, self.source)
if draftname_match:
return (draftname_match.group(1), draftname_match.group(2) )
elif rfcnum_match:
return ("rfc"+rfcnum_match.group(2), "")
else:
self.errors["draftname"] = "Could not find the draft name and revision on the first page."
filename = ""
revision = ""
try:
__, base = self.source.rsplit("/", 1)
except ValueError:
base = self.source
if base.startswith("draft-"):
if '.' in base:
name, __ = base.split(".", 1)
else:
name = base
revmatch = re.search(r"\d\d$", name)
if revmatch:
filename = name[:-3]
revision = name[-2:]
else:
filename = name
return filename, revision
# ----------------------------------------------------------------------
def _stripheaders(self):
    """Split ``self.rawlines`` into body lines and pages.

    Lines that look like page footers ("[Page N]"), form feeds, or running
    page headers (Internet-Draft / Draft / RFC / draft-name lines, mostly
    recognised by a trailing year) are treated as page boundaries and are
    left out of the stripped line list. Blank lines are buffered and only
    re-emitted when the following text is on the same page.

    Returns a ``(stripped, pages)`` tuple: ``stripped`` is the list of
    remaining lines, and ``pages`` is a list of strings, one per detected
    page, each made of that page's lines joined with newlines.
    """
    stripped = []
    pages = []
    page = []
    line = ""
    newpage = False
    sentence = False
    shortprev = False
    blankcount = 0
    linecount = 0
    # Local helpers; begpage and endpage extend the caller's ``pages`` list
    # in place and return the (possibly rebound) page state.
    def striplines(p):
        # Returns p[beg:end], where beg is the index of the first non-blank
        # line and end the index of the last non-blank line.
        # NOTE(review): the backward scan never looks at index 0 and ``end``
        # is used as an exclusive bound, so the last non-blank line itself is
        # not part of the result. This looks like an off-by-one, but the
        # result only feeds the "> 5" length test in begpage, so fixing it
        # would shift that threshold -- confirm before changing.
        beg = end = 0
        for i in range(len(p)):
            l = p[i]
            if l.strip() == "":
                continue
            else:
                beg = i
                break
        for i in range(len(p)-1,0,-1):
            l = p[i]
            if l.strip() == "":
                continue
            else:
                end = i
                break
        return p[beg:end]
    def endpage(pages, page, newpage, line):
        # The boundary line (if any) belongs to the page that is ending.
        if line:
            page += [ line ]
        return begpage(pages, page, newpage)
    def begpage(pages, page, newpage, line=None):
        # Only flush the collected lines as a page when they amount to more
        # than 5 lines by striplines' count; otherwise keep collecting.
        if page and len(striplines(page)) > 5:
            pages += [ "\n".join(page) ]
            page = []
            newpage = True
        # The boundary line (if any) belongs to the page that is starting.
        if line:
            page += [ line ]
        return pages, page, newpage
    for line in self.rawlines:
        linecount += 1
        line = line.rstrip()
        # Page footer: "[Page N]" (brackets optional) at the end of the line.
        if re.search(r"\[?page [0-9ivx]+\]?[ \t\f]*$", line, re.I):
            pages, page, newpage = endpage(pages, page, newpage, line)
            continue
        # Form feed: the line itself is dropped.
        if re.search(r"\f", line, re.I):
            pages, page, newpage = begpage(pages, page, newpage)
            continue
        # Running header: "Internet-Draft ... <year>".
        if re.search(r"^ *Internet.Draft.+ .+[12][0-9][0-9][0-9] *$", line, re.I):
            pages, page, newpage = begpage(pages, page, newpage, line)
            continue
    #    if re.search("^ *Internet.Draft  +", line, re.I):
    #        newpage = True
    #        continue
        # Running header: "Draft ... <year>".
        if re.search(r"^ *Draft.+[12][0-9][0-9][0-9] *$", line, re.I):
            pages, page, newpage = begpage(pages, page, newpage, line)
            continue
        # Running header: "RFC NNNN ... <year>".
        if re.search(r"^RFC[ -]?[0-9]+.*(  +)[12][0-9][0-9][0-9]$", line, re.I):
            pages, page, newpage = begpage(pages, page, newpage, line)
            continue
        # Footer: a line starting with the draft name and ending in four digits.
        if re.search(r"^draft-[-a-z0-9_.]+.*[0-9][0-9][0-9][0-9]$", line, re.I):
            pages, page, newpage = endpage(pages, page, newpage, line)
            continue
        # Past the first 15 lines: a long line (58+ characters) ending in
        # "<month> <year>" is taken as a running header.
        if linecount > 15 and re.search(r".{58,}(Jan|Feb|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|Sep|Oct|Nov|Dec) (19[89][0-9]|20[0-9][0-9]) *$", line, re.I):
            pages, page, newpage = begpage(pages, page, newpage, line)
            continue
        # Right after a page break: a line holding only the draft name.
        if newpage and re.search(r"^ *draft-[-a-z0-9_.]+ *$", line, re.I):
            pages, page, newpage = begpage(pages, page, newpage, line)
            continue
        # A line starting in the first column counts as a sentence start.
        if re.search(r"^[^ \t]+", line):
            sentence = True
        # Non-blank line: decide how many buffered blank lines to emit.
        if re.search(r"[^ \t]", line):
            if newpage:
                # First text after a page break: the buffered blank lines are
                # dropped, and at most one blank line is inserted.
                # 36 is a somewhat arbitrary count for a 'short' line
                shortthis = len(line.strip()) < 36 # 36 is a somewhat arbitrary count for a 'short' line
                if sentence or (shortprev and not shortthis):
                    stripped += [""]
            else:
                # Same page: re-emit the buffered blank lines unchanged.
                if blankcount:
                    stripped += [""]*blankcount
            blankcount = 0
            sentence = False
            newpage = False
            shortprev = len(line.strip()) < 36 # 36 is a somewhat arbitrary count for a 'short' line
        # A line ending in "." or ":" marks the end of a sentence.
        if re.search("[.:]$", line):
            sentence = True
        # Blank line: keep it in the page text, but only count it for now.
        if re.search("^[ \t]*$", line):
            blankcount += 1
            page += [ line ]
            continue
        page += [ line ]
        stripped += [ line ]
    # Flush whatever is left as the final page.
    pages, page, newpage = begpage(pages, page, newpage)
    _debug('pages: %s' % len(pages))
    return stripped, pages
# ----------------------------------------------------------------------
def get_pagecount(self):
if self._pagecount == None:
label_pages = len(re.findall(r"\[page [0-9ixldv]+\]", self.text, re.I))
count_pages = len(self.pages)
if label_pages > count_pages/2:
self._pagecount = label_pages
else:
self._pagecount = count_pages
return self._pagecount
# ------------------------------------------------------------------
def get_wordcount(self):
count = 0
# match any sequence of non-white-space characters like the Unix command "wc"
word_re = re.compile(r'\S+', re.UNICODE)
for l in self.lines:
count += sum(1 for _ in word_re.finditer(l))
return count
# ------------------------------------------------------------------
def get_formal_languages(self):
language_regexps = [
("abnf", [re.compile(r"\bABNF"), re.compile(r" +[a-zA-Z][a-zA-Z0-9_-]* +=[/ ]")]),
("asn1", [re.compile(r'DEFINITIONS +::= +BEGIN')]),
("cbor", [re.compile(r'\b(?:CBOR|CDDL)\b'), re.compile(r" +[a-zA-Z][a-zA-Z0-9_-]* += +[\{\[\(]")]),
("ccode", [re.compile(r"(?:\+\+\))|(?:for \(i)|(?: [!=]= 0\) \{)|(?: struct [a-zA-Z_0-9]+ \{)")]),
("json", [re.compile(r'\bJSON\b'), re.compile(r" \"[^\"]+\" ?: [a-zA-Z0-9\.\"\{\[]")]),
("xml", [re.compile(r"<\?xml")]),
]
already_matched = set()
for l in self.lines:
for lang_name, patterns in language_regexps:
for p in patterns:
if p not in already_matched and p.search(l):
already_matched.add(p)
return [
lang_name
for lang_name, patterns in language_regexps
if all(p in already_matched for p in patterns)
]
# ----------------------------------------------------------------------
def get_status(self):
if self._status == None:
for line in self.lines[:10]:
status_match = re.search(r"^\s*Intended [Ss]tatus:\s*(.*?) ", line)
if status_match:
self._status = status_match.group(1)
break
return self._status
# ------------------------------------------------------------------
def get_creation_date(self):
if self._creation_date:
return self._creation_date
date_regexes = [
r'^(?P<month>\w+)\s(?P<day>\d{1,2})(,|\s)+(?P<year>\d{4})',
r'^(?P<day>\d{1,2})(,|\s)+(?P<month>\w+)\s(?P<year>\d{4})',
r'^(?P<day>\d{1,2})-(?P<month>\w+)-(?P<year>\d{4})',
r'^(?P<month>\w+)\s(?P<year>\d{4})',
r'\s{3,}(?P<month>\w+)\s(?P<day>\d{1,2})(,|\s)+(?P<year>\d{4})',
r'\s{3,}(?P<day>\d{1,2})(,|\s)+(?P<month>\w+)\s(?P<year>\d{4})',
r'\s{3,}(?P<day>\d{1,2})-(?P<month>\w+)-(?P<year>\d{4})',
# RFC 3339 date (also ISO date)
r'\s{3,}(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})',
# 'October 2008' - default day to today's.
r'\s{3,}(?P<month>\w+)\s(?P<year>\d{4})',
]
dates = []
text = self.pages[0]
for regex in date_regexes:
match = re.search(regex, text, re.MULTILINE)
if match:
start = match.start()
if not "expires" in text[start-10:start].lower():
dates += [(start, match)]
dates.sort()
for start, match in dates:
md = match.groupdict()
mon = md['month'].lower()
day = int( md.get( 'day', 0 ) )
year = int( md['year'] )
try:
if mon in month_names:
month = month_names.index( mon ) + 1
elif mon in month_names_abbrev3:
month = month_names_abbrev3.index( mon ) + 1
elif mon in month_names_abbrev4:
month = month_names_abbrev4.index( mon ) + 1
elif mon.isdigit() and int(mon) in range(1,13):
month = int(mon)
else:
continue
today = date_today()
if day==0:
# if the date was given with only month and year, use
# today's date if month and year is today's month and
# year, otherwise pick the middle of the month.
# Don't use today's day for month and year in the past
if month==today.month and year==today.year:
day = today.day
else:
day = 15
self._creation_date = datetime.date(year, month, day)
return self._creation_date
except ValueError:
# mon abbreviation not in _MONTH_NAMES
# or month or day out of range
pass
self.errors['creation_date'] = 'Creation Date field is empty or the creation date is not in a proper format.'
return self._creation_date
# ------------------------------------------------------------------
def get_abstract(self):
if self._abstract:
return self._abstract
abstract_re = re.compile(r'^(\s*)abstract', re.I)
header_re = re.compile(r"^(\s*)([0-9]+\.? |Appendix|Status of|Table of|Full Copyright|Copyright|Intellectual Property|Acknowled|Author|Index|Disclaimer).*", re.I)
begin = False
abstract = []
abstract_indent = 0
look_for_header = False
for line in self.lines:
if not begin:
if abstract_re.match(line):
begin=True
abstract_indent = len(abstract_re.match(line).group(0))
continue
if begin:
if not line and not abstract:
continue
if not line:
look_for_header=True
abstract.append(line)
continue
if look_for_header and header_re.match(line):
break
look_for_header = False
abstract.append(line)
abstract = '\n'.join(abstract)
abstract = self._clean_abstract(abstract)
self._abstract = self._check_abstract_indent(abstract, abstract_indent)
return self._abstract
def _check_abstract_indent(self, abstract, indent):
indentation_re = re.compile(r'^(\s)*')
indent_lines = []
for line in abstract.split('\n'):
if line:
indent = len(indentation_re.match(line).group(0))
indent_lines.append(indent)
percents = {}
total = float(len(indent_lines))
formatted = False
for indent in set(indent_lines):
count = indent_lines.count(indent)/total
percents[indent] = count
if count > 0.9:
formatted = True
if not formatted:
return abstract
new_abstract = []
for line in abstract.split('\n'):
if line:
indent = len(indentation_re.match(line).group(0))
if percents[indent] < 0.9:
break
new_abstract.append(line)
return '\n'.join(new_abstract)
def _clean_abstract(self, text):
text = re.sub("(?s)(Conventions [Uu]sed in this [Dd]ocument|Requirements [Ll]anguage)?[\n ]*The key words \"MUST\", \"MUST NOT\",.*$", "", text)
# Get rid of status/copyright boilerplate
text = re.sub("(?s)\nStatus of [tT]his Memo\n.*$", "", text)
# wrap long lines without messing up formatting of Ok paragraphs:
while re.match("([^\n]{72,}?) +", text):
text = re.sub("([^\n]{72,}?) +([^\n ]*)(\n|$)", "\\1\n\\2 ", text)
return text
# ------------------------------------------------------------------
def get_authors(self):
"""Returns a list of strings with author name and email within angle brackets"""
if self._authors == None:
self.extract_authors()
return self._authors
def get_authors_with_firm(self):
"""Returns a list of strings with author name and email within angle brackets"""
if self._authors_with_firm == None:
self.extract_authors()
return self._authors_with_firm
def get_author_list(self): # () -> List[List[str, str, str, str, str, str, str]]
"""Returns a list of tuples, with each tuple containing (given_names,
surname, email, company). Email will be None if unknown.
Todo update to agree with superclass method signature
"""
if self._author_info == None:
self.extract_authors()
return self._author_info
def extract_authors(self):
"""Extract author information from draft text.
"""
aux = {
"honor" : r"(?:[A-Z]\.|Dr\.?|Dr\.-Ing\.|Prof(?:\.?|essor)|Sir|Lady|Dame|Sri)",
"prefix": r"([Dd]e|Hadi|van|van de|van der|Ver|von|[Ee]l)",
"suffix": r"(jr.?|Jr.?|II|2nd|III|3rd|IV|4th)",
"first" : r"([A-Z][-A-Za-z'`~,]*)(( ?\([A-Z][-A-Za-z'`~,]*\))?(\.?[- ]{1,2}[A-Za-z'`~]+)*)",
"last" : r"([-A-Za-z'`~,]+)", # single-letter last names exist
"months": r"(January|February|March|April|May|June|July|August|September|October|November|December)",
"mabbr" : r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\.?",
}
authcompanyformats = [
r" {6}(?P<author>(%(first)s[ \.]{1,3})+((%(prefix)s )?%(last)s)( %(suffix)s)?), (?P<company>[^.]+\.?)$" % aux,
r" {6}(?P<author>(%(first)s[ \.]{1,3})+((%(prefix)s )?%(last)s)( %(suffix)s)?) *\((?P<company>[^.]+\.?)\)$" % aux,
]
authformats = [
r" {6}((%(first)s[ \.]{1,3})+((%(prefix)s )?%(last)s)( %(suffix)s)?)(, ([^.]+\.?|\([^.]+\.?|\)))?,?$" % aux,
r" {6}(((%(prefix)s )?%(last)s)( %(suffix)s)?, %(first)s)?$" % aux,
r" {6}(%(last)s)$" % aux,
]
multiauthformats = [
(
r" {6}(%(first)s[ \.]{1,3}((%(prefix)s )?%(last)s)( %(suffix)s)?)(, ?%(first)s[ \.]{1,3}((%(prefix)s )?%(last)s)( %(suffix)s)?)+$" % aux,
r"(%(first)s[ \.]{1,3}((%(prefix)s )?%(last)s)( %(suffix)s)?)" % aux
),
]
editorformats = [
r"(?:, | )([Ee]d\.?|\([Ee]d\.?\)|[Ee]ditor)$",
]
companyformats = [
r" {6}(([A-Za-z'][-A-Za-z0-9.& ']+)(,? ?(Inc|Ltd|AB|S\.A)\.?))$",
r" {6}(([A-Za-z'][-A-Za-z0-9.& ']+)(/([A-Za-z'][-A-Za-z0-9.& ']+))+)$",
r" {6}([a-z0-9.-]+)$",
r" {6}(([A-Za-z'][-A-Za-z0-9.&']+)( [A-Za-z'][-A-Za-z0-9.&']+)*)$",
r" {6}(([A-Za-z'][-A-Za-z0-9.']+)( & [A-Za-z'][-A-Za-z0-9.']+)*)$",
r" {6}\((.+)\)$",
r" {6}(\w+\s?\(.+\))$",
]
dateformat = r"(((%(months)s|%(mabbr)s) \d+, |\d+ (%(months)s|%(mabbr)s),? |\d+/\d+/)\d\d\d\d|\d\d\d\d-\d\d-\d\d)$" % aux
address_section = r"^ *([0-9]+\.)? *(Author|Editor)('s|s'|s|\(s\)) (Address|Addresses|Information)"
# "Internet Draft" (without the dash) is correct here, because the usage is to
# suppress incorrect author name extraction
ignore = [
"Standards Track", "Current Practice", "Internet Draft", "Working Group",
"Expiration Date",
]
def make_authpat(hon, first, last, suffix):
def dotexp(s):
s = re.sub(r"\. ", r"\\w* ", s)
s = re.sub(r"\.$", r"\\w*", s)
s = re.sub(r"\.(\w)", r"\\w* \1", s)
return s
first = dotexp(first)
last = dotexp(last)
first = re.sub("[()]", " ", first)
if " " in first:
# if there's a middle part, let it be optional
first, middle = first.split(" ", 1)
first = "%s( +%s)?" % (first, middle)
# Double names (e.g., Jean-Michel) are abbreviated as two letter
# connected by a dash -- let this expand appropriately
first = re.sub(r"^([A-Z])-([A-Z])\\w\*", r"\1.*-\2.*", first)
# Some chinese names are shown with double-letter(latin) abbreviated given names, rather than
# a single-letter(latin) abbreviation:
first = re.sub(r"^([A-Z])[A-Z]+\\w\*", r"\1[-\\w]+", first)
# permit insertion of middle names between first and last, and
# add possible honorific and suffix information
if last:
authpat = r"(?:^| and )((?:%(hon)s ?)?['`]*%(first)s\S*( +[^ ]+)* +%(last)s(?: %(suffix)s)?)( *\(.*|,( [A-Z][-A-Za-z0-9]*)?| [A-Z][a-z]+)?" % {"hon":hon, "first":first, "last":last, "suffix":suffix,}
else:
# handle single-word names
authpat = r"(?:^| and )((?:%(hon)s ?)?['`]*%(first)s\S*( +[^ ]+)*(?: %(suffix)s)?)( *\(.*|,( [A-Z][-A-Za-z0-9]*)?| [A-Z][a-z]+)?" % {"hon":hon, "first":first, "suffix":suffix,}
return authpat
authors = []
companies = []
companies_seen = []
self._docheader = ""
# Collect first-page author information first
have_blankline = False
have_draftline = False
prev_blankline = False
for line in self.lines[:30]:
self._docheader += line+"\n"
author_on_line = False
_debug( " ** " + line)
leading_space = len(re.findall("^ *", line)[0])
line_len = len(line.rstrip())
trailing_space = line_len <= 72 and 72 - line_len or 0
# Truncate long lines at the first space past column 80:
trunc_space = line.find(" ", 80)
if line_len > 80 and trunc_space > -1:
line = line[:trunc_space]
if line_len > 60:
# Look for centered title, break if found:
if (leading_space > 5 and abs(leading_space - trailing_space) < 5):
_debug("Breaking for centered line")
break
if re.search(dateformat, line):
if authors:
_debug("Breaking for dateformat after author name")
for editorformat in editorformats:
if re.search(editorformat, line):
line = re.sub(editorformat, "", line)
break
for lineformat, authformat in multiauthformats:
match = re.search(lineformat, line)
if match:
_debug("a. Multiauth format: '%s'" % lineformat)
author_list = re.findall(authformat, line)
authors += [ a[0] for a in author_list ]
companies += [ None for a in author_list ]
author_on_line = True
#_debug("\nLine: " + line)
#_debug("Format: " + authformat)
for author in author_list:
_debug("Author: '%s'" % author[0])
break
if not author_on_line:
for lineformat in authcompanyformats:
match = re.search(lineformat, line)
if match:
_debug("b. Line format: '%s'" % lineformat)
maybe_company = match.group("company").strip(" ,.")
# is the putative company name just a partial name, i.e., a part
# that commonly occurs after a comma as part of a company name,
# as in "Foo Bar, Inc."? If so, skip; else assume there's a
# company name after the comma.
if not maybe_company in ["Inc", "Ltd", "S.A", "AG", "AB", "N.V", ]:
author = match.group("author")
company = match.group("company")
authors += [ author, '']
companies += [ None, company ]
#_debug("\nLine: " + line)
#_debug("Format: " + authformat)
_debug("Author: '%s'" % author)
_debug("Company: '%s'" % company)
author_on_line = True
break
if not author_on_line:
for authformat in authformats:
match = re.search(authformat, line)
if match:
_debug("c. Auth format: '%s'" % authformat)
author = match.group(1)
authors += [ author ]
companies += [ None ]
#_debug("\nLine: " + line)
#_debug("Format: " + authformat)
_debug("Author: '%s'" % author)
author_on_line = True
break
if not author_on_line:
for authformat in companyformats:
match = re.search(authformat, line)
if match:
_debug("d. Company format: '%s'" % authformat)
company = match.group(1)
authors += [ "" ]
companies += [ company ]
#_debug("\nLine: " + line)
#_debug("Format: " + authformat)
_debug("Company: '%s'" % company)
break
if authors and not author_on_line:
# Retain information about blank lines in author list
authors += [""]
companies += [ "" ]
if line.strip() == "":
if prev_blankline and authors:
_debug("Breaking, having found consecutive blank lines after author name")
break
if authors:
have_blankline = True
prev_blankline = True
else:
prev_blankline = False
if "draft-" in line:
have_draftline = True
if have_blankline and have_draftline:
_debug("Breaking, having found both blank line and draft-name line")
break
# remove trailing blank entries in the author list:
for i in range(len(authors)-1,-1,-1):
if authors[i] == "" and companies[i] == "":
del authors[i]
del companies[i]
else:
break
_debug("A:companies : %s" % str(companies))
#companies = [ None if a else '' for a in authors ]
#_debug("B:companies : %s" % str(companies))
#find authors' addresses section if it exists
_debug("B:authors : %s" % str(authors))
last_line = len(self.lines)-1
address_section_pos = last_line//2
for i in range(last_line//2,last_line):
line = self.lines[i]
if re.search(address_section, line):
address_section_pos = i
break
found_pos = []
company_or_author = None
for i in range(len(authors)):
_debug("1: authors[%s]: %s" % (i, authors[i]))
_debug(" company[%s]: %s" % (i, companies[i]))
author = authors[i]
if i+1 < len(authors):
company_or_author = authors[i+1]
else:
company_or_author = None
if author in [ None, '', ]:
continue
suffix_match = re.search(" %(suffix)s$" % aux, author)
if suffix_match:
suffix = suffix_match.group(1)
author = author[:-len(suffix)].strip()
else:
suffix = None
if ", " in author:
last, first = author.split(",",1)
author = "%s %s" % (first.strip(), last.strip())
if not " " in author:
if "." in author:
first, last = author.rsplit(".", 1)
first += "."
else:
# handle single-word names
first = author
last = ""
else:
if "." in author:
first, last = author.rsplit(".", 1)
first += "."
else:
first, last = author.rsplit(" ", 1)
if "." in first and not ". " in first:
first = first.replace(".", ". ").strip()
first = first.strip()
last = last.strip()
prefix_match = re.search(" %(prefix)s$" % aux, first)
if prefix_match:
prefix = prefix_match.group(1)
first = first[:-len(prefix)].strip()
last = prefix+" "+last
_debug("First, Last: '%s' '%s'" % (first, last))
for firstname, surname, casefixname in [ (first,last,last), (last,first,first), (first,last,last.upper()), (last,first,first.upper()), ]:
for left, right in [(firstname, casefixname), (casefixname, firstname)]:
author = "%s %s" % (left, right)
_debug("\nAuthors: "+str(authors))
_debug("Author: "+author)
# Pattern for full author information search, based on first page author name:
authpat = make_authpat(aux['honor'], left, right, aux['suffix'])
_debug("Authpat: " + authpat)
start = 0
col = None
# Find start of author info for this author (if any).
# Scan towards the front from the end of the file, looking for a match to authpath
for j in range(last_line, address_section_pos, -1):
line = self.lines[j]
_debug( "Line: " + line)
forms = [ line ] + [ line.replace(short, longform[short]) for short in longform if short in line ]
for form in forms:
try:
if re.search(authpat, form.strip()) and not j in found_pos:
_debug( "Match")
start = j
found_pos += [ start ]
_debug( " ==> start %s, normalized '%s'" % (start, form.strip()))
# The author info could be formatted in multiple columns...
columns = re.split("( +| and )", form)
# _debug( "Columns:" + str(columns))
# Find which column:
# _debug( "Col range:" + str(range(len(columns))))
cols = [ c for c in range(len(columns)) if re.search(authpat+r"( and |, |$)", columns[c].strip()) ]
if cols:
col = cols[0]
if not (start, col) in found_pos:
found_pos += [ (start, col) ]
_debug( "Col: %d" % col)
beg = len("".join(columns[:col]))
_debug( "Beg: %d '%s'" % (beg, "".join(columns[:col])))
_debug( "Len: %d" % len(columns))
if col == len(columns) or col == len(columns)-1:
end = None
_debug( "End1: %s" % end)
else:
end = beg + len("".join(columns[col:col+2]))
_debug( "End2: %d '%s'" % (end, "".join(columns[col:col+2])))
_debug( "Cut: '%s'" % form[beg:end])
author_match = re.search(authpat, columns[col].strip()).group(1)
_debug( "AuthMatch: '%s'" % (author_match,))
if re.search(r'\(.*\)$', author_match.strip()):
author_match = author_match.rsplit('(',1)[0].strip()
if author_match in companies_seen:
companies[i] = authors[i]
authors[i] = None
else:
fullname = author_match
#if casefixname in author_match:
# fullname = author_match.replace(casefixname, surname)
#else:
# fullname = author_match
fullname = re.sub(" +", " ", fullname)
if re.search(r"\s", fullname):
if left == firstname:
given_names, surname = fullname.rsplit(None, 1)
else:
surname, given_names = fullname.split(None, 1)
else:
# handle single-word names
given_names, surname = (fullname, "")
if " " in given_names:
first, middle = given_names.split(None, 1)
else:
first = given_names
middle = None
names = (first, middle, surname, suffix)
if suffix:
fullname = fullname+" "+suffix
for names in [
(first, middle, surname, suffix),
(first, surname, middle, suffix),
(middle, first, surname, suffix),
(middle, surname, first, suffix),
(surname, first, middle, suffix),
(surname, middle, first, suffix),
]:
parts = [ n for n in names if n ]
if (" ".join(parts) == fullname):
authors[i] = (fullname, first, middle, surname, suffix)
companies[i] = None
break
else:
_warn("Author tuple doesn't match text in Internet-Draft: %s, %s" % (authors[i], fullname))
authors[i] = None
break
except AssertionError:
sys.stderr.write("filename: "+self.filename+"\n")
sys.stderr.write("authpat: "+authpat+"\n")
raise
if start and col != None:
break
if start and col != None:
break
if start and col != None:
break
# End for:
if not authors[i]:
continue
_debug("2: authors[%s]: %s" % (i, authors[i]))
if start and col != None:
_debug("\n * %s" % (authors[i], ))
nonblank_count = 0
blanklines = 0
email = None
country = None
for line_offset, line in enumerate(self.lines[start+1:]):
_debug( " " + line.strip())
# Break on the second blank line
if not line:
blanklines += 1
if blanklines >= 3:
_debug( " - Break on blanklines")
break
else:
continue
else:
nonblank_count += 1
# Maybe break on author name
# _debug("Line: %s"%line.strip())
# for a in authors:
# if a and a not in companies_seen:
# _debug("Search for: %s"%(r"(^|\W)"+re.sub("\.? ", ".* ", a)+"(\W|$)"))
authmatch = [ a for a in authors[i+1:] if a and not a.lower() in companies_seen and (re.search((r"(?i)(^|\W)"+re.sub(r"[. ]+", ".*", a)+r"(\W|$)"), line.strip()) or acronym_match(a, line.strip()) )]
if authmatch:
_debug(" ? Other author or company ? : %s" % authmatch)
_debug(" Line: "+line.strip())
_debug(" C or A: %s"%company_or_author)
if nonblank_count == 1 or (nonblank_count == 2 and not blanklines) or (company_or_author==line.strip() and not blanklines):
# First line after an author -- this is a company
companies_seen += [ c.lower() for c in authmatch ]
companies_seen += [ line.strip().lower() ] # XXX fix this for columnized author list
companies_seen = list(set(companies_seen))
_debug(" -- Companies: " + ", ".join(companies_seen))
for k in range(i+1, len(authors)):
if authors[k] and authors[k].lower() in companies_seen:
companies[k] = authors[k]
authors[k] = None
elif blanklines and not "@" in line:
# Break on an author name
_debug( " - Break on other author name")
break
else:
pass
def columnify(l):
try:
column = l.replace('\t', 8 * ' ')[max(0, beg - 1):end].strip()
except:
column = l
column = re.sub(r" *(?:\(at\)| <at> | at ) *", "@", column)
column = re.sub(r" *(?:\(dot\)| <dot> | dot ) *", ".", column)
column = re.sub(r"&cisco.com", "@cisco.com", column)
column = column.replace("\xa0", " ")
return column
column = columnify(line)
# if re.search("^\w+: \w+", column):
# keyword = True
# else:
# if keyword:
# # Break on transition from keyword line to something else
# _debug( " - Break on end of keywords")
# break
#_debug( " Column text :: " + column)
if nonblank_count >= 2 and blanklines == 0:
# Usually, the contact info lines will look
# like this: "Email: someone@example.com" or
# "Tel: +1 (412)-2390 23123", but sometimes
# the : is left out. That's okay for things we
# can't misinterpret, but "tel" may match "Tel
# Aviv 69710, Israel" so match
# - misc contact info
# - tel/fax [number]
# - [phone number]
# - [email]
other_contact_info_regex = re.compile(r'^(((contact )?e|\(e|e-|m|electronic )?mail|email_id|mailto|e-main|(tele)?phone|voice|mobile|work|uri|url|tel:)\b|^((ph|tel\.?|telefax|fax) *[:.]? *\(?( ?\+ ?)?[0-9]+)|^(\++[0-9]+|\(\+*[0-9]+\)|\(dsn\)|[0-9]+)([ -.]*\b|\b[ -.]*)(([0-9]{2,}|\([0-9]{2,}\)|(\([0-9]\)|[0-9])[ -][0-9]{2,}|\([0-9]\)[0-9]+)([ -.]+([0-9]+|\([0-9]+\)))+|([0-9]{7,}|\([0-9]{7,}\)))|^(<?[-a-z0-9._+]+|{([-a-z0-9._+]+, ?)+[-a-z0-9._+]+})@[-a-z0-9._]+>?|^https?://|^www\.')
next_line_index = start + 1 + line_offset + 1
if (not country
and not other_contact_info_regex.search(column.lower())
and next_line_index < len(self.lines)):
next_line_lower = columnify(self.lines[next_line_index]).lower().strip()
if not next_line_lower or other_contact_info_regex.search(next_line_lower):
# country should be here, as the last
# part of the address, right before an
# empty line or other contact info
country = column.strip() or None
_debug(" Country: %s" % country)
_debug("3: authors[%s]: %s" % (i, authors[i]))
emailmatch = re.search("[-A-Za-z0-9_.+]+@[-A-Za-z0-9_.]+", column)
if emailmatch and not "@" in author:
email = emailmatch.group(0).lower()
break
authors[i] = authors[i] + ( email, country)
else:
if not author in ignore:
companies[i] = authors[i]
_debug("Not an author? '%s'" % (author))
authors[i] = None
assert(len(authors) == len(companies))
_debug('Author list: %s' % authors)
_debug('Company list: %s' % companies)
for i in range(len(authors)):
if authors[i]:
_debug('authors[%s]: %s' % (i, authors[i]))
company = ''
for k in range(i+1, len(companies)):
_debug('companies[%s]: %s' % (k, companies[k]))
if companies[k] != None:
company = companies[k]
break
authors[i] = authors[i] + ( company, )
authors = [ a for a in authors if a ]
_debug(" * Final author tuples: %s" % (authors,))
_debug(" * Final company list: %s" % (companies,))
_debug(" * Final companies_seen: %s" % (companies_seen,))
self._author_info = authors
self._authors_with_firm = [ "%s <%s> (%s)"%(full,email,company) for full,first,middle,last,suffix,email,country,company in authors ] # pyflakes:ignore
self._authors = [ "%s <%s>"%(full,email) if email else full for full,first,middle,last,suffix,email,country,company in authors ]
self._authors.sort()
_debug(" * Final author list: " + ", ".join(self._authors))
_debug("-"*72)
# ------------------------------------------------------------------
def get_title(self):
    """Return the draft title found on the first page, caching it in self._title.

    Several layouts are tried in order against self.pages[0]; the first
    pattern that matches wins.  The captured text is stripped, anything from
    a trailing draft-name line onwards is removed, and line breaks and runs
    of spaces are collapsed to single spaces.

    Returns the title string.  When no pattern matches, an entry is stored
    under self.errors["title"] and None is returned.
    """
    if self._title:
        return self._title

    first_page = self.pages[0]
    layouts = (
        # Up to three text lines followed by a line carrying the draft name.
        r'(?:\n\s*\n\s*)((.+\n){0,2}(.+\n*))(\s+<?draft-\S+\s*\n)\s*\n',
        # The draft name first, then one to three text lines.
        r'(?:\n\s*\n\s*)<?draft-\S+\s*\n*((.+\n){1,3})\s*\n',
        # Up to three text lines followed by two whitespace-only line ends.
        r'(?:\n\s*\n\s*)((.+\n){0,2}(.+\n*))(\s*\n){2}',
        # One or two lines directly above "Status of this Memo".
        r'(?i)(.+\n|.+\n.+\n)(\s*status of this memo\s*\n)',
    )
    found = None
    for layout in layouts:
        found = re.search(layout, first_page)
        if found:
            break

    if not found:
        self.errors["title"] = "Could not find the title on the first page."
        return None

    text = found.group(1).strip()
    cleanups = (
        (r'(?s)\n\s*\<?draft-.*$', ''),   # drop a trailing draft-name line
        (r'\s*\n\s*', ' '),               # join wrapped lines
        (r' +', ' '),                     # squeeze repeated spaces
    )
    for pattern, replacement in cleanups:
        text = re.sub(pattern, replacement, text)
    self._title = text
    return self._title
# ------------------------------------------------------------------
def get_refs(self):
    """Collect the documents referenced from the references section(s).

    Scans self.lines from index 15 onwards.  Section headings switch the
    current reference type between self.REF_TYPE_INFORMATIVE and
    self.REF_TYPE_NORMATIVE; once inside a references section, the first
    Internet-Draft name and the first rfc/std/bcp/fyi reference found on
    each line are recorded with the type in force at that point.

    Returns a dict mapping reference name (draft name without revision, or
    lower-cased series name plus number without leading zeros) to reference
    type.  The first type seen for a name is kept, and an entry equal to
    self.filename is removed.
    """
    # Bill's horrible "references section" regexps, built up over lots of years
    # of fine tuning for different formats.
    heading_patterns = (
        # Examples:
        # Appendix A. References:
        # A.1. Informative References:
        re.compile( r'(?i)(?:Appendix\s+)?(?:(?:[A-Z]\.)?[0-9.]*\s+)?(?:(\S+)\s*)?references:?$' ),
        # 9.1 Normative
        re.compile( r'(?i)(?:(?:[A-Z]\.)?[0-9.]*\s+)?(\S+ormative)$' ),
        # One other reference section type seen:
        re.compile( r'(?i)References \((\S+ormative)\)$' ),
    )
    # Headings that look like a references section but must not start one.
    heading_exclusions = [
        re.compile( r'(?i) uri references:?$' ),
    ]
    # An Internet-Draft reference.
    draft_ref = re.compile( r'(?i)\b(draft-(?:[-\w]+(?=-\d\d)|[-\w]+))(-\d\d)?\b' )
    # An RFC-and-other-series reference.
    series_ref = re.compile( r'(?i)\b(rfc|std|bcp|fyi)[- ]?(\d+)\b' )
    # False positives for std
    foreign_std = re.compile( r'(?i)((\b(n?csc|fed|mil|is-j)-std\b)|(\bieee\s*std\d*\b)|(\bstd\s+802\b))' )
    # An Internet-Draft or series reference hyphenated by a well-meaning line break.
    broken_tail = re.compile( r'(?i)\b(draft[-\w]*-|rfc|std|bcp|fyi)$' )
    # std at the front of a line can hide things like IEEE STD or MIL-STD
    std_token = re.compile( r'(?i)std\n*\b' )

    found = {}
    inside_refs = False
    inside_normative = False
    current_type = self.REF_TYPE_UNKNOWN
    total = len( self.lines )

    for idx in range( 15, total ):
        text = self.lines[ idx ].strip()
        # The heading test and the position of "normative" depend only on the
        # stripped line, so work them out once for the three state checks.
        is_heading = any( pattern.match( text ) for pattern in heading_patterns )
        normative_at = text.lower().find( "normative" )

        # skip over lines until we find the start of the reference section
        if not inside_refs and is_heading:
            if not any( rule.search( text ) for rule in heading_exclusions ):
                inside_refs = True
                current_type = self.REF_TYPE_INFORMATIVE
                if normative_at > 1:
                    inside_normative = True
                    current_type = self.REF_TYPE_NORMATIVE

        # might be subsections within a references section
        if inside_refs and not inside_normative and is_heading and normative_at > 1:
            inside_normative = True
            current_type = self.REF_TYPE_NORMATIVE

        # look for the end of the normative reference section
        if inside_normative and is_heading and normative_at < 0:
            inside_normative = False
            current_type = self.REF_TYPE_INFORMATIVE

        if not inside_refs:
            continue

        # find references within the section
        # If something got split badly, rejoin it.
        if broken_tail.search( text ) and idx < total - 1:
            text += self.lines[ idx + 1 ].lstrip()

        hit = draft_ref.search( text )
        if hit:
            found.setdefault( hit.group( 1 ), current_type )

        hit = series_ref.search( text )
        if hit:
            series, number = hit.groups()
            series = series.lower()
            if series == 'std' and std_token.search( text ) and idx > 15:
                # Pull in the previous line so a prefix such as "IEEE" or
                # "MIL-" that wrapped onto it is visible to the filter below.
                text = self.lines[ idx - 1 ].rstrip() + text
            if series != 'std' or not foreign_std.search( text ):
                found.setdefault( series + number.lstrip( '0' ), current_type )

    # Don't add any references that point back into this doc
    found.pop( self.filename, None )

    return found
def old_get_refs( self ):
    """Older, simpler reference extraction based on bracketed reference entries.

    Finds the earliest line after index 15 that looks like a numbered
    references heading (lines containing '. . .' or '...' are passed over),
    then gathers every paragraph from there on that begins with a bracketed
    tag such as "[RFC2119]".  A paragraph runs until the next blank line;
    continuation lines are joined with a space unless the text so far ends in
    "-" or "/".

    Returns a 4-tuple (normrefs, rfcrefs, draftrefs, refs):
      normrefs  - always an empty list
      rfcrefs   - sorted unique "rfcNNNN" names (first RFC per paragraph)
      draftrefs - draft names in order of first appearance (first per paragraph)
      refs      - sorted unique paragraph texts
    """
    heading = r"(?i)^ *[0-9.]+ *(((normative|informative|informational|non-normative) )?references|references\W+(normative|informative))"
    entry = r"(?i)^\[[a-z0-9.-]+( [a-z0-9.-]+)?\].+"
    total = len(self.lines)

    # Earliest qualifying heading past the front matter.
    start = None
    for idx in range(16, total):
        candidate = self.lines[idx]
        if '. . .' in candidate or '...' in candidate:
            continue
        if re.search(heading, candidate):
            start = idx
            break

    paragraphs = []
    rfc_names = []
    draft_names = []
    if start:
        for idx in range(start, total):
            first = self.lines[idx].strip()
            if not re.search(entry, first):
                continue
            # Absorb continuation lines up to the next blank line.  The outer
            # loop still visits those lines afterwards on its own.
            text = first
            for nxt in range(idx + 1, total):
                more = self.lines[nxt].strip()
                if not more:
                    break
                if text[-1] not in ["-", "/"]:
                    text += " "
                text += more
            paragraphs.append(text)

            rfc_hit = re.search(r"(?i)rfc ?\d+", text)
            if rfc_hit:
                rfc_names.append(rfc_hit.group(0).replace(" ","").lower())
            draft_hit = re.search(r"draft-[a-z0-9-]+", text)
            if draft_hit:
                name = draft_hit.group(0).lower()
                if name not in draft_names:
                    draft_names.append(name)

    return [], sorted(set(rfc_names)), draft_names, sorted(set(paragraphs))
# ----------------------------------------------------------------------
def getmeta(fn):
    """Extract the metadata fields of the plain-text Internet-Draft at path *fn*.

    The file is read as bytes, decoded as UTF-8 (falling back to Latin-1 when
    that fails) and parsed with PlaintextDraft.

    Returns a dict of field name -> value ("eventsource", "eventdate",
    "docrev", "doctitle", "docpages", "docauthors", "_authorlist",
    "docaffiliations", "docrfcrefs", "docdraftrefs", "doccreationdate", plus
    "doctag", "docheader", "docdeststatus" and "docabstract" when available).
    An empty dict is returned, after a warning, when the name contains a
    space, does not end in ".txt", or the file does not exist.
    """
    # Initial values
    fields = {}
    fields["eventsource"] = "draft"

    if " " in fn or not fn.endswith(".txt"):
        _warn("Skipping unexpected Internet-Draft name: '%s'" % (fn))
        return {}

    if not os.path.exists(fn):
        # Report the path we were asked for.  (This used to format a variable
        # that is only assigned when the file exists, so the warning itself
        # raised UnboundLocalError.)
        _warn("Could not find file: '%s'" % (fn))
        return {}
    filename = fn

    timestamp = time.strftime("%Y-%m-%dT%H:%M:%S+00:00", time.gmtime(os.stat(filename)[stat.ST_MTIME]))
    # Read the bytes exactly once: a second read() on the same handle would
    # start at EOF and return b"", making the Latin-1 fallback parse an
    # empty document.
    with io.open(filename, 'rb') as file:
        raw = file.read()
    try:
        text = raw.decode('utf8')
    except UnicodeDecodeError:
        text = raw.decode('latin1')
    draft = PlaintextDraft(text, filename)
    #_debug("\n".join(draft.lines))

    fields["eventdate"] = timestamp
    if draft.filename:
        fields["doctag"] = draft.filename
    fields["docrev"] = draft.revision

    fields["doctitle"] = draft.get_title()
    fields["docpages"] = str(draft.get_pagecount())
    fields["docauthors"] = ", ".join(draft.get_authors())
    fields["_authorlist"] = draft.get_author_list()
    fields["docaffiliations"] = ", ".join(draft.get_authors_with_firm())
    if opt_debug:
        fields["docheader"] = draft._docheader
    # Only the RFC and draft reference lists are reported.
    _normrefs, rfcrefs, draftrefs, _refs = draft.old_get_refs()
    fields["docrfcrefs"] = ", ".join(rfcrefs)
    fields["docdraftrefs"] = ", ".join(draftrefs)
    fields["doccreationdate"] = str(draft.get_creation_date())
    deststatus = draft.get_status()
    if deststatus:
        fields["docdeststatus"] = deststatus
    abstract = draft.get_abstract()
    if abstract:
        fields["docabstract"] = abstract

    return fields
# ----------------------------------------------------------------------
def _output(docname, fields, outfile=sys.stdout):
    """Write one record for *docname* with the non-empty entries of *fields*.

    The record starts with the optional event date (when opt_timestamp is
    set) and the base name of *docname*, followed by the fields in sorted key
    order.  "eventdate", keys starting with "_" and fields with a false value
    are left out.  With opt_attributes each field goes on its own
    "key: value" line; otherwise fields are appended as key='value' pairs on
    the same line.  Backslashes and single quotes in values are escaped.
    """
    def escaped(text):
        # Backslash first, so the backslash of "\x27" is not doubled again.
        return text.replace("\\", "\\\\" ).replace("'", "\\x27" )

    if opt_attributes:
        def emit(name):
            value = fields[name]
            # Multi-line values start on a fresh line and keep their layout.
            value = "\n" + value.rstrip() if "\n" in value else value.strip()
            outfile.write("%-24s: %s\n" % ( name, escaped(value)))
    else:
        def emit(name):
            value = escaped(fields[name].strip()).replace("\n", "\\n")
            outfile.write(" %s='%s'" % ( name.lower(), value))

    if opt_timestamp:
        outfile.write("%s " % (fields["eventdate"]))
    outfile.write("%s" % (os.path.basename(docname.strip())))

    for name in sorted(fields):
        if not fields[name] or name == "eventdate" or name.startswith("_"):
            continue
        emit(name)
    outfile.write("\n")
# ----------------------------------------------------------------------
def _printmeta(fn, outfile=sys.stdout):
    """Extract the metadata of draft file *fn* and write it to *outfile*.

    Nothing is written to *outfile* when getmeta() yields no fields.  With
    opt_trace set, the name without its last four characters and the elapsed
    time in seconds are written to stderr.
    """
    if opt_trace:
        started = time.time()
        sys.stderr.write("%-58s" % fn[:-4])

    meta = getmeta(fn)
    if meta:
        # Without a parsed draft name, fall back to the file name minus its
        # last seven characters.
        label = meta.get("doctag", fn[:-7])
        _output(label, meta, outfile)

    if opt_trace:
        elapsed = time.time() - started
        sys.stderr.write("%5.1f\n" % elapsed)
# ----------------------------------------------------------------------
# Main
# ----------------------------------------------------------------------
company_domain = {}  # type: Dict[str, str]

def _main(outfile=sys.stdout):
    """Command-line entry point: parse the options, then print draft metadata.

    Each remaining argument is either a draft file (base name starting with
    "draft-"), which is processed directly, or a list file (optionally
    gzip-compressed, or "-" for stdin) naming one draft per line; blank lines
    and lines starting with "#" are ignored.  With no arguments the list is
    read from stdin.  Output goes to *outfile*.

    Exits with status 1 on an option error or after printing the help text,
    and with status 0 after printing the version.
    """
    global opt_debug, opt_timestamp, opt_trace, opt_authorinfo, files, company_domain, opt_attributes
    # set default values, if any
    # ----------------------------------------------------------------------
    # Option processing
    # ----------------------------------------------------------------------
    # The OPTIONS part of the help text is harvested from this script's own
    # source: each option test further down, together with the comment that
    # trails it on the same line, becomes one help line.  Keep that layout.
    with io.open(sys.argv[0]) as script:
        script_source = script.read()
    options = ""
    for found in re.findall(r"\n +(if|elif) +opt in \[(.+)\]:\s+#(.+)\n", script_source):
        if not options:
            options += "OPTIONS\n"
        options += " %-16s %s\n" % (found[1].replace('"', ''), found[2])
    options = options.strip()

    # with ' < 1:' on the next line, this is a no-op:
    if len(sys.argv) < 1:
        print(__doc__ % dict(globals(), **locals()))
        sys.exit(1)

    try:
        opts, files = getopt.gnu_getopt(sys.argv[1:], "dhatTv", ["debug", "getauthors", "attribs", "attributes", "help", "timestamp", "notimestamp", "trace", "version",])
    except Exception as e:
        print("%s: %s" % (program, e))
        sys.exit(1)

    # parse options
    # NOTE(review): "--getauthors" is accepted by getopt above but has no
    # handler here, so it is silently ignored - confirm whether that is wanted.
    for opt, value in opts:
        if opt in ["-d", "--debug"]:  # Output debug information
            opt_debug = True
        elif opt in ["-h", "--help"]:  # Output this help text, then exit
            # Merge into a copy so the module namespace is not polluted with
            # this function's locals.
            print(__doc__ % dict(globals(), **locals()))
            sys.exit(1)
        elif opt in ["-v", "--version"]:  # Output version information, then exit
            print(program, version)
            sys.exit(0)
        elif opt in ["-a", "--attribs", "--attributes"]:  # Output key-value attribute pairs
            opt_attributes = True
        elif opt in ["-t", ]:  # Toggle leading timestamp information
            opt_timestamp = not opt_timestamp
        elif opt in ["--timestamp"]:  # Emit leading timestamp information
            opt_timestamp = True
        elif opt in ["--notimestamp"]:  # Omit leading timestamp information
            opt_timestamp = False
        elif opt in ["-T", "--trace"]:  # Emit trace information while working
            opt_trace = True

    company_domain = {}
    if not files:
        files = [ "-" ]

    for source in files:
        _debug( "Reading Internet-Drafts from '%s'" % source)
        if source == "-":
            stream = sys.stdin
        elif source.endswith(".gz"):
            import gzip
            # Text mode is required: gzip.open() defaults to binary, and the
            # bytes lines it yields break the str handling of the list below.
            stream = gzip.open(source, "rt")
        else:
            stream = io.open(source)

        try:
            stream_name = stream.name if source == "-" else source
            basename = os.path.basename(stream_name)
            if basename.startswith("draft-"):
                # The argument is itself a draft; getmeta() re-reads it by name.
                draft = basename
                _debug( "** Processing '%s'" % draft)
                _printmeta(stream_name, outfile)
            else:
                # The argument is a list of draft file names, one per line.
                for line in stream:
                    draft = line.strip()
                    if draft.startswith("#"):
                        continue
                    if draft:
                        _debug( "** Processing '%s'" % draft)
                        _printmeta(draft, outfile)
        finally:
            # Release every handle we opened ourselves; stdin is not ours.
            if stream is not sys.stdin:
                stream.close()
if __name__ == "__main__":
    # Script entry point.  KeyboardInterrupt is not an Exception subclass, so
    # it propagates untouched without needing a handler of its own.
    try:
        _main()
    except Exception as exc:
        # In debug mode show the full traceback; otherwise report the error
        # through _err().
        if opt_debug:
            raise
        _err(exc)