#!/usr/bin/python # Copyright The IETF Trust 2009-2021, All Rights Reserved # -*- coding: utf-8 -*- # -*- python -*- """ NAME %(program)s - Extract meta-information from an IETF draft. SYNOPSIS %(program)s [OPTIONS] DRAFTLIST_FILE DESCRIPTION Extract information about authors' names and email addresses, intended status and number of pages from Internet Drafts. The information is emitted in the form of a line containing xml-style attributes, prefixed with the name of the draft. %(options)s AUTHOR Written by Henrik Levkowetz, COPYRIGHT Copyright 2008 Henrik Levkowetz This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. There is NO WARRANTY; not even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. """ import debug # pyflakes: ignore import datetime import getopt import io import os import os.path import re import stat import sys import time from typing import Dict, List # pyflakes:ignore version = "0.35" program = os.path.basename(sys.argv[0]) progdir = os.path.dirname(sys.argv[0]) # ---------------------------------------------------------------------- # Data # ---------------------------------------------------------------------- opt_debug = False opt_timestamp = False opt_trace = False opt_authorinfo = False opt_getauthors = False opt_attributes = False # Don't forget to add the option variable to the globals list in _main below # The following is an alias list for short forms which starts with a # different letter than the long form. 
longform = {
    "Beth": "Elizabeth",
    "Bill": "William",
    "Bob": "Robert",
    "Dick": "Richard",
    "Fred": "Alfred",
    "Jerry": "Gerald",
    "Liz": "Elizabeth",
    "Lynn": "Carolyn",
    "Ned": "Edward",
    "Ted": "Edward",
}
# Keyed with a trailing space so replacement only hits whole words.
longform = dict([ (short+" ", longform[short]+" ") for short in longform ])

month_names = [ 'january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october', 'november', 'december' ]
month_names_abbrev3 = [ n[:3] for n in month_names ]
month_names_abbrev4 = [ n[:4] for n in month_names ]

# ----------------------------------------------------------------------
# Functions
# ----------------------------------------------------------------------

def _debug(string):
    # Emit a debug line to stderr when the module-level opt_debug flag is set.
    if opt_debug:
        sys.stderr.write("%s\n" % (string))

# ----------------------------------------------------------------------
def _note(string):
    sys.stdout.write("%s: %s\n" % (program, string))

# ----------------------------------------------------------------------
def _warn(string):
    sys.stderr.write("%s: Warning: %s\n" % (program, string))

# ----------------------------------------------------------------------
def _err(string):
    # Print an error message and terminate the process.
    sys.stderr.write("%s: Error: %s\n" % (program, string))
    sys.exit(1)

# ----------------------------------------------------------------------
def _gettext(file):
    """Read a file and normalize it: strip backspace-emphasis, line endings, tabs."""
    file = io.open(file)
    text = file.read()
    file.close()

    text = re.sub(".\x08", "", text)    # Get rid of inkribbon backspace-emphasis
    text = text.replace("\r\n", "\n")   # Convert DOS to unix
    text = text.replace("\r", "\n")     # Convert MAC to unix
    text = text.expandtabs()
    text = text.strip()

    return text

def acronym_match(s, l):
    """True if s equals the acronym formed by the capital letters of l."""
    acronym = re.sub("[^A-Z]", "", l)
    #_debug(" s:%s; l:%s => %s; %s" % (s, l, acronym, s==acronym))
    return s == acronym


class Draft:
    """Base class for drafts

    Extracted from PlaintextDraft, formerly named Draft.  If I missed part
    of its public interface that is relevant for other draft formats, those
    should be added to this base class.
    """
    REF_TYPE_NORMATIVE = 'norm'
    REF_TYPE_INFORMATIVE = 'info'
    REF_TYPE_UNKNOWN = 'unk'

    def get_abstract(self):
        raise NotImplementedError

    def get_author_list(self):
        raise NotImplementedError

    def get_authors(self):
        raise NotImplementedError

    def get_authors_with_firm(self):
        raise NotImplementedError

    def get_creation_date(self):
        raise NotImplementedError

    def get_formal_languages(self):
        raise NotImplementedError

    def get_pagecount(self):
        raise NotImplementedError

    def get_refs(self):
        raise NotImplementedError

    def get_status(self):
        raise NotImplementedError

    def get_title(self):
        raise NotImplementedError

    def get_wordcount(self):
        raise NotImplementedError


# ----------------------------------------------------------------------

class PlaintextDraft(Draft):

    def __init__(self, text, source, name_from_source=False):
        """Initialize a Draft instance

        :param text: plaintext draft contents
        :param source: name of file containing the contents
        :param name_from_source: if True, fall back to source to determine
          draft name not found from text
        """
        super().__init__()
        assert isinstance(text, str)
        self.source = source
        self.rawtext = text
        self.name_from_source = name_from_source

        text = re.sub(".\x08", "", text)    # Get rid of inkribbon backspace-emphasis
        text = text.replace("\r\n", "\n")   # Convert DOS to unix
        text = text.replace("\r", "\n")     # Convert MAC to unix
        text = text.strip()
        self.text = text
        self.errors = {}

        self.rawlines = self.text.split("\n")
        self.lines, self.pages = self._stripheaders()
        # Some things (such as the filename) has to be on the first page.  If
        # we didn't get back a set of pages, only one single page with the
        # whole document, then we need to do an enforced page split in order
        # to limit later searches to the first page.
        if len(self.pages) <= 1:
            self.pages = []
            for pagestart in range(0, len(self.lines), 56):
                self.pages += [ "\n".join(self.lines[pagestart:pagestart+56]) ]

        self.filename, self.revision = self._parse_draftname()

        # Lazily-computed caches for the get_* accessors below.
        self._authors = None
        self._authors_with_firm = None
        self._author_info = None
        self._abstract = None
        self._pagecount = None
        self._status = None
        self._creation_date = None
        self._title = None

    @classmethod
    def from_file(cls, source, *args, **kwargs):
        """Construct a PlaintextDraft from the contents of the file at source."""
        with open(source, 'r', encoding='utf8') as f:
            return cls(text=f.read(), source=source, *args, **kwargs)

    # ------------------------------------------------------------------
    def _parse_draftname(self):
        """Return (filename, revision) parsed from the first page, or from
        the source file name as a fallback."""
        draftname_regex = r"(draft-[a-z0-9-]*)-(\d\d)(\w|\.txt|\n|$)"
        draftname_match = re.search(draftname_regex, self.pages[0])
        if not draftname_match and self.name_from_source:
            draftname_match = re.search(draftname_regex, self.source)
        rfcnum_regex = r"(Re[qg]uests? [Ff]or Commm?ents?:? +|Request for Comments: RFC |RFC-|RFC )((# ?)?[0-9]+)( |,|\n|$)"
        rfcnum_match = re.search(rfcnum_regex, self.pages[0])
        if not rfcnum_match and self.name_from_source:
            rfcnum_match = re.search(rfcnum_regex, self.source)
        if draftname_match:
            return (draftname_match.group(1), draftname_match.group(2) )
        elif rfcnum_match:
            return ("rfc"+rfcnum_match.group(2), "")
        else:
            self.errors["draftname"] = "Could not find the draft name and revision on the first page."
            filename = ""
            revision = ""
            try:
                __, base = self.source.rsplit("/", 1)
            except ValueError:
                base = self.source
            if base.startswith("draft-"):
                if '.' in base:
                    name, __ = base.split(".", 1)
                else:
                    name = base
                revmatch = re.search(r"\d\d$", name)
                if revmatch:
                    filename = name[:-3]
                    revision = name[-2:]
                else:
                    filename = name
            return filename, revision

    # ----------------------------------------------------------------------
    def _stripheaders(self):
        """Split rawlines into (stripped_lines, pages), removing page
        headers/footers and form feeds while keeping paragraph structure."""
        stripped = []
        pages = []
        page = []
        line = ""
        newpage = False
        sentence = False
        shortprev = False
        blankcount = 0
        linecount = 0
        # two functions with side effects
        def striplines(p):
            # Trim leading and trailing blank lines of a page buffer.
            beg = end = 0
            for i in range(len(p)):
                l = p[i]
                if l.strip() == "":
                    continue
                else:
                    beg = i
                    break
            for i in range(len(p)-1,0,-1):
                l = p[i]
                if l.strip() == "":
                    continue
                else:
                    end = i
                    break
            return p[beg:end]
        def endpage(pages, page, newpage, line):
            # Close out the current page, including the footer line.
            if line:
                page += [ line ]
            return begpage(pages, page, newpage)
        def begpage(pages, page, newpage, line=None):
            # Start a new page; only keep pages with more than 5 content lines.
            if page and len(striplines(page)) > 5:
                pages += [ "\n".join(page) ]
                page = []
                newpage = True
            if line:
                page += [ line ]
            return pages, page, newpage
        for line in self.rawlines:
            linecount += 1
            line = line.rstrip()
            if re.search(r"\[?page [0-9ivx]+\]?[ \t\f]*$", line, re.I):
                pages, page, newpage = endpage(pages, page, newpage, line)
                continue
            if re.search(r"\f", line, re.I):
                pages, page, newpage = begpage(pages, page, newpage)
                continue
            if re.search(r"^ *Internet.Draft.+ .+[12][0-9][0-9][0-9] *$", line, re.I):
                pages, page, newpage = begpage(pages, page, newpage, line)
                continue
#            if re.search("^ *Internet.Draft  +", line, re.I):
#                newpage = True
#                continue
            if re.search(r"^ *Draft.+[12][0-9][0-9][0-9] *$", line, re.I):
                pages, page, newpage = begpage(pages, page, newpage, line)
                continue
            if re.search(r"^RFC[ -]?[0-9]+.*( +)[12][0-9][0-9][0-9]$", line, re.I):
                pages, page, newpage = begpage(pages, page, newpage, line)
                continue
            if re.search(r"^draft-[-a-z0-9_.]+.*[0-9][0-9][0-9][0-9]$", line, re.I):
                pages, page, newpage = endpage(pages, page, newpage, line)
                continue
            if linecount > 15 and re.search(r".{58,}(Jan|Feb|Mar|March|Apr|April|May|Jun|June|Jul|July|Aug|Sep|Oct|Nov|Dec) (19[89][0-9]|20[0-9][0-9]) *$", line, re.I):
                pages, page, newpage = begpage(pages, page, newpage, line)
                continue
            if newpage and re.search(r"^ *draft-[-a-z0-9_.]+ *$", line, re.I):
                pages, page, newpage = begpage(pages, page, newpage, line)
                continue
            if re.search(r"^[^ \t]+", line):
                sentence = True
            if re.search(r"[^ \t]", line):
                if newpage:
                    # 36 is a somewhat arbitrary count for a 'short' line
                    shortthis = len(line.strip()) < 36 # 36 is a somewhat arbitrary count for a 'short' line
                    if sentence or (shortprev and not shortthis):
                        stripped += [""]
                else:
                    if blankcount:
                        stripped += [""]*blankcount
                blankcount = 0
                sentence = False
                newpage = False
            shortprev = len(line.strip()) < 36 # 36 is a somewhat arbitrary count for a 'short' line
            if re.search("[.:]$", line):
                sentence = True
            if re.search("^[ \t]*$", line):
                blankcount += 1
                page += [ line ]
                continue
            page += [ line ]
            stripped += [ line ]
        pages, page, newpage = begpage(pages, page, newpage)
        _debug('pages: %s' % len(pages))
        return stripped, pages

    # ----------------------------------------------------------------------
    def get_pagecount(self):
        """Page count: prefer explicit [page N] labels when they look plausible."""
        if self._pagecount == None:
            label_pages = len(re.findall(r"\[page [0-9ixldv]+\]", self.text, re.I))
            count_pages = len(self.pages)
            if label_pages > count_pages/2:
                self._pagecount = label_pages
            else:
                self._pagecount = count_pages
        return self._pagecount

    # ------------------------------------------------------------------
    def get_wordcount(self):
        count = 0
        # match any sequence of non-white-space characters like the Unix command "wc"
        word_re = re.compile(r'\S+', re.UNICODE)
        for l in self.lines:
            count += sum(1 for _ in word_re.finditer(l))
        return count

    # ------------------------------------------------------------------
    def get_formal_languages(self):
        """Return names of formal languages (abnf, asn1, ...) whose patterns
        all occur somewhere in the document."""
        language_regexps = [
            ("abnf", [re.compile(r"\bABNF"), re.compile(r" +[a-zA-Z][a-zA-Z0-9_-]* +=[/ ]")]),
            ("asn1", [re.compile(r'DEFINITIONS +::= +BEGIN')]),
            ("cbor", [re.compile(r'\b(?:CBOR|CDDL)\b'), re.compile(r" +[a-zA-Z][a-zA-Z0-9_-]* += +[\{\[\(]")]),
            ("ccode", [re.compile(r"(?:\+\+\))|(?:for \(i)|(?: [!=]= 0\) \{)|(?: struct [a-zA-Z_0-9]+ \{)")]),
            ("json", [re.compile(r'\bJSON\b'), re.compile(r" \"[^\"]+\" ?: [a-zA-Z0-9\.\"\{\[]")]),
            ("xml", [re.compile(r"<\?xml")]),
        ]
        already_matched = set()
        for l in self.lines:
            for lang_name, patterns in language_regexps:
                for p in patterns:
                    if p not in already_matched and p.search(l):
                        already_matched.add(p)
        return [ lang_name for lang_name, patterns in language_regexps
                 if all(p in already_matched for p in patterns) ]

    # ----------------------------------------------------------------------
    def get_status(self):
        """Return the 'Intended status:' value from the first few lines, if any."""
        if self._status == None:
            for line in self.lines[:10]:
                status_match = re.search(r"^\s*Intended [Ss]tatus:\s*(.*?) ", line)
                if status_match:
                    self._status = status_match.group(1)
                    break
        return self._status

    # ------------------------------------------------------------------
    def get_creation_date(self):
        """Return the draft's creation date parsed from the first page.

        NOTE(review): the named groups in these regexes (``month``, ``day``,
        ``year``) had been stripped out of the recovered source (leaving
        invalid ``(?P\\w+)`` patterns); they are restored here to match the
        ``groupdict()`` usage below.
        """
        if self._creation_date:
            return self._creation_date
        date_regexes = [
            r'^(?P<month>\w+)\s(?P<day>\d{1,2})(,|\s)+(?P<year>\d{4})',
            r'^(?P<day>\d{1,2})(,|\s)+(?P<month>\w+)\s(?P<year>\d{4})',
            r'^(?P<day>\d{1,2})-(?P<month>\w+)-(?P<year>\d{4})',
            r'^(?P<month>\w+)\s(?P<year>\d{4})',
            r'\s{3,}(?P<month>\w+)\s(?P<day>\d{1,2})(,|\s)+(?P<year>\d{4})',
            r'\s{3,}(?P<day>\d{1,2})(,|\s)+(?P<month>\w+)\s(?P<year>\d{4})',
            r'\s{3,}(?P<day>\d{1,2})-(?P<month>\w+)-(?P<year>\d{4})',
            # RFC 3339 date (also ISO date)
            r'\s{3,}(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})',
            # 'October 2008' - default day to today's.
            r'\s{3,}(?P<month>\w+)\s(?P<year>\d{4})',
        ]

        dates = []
        text = self.pages[0]
        for regex in date_regexes:
            match = re.search(regex, text, re.MULTILINE)
            if match:
                start = match.start()
                if not "expires" in text[start-10:start].lower():
                    dates += [(start, match)]
        dates.sort()
        for start, match in dates:
            md = match.groupdict()
            mon = md['month'].lower()
            day = int( md.get( 'day', 0 ) )
            year = int( md['year'] )
            try:
                if   mon in month_names:
                    month = month_names.index( mon ) + 1
                elif mon in month_names_abbrev3:
                    month = month_names_abbrev3.index( mon ) + 1
                elif mon in month_names_abbrev4:
                    month = month_names_abbrev4.index( mon ) + 1
                elif mon.isdigit() and int(mon) in range(1,13):
                    month = int(mon)
                else:
                    continue
                today = datetime.date.today()
                if day==0:
                    # if the date was given with only month and year, use
                    # today's date if month and year is today's month and
                    # year, otherwise pick the middle of the month.
                    # Don't use today's day for month and year in the past
                    if month==today.month and year==today.year:
                        day = today.day
                    else:
                        day = 15
                self._creation_date = datetime.date(year, month, day)
                return self._creation_date
            except ValueError:
                # mon abbreviation not in _MONTH_NAMES
                # or month or day out of range
                pass
        self.errors['creation_date'] = 'Creation Date field is empty or the creation date is not in a proper format.'
        return self._creation_date

    # ------------------------------------------------------------------
    def get_abstract(self):
        """Return the text of the Abstract section, cleaned of boilerplate."""
        if self._abstract:
            return self._abstract
        abstract_re = re.compile(r'^(\s*)abstract', re.I)
        # Headers that terminate the abstract when they follow a blank line.
        header_re = re.compile(r"^(\s*)([0-9]+\.? |Appendix|Status of|Table of|Full Copyright|Copyright|Intellectual Property|Acknowled|Author|Index|Disclaimer).*", re.I)
        begin = False
        abstract = []
        abstract_indent = 0
        look_for_header = False
        for line in self.lines:
            if not begin:
                if abstract_re.match(line):
                    begin=True
                    abstract_indent = len(abstract_re.match(line).group(0))
                continue
            if begin:
                if not line and not abstract:
                    continue
                if not line:
                    look_for_header=True
                    abstract.append(line)
                    continue
                if look_for_header and header_re.match(line):
                    break
                look_for_header = False
                abstract.append(line)
        abstract = '\n'.join(abstract)
        abstract = self._clean_abstract(abstract)
        self._abstract = self._check_abstract_indent(abstract, abstract_indent)
        return self._abstract

    def _check_abstract_indent(self, abstract, indent):
        # If >90% of the lines share one indentation, drop trailing lines
        # with a different indentation (they belong to the next section).
        indentation_re = re.compile(r'^(\s)*')
        indent_lines = []
        for line in abstract.split('\n'):
            if line:
                indent = len(indentation_re.match(line).group(0))
                indent_lines.append(indent)
        percents = {}
        total = float(len(indent_lines))
        formated = False
        for indent in set(indent_lines):
            count = indent_lines.count(indent)/total
            percents[indent] = count
            if count > 0.9:
                formated = True
        if not formated:
            return abstract
        new_abstract = []
        for line in abstract.split('\n'):
            if line:
                indent = len(indentation_re.match(line).group(0))
                if percents[indent] < 0.9:
                    break
            new_abstract.append(line)
        return '\n'.join(new_abstract)

    def _clean_abstract(self, text):
        text = re.sub("(?s)(Conventions [Uu]sed in this [Dd]ocument|Requirements [Ll]anguage)?[\n ]*The key words \"MUST\", \"MUST NOT\",.*$", "", text)
        # Get rid of status/copyright boilerplate
        text = re.sub("(?s)\nStatus of [tT]his Memo\n.*$", "", text)
        # wrap long lines without messing up formatting of Ok paragraphs:
        while re.match("([^\n]{72,}?) +", text):
            text = re.sub("([^\n]{72,}?) +([^\n ]*)(\n|$)", "\\1\n\\2 ", text)
        return text

    # ------------------------------------------------------------------
    def get_authors(self):
        """Returns a list of strings with author name and email within angle brackets"""
        if self._authors == None:
            self.extract_authors()
        return self._authors

    def get_authors_with_firm(self):
        """Returns a list of strings with author name and email within angle brackets"""
        if self._authors_with_firm == None:
            self.extract_authors()
        return self._authors_with_firm

    def get_author_list(self):       # () -> List[List[str, str, str, str, str, str, str]]
        """Returns a list of tuples, with each tuple containing (given_names,
        surname, email, company).  Email will be None if unknown.
        """
        if self._author_info == None:
            self.extract_authors()
        return self._author_info

    def extract_authors(self):
        """Extract author information from draft text.

        NOTE(review): several regex named groups and ``<at>``/``<dot>``
        alternatives in this method had been stripped from the recovered
        source (angle-bracketed spans were lost); they are restored here to
        match the ``match.group("author")``/``group("company")`` usage and
        the obfuscated-email handling below.
        """
        aux = {
            "honor" : r"(?:[A-Z]\.|Dr\.?|Dr\.-Ing\.|Prof(?:\.?|essor)|Sir|Lady|Dame|Sri)",
            "prefix": r"([Dd]e|Hadi|van|van de|van der|Ver|von|[Ee]l)",
            "suffix": r"(jr.?|Jr.?|II|2nd|III|3rd|IV|4th)",
            "first" : r"([A-Z][-A-Za-z'`~]*)(( ?\([A-Z][-A-Za-z'`~]*\))?(\.?[- ]{1,2}[A-Za-z'`~]+)*)",
            "last"  : r"([-A-Za-z'`~]{2,})",
            "months": r"(January|February|March|April|May|June|July|August|September|October|November|December)",
            "mabbr" : r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\.?",
            }
        authcompanyformats = [
            r" {6}(?P<author>(%(first)s[ \.]{1,3})+((%(prefix)s )?%(last)s)( %(suffix)s)?), (?P<company>[^.]+\.?)$" % aux,
            r" {6}(?P<author>(%(first)s[ \.]{1,3})+((%(prefix)s )?%(last)s)( %(suffix)s)?) *\((?P<company>[^.]+\.?)\)$" % aux,
            ]
        authformats = [
            r" {6}((%(first)s[ \.]{1,3})+((%(prefix)s )?%(last)s)( %(suffix)s)?)(, ([^.]+\.?|\([^.]+\.?|\)))?,?$" % aux,
            r" {6}(((%(prefix)s )?%(last)s)( %(suffix)s)?, %(first)s)?$" % aux,
            r" {6}(%(last)s)$" % aux,
            ]
        multiauthformats = [
            (
                r" {6}(%(first)s[ \.]{1,3}((%(prefix)s )?%(last)s)( %(suffix)s)?)(, ?%(first)s[ \.]{1,3}((%(prefix)s )?%(last)s)( %(suffix)s)?)+$" % aux,
                r"(%(first)s[ \.]{1,3}((%(prefix)s )?%(last)s)( %(suffix)s)?)" % aux
            ),
            ]
        editorformats = [
            r"(?:, | )([Ee]d\.?|\([Ee]d\.?\)|[Ee]ditor)$",
            ]
        companyformats = [
            r" {6}(([A-Za-z'][-A-Za-z0-9.& ']+)(,? ?(Inc|Ltd|AB|S\.A)\.?))$",
            r" {6}(([A-Za-z'][-A-Za-z0-9.& ']+)(/([A-Za-z'][-A-Za-z0-9.& ']+))+)$",
            r" {6}([a-z0-9.-]+)$",
            r" {6}(([A-Za-z'][-A-Za-z0-9.&']+)( [A-Za-z'][-A-Za-z0-9.&']+)*)$",
            r" {6}(([A-Za-z'][-A-Za-z0-9.']+)( & [A-Za-z'][-A-Za-z0-9.']+)*)$",
            r" {6}\((.+)\)$",
            r" {6}(\w+\s?\(.+\))$",
            ]

        dateformat = r"(((%(months)s|%(mabbr)s) \d+, |\d+ (%(months)s|%(mabbr)s),? |\d+/\d+/)\d\d\d\d|\d\d\d\d-\d\d-\d\d)$" % aux

        address_section = r"^ *([0-9]+\.)? *(Author|Editor)('s|s'|s|\(s\)) (Address|Addresses|Information)"

        ignore = [
            "Standards Track", "Current Practice", "Internet Draft", "Working Group",
            "Expiration Date",
            ]

        def make_authpat(hon, first, last, suffix):
            # Build a permissive pattern matching this author's name as it
            # may appear in the authors' address section.
            def dotexp(s):
                s = re.sub(r"\. ",    r"\\w* ", s)
                s = re.sub(r"\.$",    r"\\w*", s)
                s = re.sub(r"\.(\w)", r"\\w* \1", s)
                return s
            first = dotexp(first)
            last = dotexp(last)
            first = re.sub("[()]", " ", first)
            if " " in first:
                # if there's a middle part, let it be optional
                first, middle = first.split(" ", 1)
                first = "%s( +%s)?" % (first, middle)

            # Double names (e.g., Jean-Michel) are abbreviated as two letter
            # connected by a dash -- let this expand appropriately
            first = re.sub(r"^([A-Z])-([A-Z])\\w\*", r"\1.*-\2.*", first)

            # Some chinese names are shown with double-letter(latin) abbreviated given names, rather than
            # a single-letter(latin) abbreviation:
            first = re.sub(r"^([A-Z])[A-Z]+\\w\*", r"\1[-\\w]+", first)

            # permit insertion of middle names between first and last, and
            # add possible honorific and suffix information
            authpat = r"(?:^| and )(?:%(hon)s ?)?(['`]*%(first)s\S*( +[^ ]+)* +%(last)s)( *\(.*|,( [A-Z][-A-Za-z0-9]*)?| %(suffix)s| [A-Z][a-z]+)?" % {"hon":hon, "first":first, "last":last, "suffix":suffix,}
            return authpat

        authors = []
        companies = []
        companies_seen = []
        self._docheader = ""

        # Collect first-page author information first
        have_blankline = False
        have_draftline = False
        prev_blankline = False
        for line in self.lines[:30]:
            self._docheader += line+"\n"
            author_on_line = False

            _debug( " ** " + line)
            leading_space = len(re.findall("^ *", line)[0])
            line_len = len(line.rstrip())
            trailing_space = line_len <= 72 and 72 - line_len or 0
            # Truncate long lines at the first space past column 80:
            trunc_space = line.find(" ", 80)
            if line_len > 80 and trunc_space > -1:
                line = line[:trunc_space]
            if line_len > 60:
                # Look for centered title, break if found:
                if (leading_space > 5 and abs(leading_space - trailing_space) < 5):
                    _debug("Breaking for centered line")
                    break
            if re.search(dateformat, line):
                if authors:
                    # NOTE(review): the message suggests a break, but the
                    # recovered source contains no break statement here.
                    _debug("Breaking for dateformat after author name")
            for editorformat in editorformats:
                if re.search(editorformat, line):
                    line = re.sub(editorformat, "", line)
                    break
            for lineformat, authformat in multiauthformats:
                match = re.search(lineformat, line)
                if match:
                    _debug("a. Multiauth format: '%s'" % lineformat)
                    author_list = re.findall(authformat, line)
                    authors += [ a[0] for a in author_list ]
                    companies += [ None for a in author_list ]
                    author_on_line = True
                    #_debug("\nLine:   " + line)
                    #_debug("Format: " + authformat)
                    for author in author_list:
                        _debug("Author: '%s'" % author[0])
                    break
            if not author_on_line:
                for lineformat in authcompanyformats:
                    match = re.search(lineformat, line)
                    if match:
                        _debug("b. Line format: '%s'" % lineformat)
                        maybe_company = match.group("company").strip(" ,.")
                        # is the putative company name just a partial name, i.e., a part
                        # that commonly occurs after a comma as part of a company name,
                        # as in "Foo Bar, Inc."?  If so, skip; else assume there's a
                        # company name after the comma.
                        if not maybe_company in ["Inc", "Ltd", "S.A", "AG", "AB", "N.V", ]:
                            author = match.group("author")
                            company = match.group("company")
                            authors += [ author, '']
                            companies += [ None, company ]
                            #_debug("\nLine:   " + line)
                            #_debug("Format: " + authformat)
                            _debug("Author: '%s'" % author)
                            _debug("Company: '%s'" % company)
                            author_on_line = True
                            break
            if not author_on_line:
                for authformat in authformats:
                    match = re.search(authformat, line)
                    if match:
                        _debug("c. Auth format: '%s'" % authformat)
                        author = match.group(1)
                        authors += [ author ]
                        companies += [ None ]
                        #_debug("\nLine:   " + line)
                        #_debug("Format: " + authformat)
                        _debug("Author: '%s'" % author)
                        author_on_line = True
                        break
            if not author_on_line:
                for authformat in companyformats:
                    match = re.search(authformat, line)
                    if match:
                        _debug("d. Company format: '%s'" % authformat)
                        company = match.group(1)
                        authors += [ "" ]
                        companies += [ company ]
                        #_debug("\nLine:   " + line)
                        #_debug("Format: " + authformat)
                        _debug("Company: '%s'" % company)
                        break
            if authors and not author_on_line:
                # Retain information about blank lines in author list
                authors += [""]
                companies += [ "" ]
            if line.strip() == "":
                if prev_blankline and authors:
                    _debug("Breaking, having found consecutive blank lines after author name")
                    break
                if authors:
                    have_blankline = True
                    prev_blankline = True
            else:
                prev_blankline = False
            if "draft-" in line:
                have_draftline = True
            if have_blankline and have_draftline:
                _debug("Breaking, having found both blank line and draft-name line")
                break

        # remove trailing blank entries in the author list:
        for i in range(len(authors)-1,-1,-1):
            if authors[i] == "" and companies[i] == "":
                del authors[i]
                del companies[i]
            else:
                break

        _debug("A:companies : %s" % str(companies))
        #companies = [ None if a else '' for a in authors ]
        #_debug("B:companies : %s" % str(companies))
        #find authors' addresses section if it exists
        _debug("B:authors   : %s" % str(authors))

        last_line = len(self.lines)-1
        address_section_pos = last_line//2
        for i in range(last_line//2,last_line):
            line = self.lines[i]
            if re.search(address_section, line):
                address_section_pos = i
                break

        found_pos = []
        company_or_author = None
        for i in range(len(authors)):
            _debug("1: authors[%s]: %s" % (i, authors[i]))
            _debug("   company[%s]: %s" % (i, companies[i]))
            author = authors[i]
            if i+1 < len(authors):
                company_or_author = authors[i+1]
            else:
                company_or_author = None
            if author in [ None, '', ]:
                continue
            suffix_match = re.search(" %(suffix)s$" % aux, author)
            if suffix_match:
                suffix = suffix_match.group(1)
                author = author[:-len(suffix)].strip()
            else:
                suffix = None
            if "," in author:
                last, first = author.split(",",1)
                author = "%s %s" % (first.strip(), last.strip())
            if not " " in author:
                if "." in author:
                    first, last = author.rsplit(".", 1)
                    first += "."
                else:
                    # Single-word name: match any capitalized first name.
                    author = "[A-Z].+ " + author
                    first, last = author.rsplit(" ", 1)
            else:
                if "." in author:
                    first, last = author.rsplit(".", 1)
                    first += "."
                else:
                    first, last = author.rsplit(" ", 1)
                if "." in first and not ". " in first:
                    first = first.replace(".", ". ").strip()
            first = first.strip()
            last = last.strip()
            prefix_match = re.search(" %(prefix)s$" % aux, first)
            if prefix_match:
                prefix = prefix_match.group(1)
                first = first[:-len(prefix)].strip()
                last = prefix+" "+last
            _debug("First, Last: '%s' '%s'" % (first, last))
            for firstname, surname, casefixname in [ (first,last,last), (last,first,first), (first,last,last.upper()), (last,first,first.upper()), ]:
                for left, right in [(firstname, casefixname), (casefixname, firstname)]:
                    author = "%s %s" % (left, right)
                    _debug("\nAuthors: "+str(authors))
                    _debug("Author: "+author)

                    # Pattern for full author information search, based on first page author name:
                    authpat = make_authpat(aux['honor'], left, right, aux['suffix'])
                    _debug("Authpat: " + authpat)
                    start = 0
                    col = None

                    # Find start of author info for this author (if any).
                    # Scan towards the front from the end of the file, looking for a match to authpath
                    for j in range(last_line, address_section_pos, -1):
                        line = self.lines[j]
                        _debug( "Line: " + line)
                        forms = [ line ] + [ line.replace(short, longform[short]) for short in longform if short in line ]
                        for form in forms:
                            try:
                                if re.search(authpat, form.strip()) and not j in found_pos:
                                    _debug( "Match")
                                    start = j
                                    found_pos += [ start ]
                                    _debug( " ==> start %s, normalized '%s'" % (start, form.strip()))
                                    # The author info could be formatted in multiple columns...
                                    columns = re.split("(    +| and )", form)
                                    # _debug( "Columns:" + str(columns))
                                    # Find which column:
                                    # _debug( "Col range:" + str(range(len(columns))))
                                    cols = [ c for c in range(len(columns)) if re.search(authpat+r"( and |, |$)", columns[c].strip()) ]
                                    if cols:
                                        col = cols[0]
                                        if not (start, col) in found_pos:
                                            found_pos += [ (start, col) ]
                                        _debug( "Col:   %d" % col)
                                        beg = len("".join(columns[:col]))
                                        _debug( "Beg:   %d '%s'" % (beg, "".join(columns[:col])))
                                        _debug( "Len:   %d" % len(columns))
                                        if col == len(columns) or col == len(columns)-1:
                                            end = None
                                            _debug( "End1:  %s" % end)
                                        else:
                                            end = beg + len("".join(columns[col:col+2]))
                                            _debug( "End2:  %d '%s'" % (end, "".join(columns[col:col+2])))
                                        _debug( "Cut:   '%s'" % form[beg:end])
                                        author_match = re.search(authpat, columns[col].strip()).group(1)
                                        _debug( "AuthMatch: '%s'" % (author_match,))
                                        if re.search(r'\(.*\)$', author_match.strip()):
                                            author_match = author_match.rsplit('(',1)[0].strip()
                                        if author_match in companies_seen:
                                            companies[i] = authors[i]
                                            authors[i] = None
                                        else:
                                            fullname = author_match
                                            #if casefixname in author_match:
                                            #    fullname = author_match.replace(casefixname, surname)
                                            #else:
                                            #    fullname = author_match
                                            fullname = re.sub(" +", " ", fullname)
                                            if left == firstname:
                                                given_names, surname = fullname.rsplit(None, 1)
                                            else:
                                                surname, given_names = fullname.split(None, 1)
                                            if " " in given_names:
                                                first, middle = given_names.split(None, 1)
                                            else:
                                                first = given_names
                                                middle = None
                                            names = (first, middle, surname, suffix)
                                            if suffix:
                                                fullname = fullname+" "+suffix
                                            # Check that the pieces can be reassembled into the matched text:
                                            for names in [
                                                    (first, middle, surname, suffix),
                                                    (first, surname, middle, suffix),
                                                    (middle, first, surname, suffix),
                                                    (middle, surname, first, suffix),
                                                    (surname, first, middle, suffix),
                                                    (surname, middle, first, suffix),
                                                    ]:
                                                parts = [ n for n in names if n ]
                                                if (" ".join(parts) == fullname):
                                                    authors[i] = (fullname, first, middle, surname, suffix)
                                                    companies[i] = None
                                                    break
                                            else:
                                                _warn("Author tuple doesn't match text in draft: %s, %s" % (authors[i], fullname))
                                                authors[i] = None
                                        break
                            except AssertionError:
                                sys.stderr.write("filename: "+self.filename+"\n")
                                sys.stderr.write("authpat: "+authpat+"\n")
                                raise
                        if start and col != None:
                            break
                    if start and col != None:
                        break
                if start and col != None:
                    break
            # End for:
            if not authors[i]:
                continue
            _debug("2: authors[%s]: %s" % (i, authors[i]))
            if start and col != None:
                _debug("\n * %s" % (authors[i], ))
                nonblank_count = 0
                blanklines = 0
                email = None
                country = None
                for line_offset, line in enumerate(self.lines[start+1:]):
                    _debug( "       " + line.strip())
                    # Break on the second blank line
                    if not line:
                        blanklines += 1
                        if blanklines >= 3:
                            _debug( " - Break on blanklines")
                            break
                        else:
                            continue
                    else:
                        nonblank_count += 1

                    # Maybe break on author name
#                    _debug("Line: %s"%line.strip())
#                    for a in authors:
#                        if a and a not in companies_seen:
#                            _debug("Search for: %s"%(r"(^|\W)"+re.sub("\.? ", ".* ", a)+"(\W|$)"))
                    authmatch = [ a for a in authors[i+1:] if a and not a.lower() in companies_seen and (re.search((r"(?i)(^|\W)"+re.sub(r"[. ]+", ".*", a)+r"(\W|$)"), line.strip()) or acronym_match(a, line.strip()) )]

                    if authmatch:
                        _debug("     ? Other author or company ?  : %s" % authmatch)
                        _debug("     Line: "+line.strip())
                        _debug("     C or A: %s"%company_or_author)
                        if nonblank_count == 1 or (nonblank_count == 2 and not blanklines) or (company_or_author==line.strip() and not blanklines):
                            # First line after an author -- this is a company
                            companies_seen += [ c.lower() for c in authmatch ]
                            companies_seen += [ line.strip().lower() ] # XXX fix this for columnized author list
                            companies_seen = list(set(companies_seen))
                            _debug("       -- Companies: " + ", ".join(companies_seen))
                            for k in range(i+1, len(authors)):
                                if authors[k] and authors[k].lower() in companies_seen:
                                    companies[k] = authors[k]
                                    authors[k] = None
                        elif blanklines and not "@" in line:
                            # Break on an author name
                            _debug( " - Break on other author name")
                            break
                        else:
                            pass

                    def columnify(l):
                        # Cut out this author's column and de-obfuscate
                        # common email address disguises.
                        try:
                            column = l.replace('\t', 8 * ' ')[max(0, beg - 1):end].strip()
                        except:
                            column = l
                        column = re.sub(r" *(?:\(at\)|<at>| at ) *", "@", column)
                        column = re.sub(r" *(?:\(dot\)|<dot>| dot ) *", ".", column)
                        column = re.sub(r"&cisco.com", "@cisco.com", column)
                        column = column.replace("\xa0", " ")
                        return column

                    column = columnify(line)

#                    if re.search("^\w+: \w+", column):
#                        keyword = True
#                    else:
#                        if keyword:
#                            # Break on transition from keyword line to something else
#                            _debug( " - Break on end of keywords")
#                            break

                    #_debug( "  Column text :: " + column)
                    if nonblank_count >= 2 and blanklines == 0:
                        # Usually, the contact info lines will look
                        # like this: "Email: someone@example.com" or
                        # "Tel: +1 (412)-2390 23123", but sometimes
                        # the : is left out. That's okay for things we
                        # can't misinterpret, but "tel" may match "Tel
                        # Aviv 69710, Israel" so match
                        # - misc contact info
                        # - tel/fax [number]
                        # - [phone number]
                        # - [email]
                        other_contact_info_regex = re.compile(r'^(((contact )?e|\(e|e-|m|electronic )?mail|email_id|mailto|e-main|(tele)?phone|voice|mobile|work|uri|url|tel:)\b|^((ph|tel\.?|telefax|fax) *[:.]? *\(?( ?\+ ?)?[0-9]+)|^(\++[0-9]+|\(\+*[0-9]+\)|\(dsn\)|[0-9]+)([ -.]*\b|\b[ -.]*)(([0-9]{2,}|\([0-9]{2,}\)|(\([0-9]\)|[0-9])[ -][0-9]{2,}|\([0-9]\)[0-9]+)([ -.]+([0-9]+|\([0-9]+\)))+|([0-9]{7,}|\([0-9]{7,}\)))|^(?:<?[-a-z0-9._+]+@[-a-z0-9_.]+>?)|^https?://|^www\.')
                        next_line_index = start + 1 + line_offset + 1
                        if (not country
                                and not other_contact_info_regex.search(column.lower())
                                and next_line_index < len(self.lines)):
                            next_line_lower = columnify(self.lines[next_line_index]).lower().strip()
                            if not next_line_lower or other_contact_info_regex.search(next_line_lower):
                                # country should be here, as the last
                                # part of the address, right before an
                                # empty line or other contact info
                                country = column.strip() or None
                                _debug("  Country: %s" % country)
                    _debug("3: authors[%s]: %s" % (i, authors[i]))

                    emailmatch = re.search("[-A-Za-z0-9_.+]+@[-A-Za-z0-9_.]+", column)
                    if emailmatch and not "@" in author:
                        email = emailmatch.group(0).lower()
                        break
                authors[i] = authors[i] + ( email, country)
            else:
                if not author in ignore:
                    companies[i] = authors[i]
                    _debug("Not an author? '%s'" % (author))
                authors[i] = None

        assert(len(authors) == len(companies))
        _debug('Author list: %s' % authors)
        _debug('Company list: %s' % companies)
        for i in range(len(authors)):
            if authors[i]:
                _debug('authors[%s]: %s' % (i, authors[i]))
                company = ''
                for k in range(i+1, len(companies)):
                    _debug('companies[%s]: %s' % (k, companies[k]))
                    if companies[k] != None:
                        company = companies[k]
                        break
                authors[i] = authors[i] + ( company, )
        authors = [ a for a in authors if a ]
        _debug(" * Final author tuples: %s" % (authors,))
        _debug(" * Final company list: %s" % (companies,))
        _debug(" * Final companies_seen: %s" % (companies_seen,))
        self._author_info = authors
        self._authors_with_firm = [ "%s <%s> (%s)"%(full,email,company) for full,first,middle,last,suffix,email,country,company in authors ] # pyflakes:ignore
        self._authors = [ "%s <%s>"%(full,email) if email else full for full,first,middle,last,suffix,email,country,company in authors ]
        self._authors.sort()
        _debug(" * Final author list: " + ", ".join(self._authors))
        _debug("-"*72)

    # ------------------------------------------------------------------
    def get_title(self):
        """Return the document title from the first page.

        NOTE(review): the tail of this method was garbled in the source this
        was recovered from (an angle-bracketed span was stripped); the regex
        tail and the fallback/cleanup below are reconstructed -- verify
        against the original before relying on exact matching behavior.
        """
        if self._title:
            return self._title
        match = re.search(r'(?:\n\s*\n\s*)((.+\n){0,2}(.+\n*))(\s+<?draft-\S+\s*\n)', self.pages[0])
        if match:
            title = match.group(1)
            title = title.strip()
            title = re.sub(r'\s+', ' ', title)
            self._title = title
            return self._title
        self.errors["title"] = "Could not find the title on the first page."

    # ------------------------------------------------------------------
    def get_refs( self ):
        """Return a dict mapping referenced draft/RFC names to a reference
        type (normative/informative/unknown).

        NOTE(review): the pattern definitions and the detection of the start
        of the references section were garbled in the recovered source and
        have been reconstructed to be consistent with the surviving body
        below -- verify the exact patterns against the original.
        """
        refs = {}
        in_ref_sect = False
        in_norm_ref_sect = False
        refType = self.REF_TYPE_UNKNOWN

        # Section-heading shapes: "3. ... References", "A.1 ... References",
        # or a bare "References" heading.
        sectionre = re.compile( r'(?i)^ *(\d+(\.\d+)*\.? )?.*\breferences\b.*$' )
        sectionre2 = re.compile( r'(?i)^ *(appendix +)?[a-z](\.\d+)*\.? .*\breferences\b.*$' )
        sectionre3 = re.compile( r'(?i)^ *\breferences\b.*$' )
        eol = re.compile( r'-$' )                                # reference split across lines
        idref = re.compile( r'\b(draft-[-a-zA-Z0-9]+)' )         # internet-draft name
        rfcref = re.compile( r'(?i)\b(rfc|std|bcp|fyi)[- ]?(\d+)\b' )
        std_start = re.compile( r'^\[?STD' )
        not_our_std_ref = re.compile( r'(?i)\b(standards track|internet standard)\b' )

        for i in range( len( self.lines ) ):
            line = self.lines[ i ].strip()

            # look for the start of a references section
            if not in_ref_sect:
                m = sectionre.match( line )
                if not m:
                    m = sectionre2.match( line )
                    if not m:
                        m = sectionre3.match( line )
                if m:
                    in_ref_sect = True
                    if line.lower().find("normative") > 1:
                        in_norm_ref_sect = True
                        refType = self.REF_TYPE_NORMATIVE

            # might be subsections within a references section
            if in_ref_sect and not in_norm_ref_sect:
                m = sectionre.match( line )
                if not m:
                    m = sectionre2.match( line )
                    if not m:
                        m = sectionre3.match( line )
                if m:
                    in_ref_sect = True
                    if line.lower().find("normative") > 1:
                        in_norm_ref_sect = True
                        refType = self.REF_TYPE_NORMATIVE

            # look for the end of the normative reference section
            if in_norm_ref_sect:
                m = sectionre.match( line )
                if not m:
                    m = sectionre2.match( line )
                    if not m:
                        m = sectionre3.match( line )
                if m and line.lower().find("normative") < 0:
                    in_norm_ref_sect = False
                    refType = self.REF_TYPE_INFORMATIVE

            # find references within the section
            if in_ref_sect:
                # If something got split badly, rejoin it.
                if eol.search( line ) and i < len( self.lines ) - 1:
                    line += self.lines[ i + 1 ].lstrip()

                m = idref.search( line )
                if m:
                    draft = m.group( 1 )
                    if draft not in refs:
                        refs[ draft ] = refType

                m = rfcref.search( line )
                if m:
                    ( series, number ) = m.groups()
                    if series.lower()=='std' and std_start.search(line) and i > 15:
                        line = self.lines[i-1].rstrip()+line
                    if series.lower()!='std' or not not_our_std_ref.search( line ):
                        name = series.lower() + number.lstrip( '0' )
                        if name not in refs:
                            refs[ name ] = refType

        # Don't add any references that point back into this doc
        if self.filename in refs:
            del refs[self.filename]

        return refs

    def old_get_refs( self ):
        """Older reference extraction: returns (normrefs, rfcrefs, draftrefs, refs)."""
        refs = []
        normrefs = []
        rfcrefs = []
        draftrefs = []
        refline = None
        for i in range(len(self.lines)-1, 15, -1):
            if re.search(r"(?i)^ *[0-9.]+ *(((normative|informative|informational|non-normative) )?references|references\W+(normative|informative))", self.lines[i]):
                if not '. . .' in self.lines[i] and not '...' in self.lines[i]:
                    refline = i
        if refline:
            for i in range(refline, len(self.lines)):
                line = self.lines[i].strip()
                ref_match = re.search(r"(?i)^\[[a-z0-9.-]+( [a-z0-9.-]+)?\].+", line)
                if ref_match:
                    para = line
                    # Gather the whole (possibly wrapped) reference paragraph.
                    while True:
                        i += 1
                        if i >= len(self.lines):
                            break
                        line = self.lines[i].strip()
                        if not line:
                            break
                        if para[-1] not in ["-", "/"]:
                            para += " "
                        para += line
                    refs += [ para ]
                    rfc_match = re.search(r"(?i)rfc ?\d+", para)
                    if rfc_match:
                        rfcrefs += [ rfc_match.group(0).replace(" ","").lower() ]
                    draft_match = re.search(r"draft-[a-z0-9-]+", para)
                    if draft_match:
                        draft = draft_match.group(0).lower()
                        if not draft in draftrefs:
                            draftrefs += [ draft ]
        normrefs = list(set(normrefs))
        normrefs.sort()
        rfcrefs = list(set(rfcrefs))
        rfcrefs.sort()
        refs = list(set(refs))
        refs.sort()
        return normrefs, rfcrefs, draftrefs, refs

# ----------------------------------------------------------------------
def getmeta(fn):
    # NOTE(review): the remainder of this function lies beyond the recovered
    # portion of the source; only the visible prefix is reproduced here, with
    # the warning message completed minimally.
    # Initial values
    fields = {}
    fields["eventsource"] = "draft"

    if " " in fn or not fn.endswith(".txt"):
        _warn("Skipping unexpected draft name: '%s'" % (fn))
draft name: '%s'" % (fn)) return {} if os.path.exists(fn): filename = fn fn = os.path.basename(fn) else: if fn.lower().startswith('rfc'): filename = os.path.join("/www/tools.ietf.org/rfc", fn) elif not "/" in fn: filename = os.path.join("/www/tools.ietf.org/id", fn) if not os.path.exists(filename): fn = filename while not "-00." in fn: revmatch = re.search(r"-(\d\d)\.", fn) if revmatch: rev = revmatch.group(1) prev = "%02d" % (int(rev)-1) fn = fn.replace("-%s."%rev, "-%s."%prev) if os.path.exists(fn): _warn("Using rev %s instead: '%s'" % (prev, filename)) filename = fn fn = os.path.basename(fn) break else: break else: filename = fn if not os.path.exists(filename): _warn("Could not find file: '%s'" % (filename)) return timestamp = time.strftime("%Y-%m-%dT%H:%M:%S+00:00", time.gmtime(os.stat(filename)[stat.ST_MTIME])) with io.open(filename, 'rb') as file: try: draft = PlaintextDraft(file.read().decode('utf8'), filename) except UnicodeDecodeError: draft = PlaintextDraft(file.read().decode('latin1'), filename) #_debug("\n".join(draft.lines)) fields["eventdate"] = timestamp if draft.filename: fields["doctag"] = draft.filename fields["docrev"] = draft.revision fields["doctitle"] = draft.get_title() fields["docpages"] = str(draft.get_pagecount()) fields["docauthors"] = ", ".join(draft.get_authors()) fields["_authorlist"] = draft.get_author_list() fields["docaffiliations"] = ", ".join(draft.get_authors_with_firm()) if opt_debug: fields["docheader"] = draft._docheader normrefs, rfcrefs, draftrefs, refs = draft.old_get_refs() fields["docrfcrefs"] = ", ".join(rfcrefs) fields["docdraftrefs"] = ", ".join(draftrefs) fields["doccreationdate"] = str(draft.get_creation_date()) deststatus = draft.get_status() if deststatus: fields["docdeststatus"] = deststatus abstract = draft.get_abstract() if abstract: fields["docabstract"] = abstract return fields # ---------------------------------------------------------------------- def _output(docname, fields, outfile=sys.stdout): global 
company_domain if opt_getauthors: # Output an (incomplete!) getauthors-compatible format. # Information about security and iana sections presence is # missing. for full,first,middle,last,suffix,email,country,company in fields["_authorlist"]: if company in company_domain: company = company_domain[company] else: if email and '@' in email: company = email.split('@')[1] if company.endswith(".com"): company = company[:-4] fields["name"] = full fields["email"] = email fields["company"] = company fields["country"] = country or "UNKNOWN" try: year, month, day = fields["doccreationdate"].split("-") except ValueError: year, month, day = "UNKNOWN", "UNKNOWN", "UNKNOWN" fields["day"] = day fields["month"] = month_names[int(month)] if month != "UNKNOWN" else "UNKNOWN" fields["year"] = year print("%(doctag)s:%(name)s:%(company)s:%(email)s:%(country)s:%(docpages)s:%(month)s:%(year)s:%(day)s:" % fields) else: if opt_attributes: def outputkey(key, fields): field = fields[key] if "\n" in field: field = "\n" + field.rstrip() else: field = field.strip() outfile.write("%-24s: %s\n" % ( key, field.replace("\\", "\\\\" ).replace("'", "\\x27" ))) else: def outputkey(key, fields): outfile.write(" %s='%s'" % ( key.lower(), fields[key].strip().replace("\\", "\\\\" ).replace("'", "\\x27" ).replace("\n", "\\n"))) if opt_timestamp: outfile.write("%s " % (fields["eventdate"])) outfile.write("%s" % (os.path.basename(docname.strip()))) keys = list(fields.keys()) keys.sort() for key in keys: if fields[key] and not key in ["eventdate", ] and not key.startswith("_"): outputkey(key, fields) outfile.write("\n") # ---------------------------------------------------------------------- def _printmeta(fn, outfile=sys.stdout): if opt_trace: t = time.time() sys.stderr.write("%-58s" % fn[:-4]) fields = getmeta(fn) if fields: _output(fields.get("doctag", fn[:-7]), fields, outfile) if opt_trace: sys.stderr.write("%5.1f\n" % ((time.time() - t))) # 
---------------------------------------------------------------------- # Main # ---------------------------------------------------------------------- company_domain = {} # type: Dict[str, str] def _main(outfile=sys.stdout): global opt_debug, opt_timestamp, opt_trace, opt_authorinfo, opt_getauthors, files, company_domain, opt_attributes # set default values, if any # ---------------------------------------------------------------------- # Option processing # ---------------------------------------------------------------------- options = "" for line in re.findall(r"\n +(if|elif) +opt in \[(.+)\]:\s+#(.+)\n", io.open(sys.argv[0]).read()): if not options: options += "OPTIONS\n" options += " %-16s %s\n" % (line[1].replace('"', ''), line[2]) options = options.strip() # with ' < 1:' on the next line, this is a no-op: if len(sys.argv) < 1: vars = globals() vars.update(locals()) print(__doc__ % vars) sys.exit(1) try: opts, files = getopt.gnu_getopt(sys.argv[1:], "dhatTv", ["debug", "getauthors", "attribs", "attributes", "help", "timestamp", "notimestamp", "trace", "version",]) except Exception as e: print("%s: %s" % (program, e)) sys.exit(1) # parse options for opt, value in opts: if opt in ["-d", "--debug"]: # Output debug information opt_debug = True elif opt in ["-h", "--help"]: # Output this help text, then exit vars = globals() vars.update(locals()) print(__doc__ % vars) sys.exit(1) elif opt in ["-v", "--version"]: # Output version information, then exit print(program, version) sys.exit(0) elif opt in ["--getauthors"]: # Output an (incomplete) getauthors-compatible format opt_getauthors = True elif opt in ["-a", "--attribs"]: # Output key-value attribute pairs opt_attributes = True elif opt in ["-t", ]: # Toggle leading timestamp information opt_timestamp = not opt_timestamp elif opt in ["--timestamp"]: # Emit leading timestamp information opt_timestamp = True elif opt in ["--notimestamp"]: # Omit leading timestamp information opt_timestamp = False elif opt in ["-T", 
"--trace"]: # Emit trace information while working opt_trace = True company_domain = {} if opt_getauthors: gadata = io.open("/www/tools.ietf.org/tools/getauthors/getauthors.data") for line in gadata: if line.startswith("company:"): try: kword, name, abbrev = line.strip().split(':') company_domain[name] = abbrev except ValueError: pass if not files: files = [ "-" ] for file in files: _debug( "Reading drafts from '%s'" % file) if file == "-": file = sys.stdin elif file.endswith(".gz"): import gzip file = gzip.open(file) else: file = io.open(file) basename = os.path.basename(file.name) if basename.startswith("draft-"): draft = basename _debug( "** Processing '%s'" % draft) _printmeta(file.name, outfile) else: for line in file: draft = line.strip() if draft.startswith("#"): continue if draft: _debug( "** Processing '%s'" % draft) _printmeta(draft, outfile) if __name__ == "__main__": try: _main() except KeyboardInterrupt: raise except Exception as e: if opt_debug: raise else: _err(e)