Merged in [14851] from housley@vigilsec.com:

Improve parser for references in Internet-Drafts. Fixes 
 - Legacy-Id: 14867
Note: SVN reference [14851] has been migrated to Git commit 565b10e00e
This commit is contained in:
Henrik Levkowetz 2018-03-19 22:37:03 +00:00
commit 75deb35e10

View file

@ -1019,79 +1019,103 @@ class Draft():
# ------------------------------------------------------------------ # ------------------------------------------------------------------
def get_refs(self):
    """Collect the documents cited in this draft's references section(s).

    Lines of ``self.lines`` from index 15 onward are examined.  Nothing is
    collected until a references heading is seen; from that heading on,
    every line (including the heading itself and everything after the
    references section, since no end-of-section is detected) is searched
    for at most one Internet-Draft name and at most one RFC/STD/BCP/FYI
    citation.

    Each heading sets the classification for the lines that follow it:
    ``'norm'`` when the heading mentions "normative" (but not
    "non-normative"), ``'info'`` for any other references heading.

    Returns:
        dict mapping reference name to ``'norm'`` or ``'info'``.  Series
        names are lower-cased with leading zeros stripped from the number
        (``'rfc2119'``, ``'bcp14'``); draft names are stored as matched,
        without the ``-NN`` revision suffix.  The first classification
        recorded for a name wins.  An entry equal to ``self.filename`` is
        removed so the document never lists itself.
    """
    # Bill's horrible "references section" regexps, built up over lots of years
    # of fine tuning for different formats.
    # Examples:
    # Appendix A. References:
    # A.1. Informative References:
    sectionre = re.compile( r'(?i)(?:Appendix\s+)?(?:(?:[A-Z]\.)?[0-9.]*\s+)?(?:(\S+)\s*)?references:?$' )
    # 9.1 Normative
    sectionre2 = re.compile( r'(?i)(?:(?:[A-Z]\.)?[0-9.]*\s+)?(\S+ormative)$' )
    # One other reference section type seen:
    sectionre3 = re.compile( r'(?i)References \((\S+ormative)\)$' )
    # An Internet-Draft reference.
    idref = re.compile( r'(?i)\b(draft-(?:[-\w]+(?=-\d\d)|[-\w]+))(-\d\d)?\b' )
    # An RFC-and-other-series reference.
    rfcref = re.compile( r'(?i)\b(rfc|std|bcp|fyi)[- ]?(\d+)\b' )
    # False positives for std
    not_our_std_ref = re.compile( r'(?i)((\b(n?csc|fed|mil|is-j)-std\b)|(\bieee\s*std\d*\b)|(\bstd\s+802\b))' )
    # An Internet-Draft or series reference hyphenated by a well-meaning line break.
    eol = re.compile( r'(?i)\b(draft[-\w]*-|rfc|std|bcp|fyi)$' )
    # std at the front of a line can hide things like IEEE STD or MIL-STD
    std_start = re.compile( r'(?i)std\n*\b' )

    refs = {}
    in_ref_sect = False
    ref_type = 'unk'    # never stored: replaced by the first heading seen
    line_count = len( self.lines )

    # NOTE(review): starting at index 15 presumably skips the front-page
    # header block -- confirm against how self.lines is populated.
    for i in range( 15, line_count ):
        line = self.lines[ i ].strip()

        # A references heading (top-level or subsection) opens the section
        # and (re)sets the classification for everything that follows.
        heading = ( sectionre.match( line )
                    or sectionre2.match( line )
                    or sectionre3.match( line ) )
        if heading:
            in_ref_sect = True
            lowered = line.lower()
            # "Non-normative References" contains "normative" as a substring
            # but names an informative section, so exclude it explicitly.
            # A plain substring test also accepts un-numbered headings where
            # "normative" sits at the very start of the line.
            if 'normative' in lowered and 'non-normative' not in lowered:
                ref_type = 'norm'
            else:
                ref_type = 'info'

        # skip over lines until we find the start of the reference section
        if not in_ref_sect:
            continue

        # If something got split badly, rejoin it.
        if eol.search( line ) and i < line_count - 1:
            line += self.lines[ i + 1 ].lstrip()

        m = idref.search( line )
        if m:
            draft = m.group( 1 )
            if draft not in refs:
                refs[ draft ] = ref_type

        m = rfcref.search( line )
        if m:
            ( series, number ) = m.groups()
            series = series.lower()
            # Pull in the previous line so a wrapped "IEEE\nSTD ..." or
            # "MIL-\nSTD ..." can be recognised as a false positive below.
            if series == 'std' and std_start.search( line ) and i > 15:
                line = self.lines[ i - 1 ].rstrip() + line
            if series != 'std' or not not_our_std_ref.search( line ):
                name = series + number.lstrip( '0' )
                if name not in refs:
                    refs[ name ] = ref_type

    # Don't add any references that point back into this doc
    refs.pop( self.filename, None )

    return refs
def old_get_refs( self ): def old_get_refs( self ):
refs = [] refs = []