Refines Bill Fenner's regex-based search through documents for references.

Populates RelatedDocument with reference relations for each Document of type draft.
Replaces these reference relationships with updated copies on draft submission.
Note to deployer: This migration takes around 10 minutes to complete on a fast development laptop.
 - Legacy-Id: 6572
This commit is contained in:
Robert Sparks 2013-10-30 20:51:11 +00:00
parent 8c87d60c51
commit b18249222b
4 changed files with 306894 additions and 1 deletions

File diff suppressed because it is too large Load diff

View file

@ -6,6 +6,8 @@ from django.conf import settings
from ietf.utils import markup_txt
from ietf.doc.models import *
from ietf.utils import draft
def get_state_types(doc):
res = []
@ -290,3 +292,45 @@ def update_telechat(request, doc, by, new_telechat_date, new_returning_item=None
e.desc = "Removed telechat returning item indication"
e.save()
def rebuild_reference_relations(doc):
    """Recreate the reference RelatedDocument rows for a draft document.

    The document text is read from settings.RFC_PATH when the document's
    state slug is 'rfc', otherwise from settings.INTERNET_DRAFT_PATH, and is
    scanned with Draft.get_refs().  Existing 'refnorm', 'refinfo', 'refold'
    and 'refunk' relations whose source is ``doc`` are deleted and replaced
    by relations built from the scan result.

    Returns None when ``doc`` is not of type 'draft'.  Otherwise returns a
    dict that may contain any of these keys (it is empty when there is
    nothing to report):

      'errors'   - list of messages (unreadable file, ambiguous DocAlias)
      'warnings' - list of messages (count of unmatched references)
      'unfound'  - list of reference names with no matching DocAlias
    """
    if doc.type.slug != 'draft':
        return None

    if doc.get_state_slug() == 'rfc':
        filename = os.path.join(settings.RFC_PATH, doc.canonical_name() + ".txt")
    else:
        filename = os.path.join(settings.INTERNET_DRAFT_PATH, doc.filename_with_rev())

    try:
        refs = draft.Draft(draft._gettext(filename), filename).get_refs()
    except IOError as e:
        # Bail out before touching the existing relations: without the text
        # there is nothing to rebuild them from.
        return {'errors': ["%s :%s" % (e.strerror, filename)]}

    doc.relateddocument_set.filter(
        relationship__slug__in=['refnorm', 'refinfo', 'refold', 'refunk']).delete()

    warnings = []
    errors = []
    unfound = set()
    # Only a handful of reference types exist, so look each relationship
    # name up once instead of once per reference.
    relationships = {}

    for ref, refType in refs.items():
        refdoc = DocAlias.objects.filter(name=ref)
        count = refdoc.count()
        if count == 0:
            unfound.add("%s" % ref)
            continue
        if count > 1:
            errors.append("Too many DocAlias objects found for %s" % ref)
            continue
        if refType not in relationships:
            relationships[refType] = DocRelationshipName.objects.get(slug='ref%s' % refType)
        RelatedDocument.objects.get_or_create(
            source=doc, target=refdoc[0], relationship=relationships[refType])

    if unfound:
        warnings.append('There were %d references with no matching DocAlias' % len(unfound))

    ret = {}
    if errors:
        ret['errors'] = errors
    if warnings:
        ret['warnings'] = warnings
    if unfound:
        ret['unfound'] = list(unfound)

    return ret

View file

@ -17,9 +17,10 @@ from ietf.ietfauth.decorators import has_role
from ietf.doc.models import *
from ietf.person.models import Person, Alias, Email
from ietf.doc.utils import add_state_change_event
from ietf.doc.utils import add_state_change_event, rebuild_reference_relations
from ietf.message.models import Message
# Some useful states
UPLOADED = 1
AWAITING_AUTHENTICATION = 4
@ -97,6 +98,8 @@ def perform_post(request, submission):
update_authors(draft, submission)
rebuild_reference_relations(draft)
# new revision event
e = NewRevisionDocEvent(type="new_revision", doc=draft, rev=draft.rev)
e.time = draft.time #submission.submission_date

View file

@ -908,6 +908,80 @@ class Draft():
# ------------------------------------------------------------------
def get_refs(self):
    """Find the documents referenced by this draft's text.

    Scans ``self.lines`` from index 15 onward.  Lines that look like a
    references-section heading set the current reference type; every
    Internet-Draft name and every RFC/STD/BCP/FYI number found on any other
    line is recorded with the type that is current at that point.  All
    references on a line are recorded, not just the first one.

    Returns a dict mapping a lower-cased series name plus number without
    leading zeros (e.g. 'rfc2119'), or a draft name without its revision
    suffix, to one of 'norm', 'info', 'old' or 'unk'.  A later mention of
    the same document overwrites the type recorded by an earlier one.
    """
    refType = 'unk'
    refs = {}
    typemap = {
        'normative': 'norm',
        'informative': 'info',
        'informational': 'info',
        'non-normative': 'info',
        None: 'old',
    }
    # Bill's horrible "references section" regexps, built up over lots of years
    # of fine tuning for different formats.
    # Examples:
    # Appendix A. References:
    # A.1. Informative References:
    sectionre = re.compile(r'(?i)(?:Appendix\s+)?(?:(?:[A-Z]\.)?[0-9.]*\s+)?(?:(\S+)\s*)?references:?$')
    # 9.1 Normative
    sectionre2 = re.compile(r'(?i)(?:(?:[A-Z]\.)?[0-9.]*\s+)?(\S+ormative)$')
    # One other reference section type seen:
    sectionre3 = re.compile(r'(?i)References \((\S+ormative)\)$')
    # An Internet-Draft reference.
    idref = re.compile(r'(?i)\b(draft-(?:[-\w]+(?=-\d\d)|[-\w]+))(-\d\d)?\b')
    # An RFC-and-other-series reference.
    rfcref = re.compile(r'(?i)\b(rfc|std|bcp|fyi)[- ]?(\d+)\b')
    # False positives for std
    not_our_std_ref = re.compile(r'(?i)((\b(n?csc|fed|mil|is-j)-std\b)|(\bieee\s*std\d*\b)|(\bstd\s+802\b))')
    # An Internet-Draft or series reference hyphenated by a well-meaning line break.
    eol = re.compile(r'(?i)\b(draft[-\w]*-|rfc|std|bcp|fyi)$')
    # std at the front of a line can hide things like IEEE STD or MIL-STD
    std_start = re.compile(r'(?i)std\n*\b')

    lines = self.lines
    last_index = len(lines) - 1

    for i in range(15, len(lines)):
        line = lines[i].strip()

        # Section headings only switch the current reference type.
        m = sectionre.match(line)
        if m:
            kind = m.group(1)
            if kind is not None:
                kind = kind.lower()
            # A bare "References" heading has no qualifier (None -> 'old').
            refType = typemap.get(kind, 'unk')
            continue
        m = sectionre2.match(line) or sectionre3.match(line)
        if m:
            refType = typemap.get(m.group(1).lower(), 'unk')
            continue

        # If something got split badly, rejoin it.
        if eol.search(line) and i < last_index:
            line += lines[i + 1].lstrip()

        # Record every draft named on the line and remember where each name
        # sits, so that series-looking text inside a draft name (such as
        # "...-rfc5011-security-...") is not mistaken for an RFC reference.
        draft_spans = []
        for m in idref.finditer(line):
            refs[m.group(1)] = refType
            draft_spans.append(m.span())

        std_context = None
        for m in rfcref.finditer(line):
            start = m.start()
            if any(lo <= start < hi for (lo, hi) in draft_spans):
                continue
            series = m.group(1).lower()
            number = m.group(2)
            if series == 'std':
                if std_context is None:
                    std_context = line
                    if std_start.search(line) and i > 15:
                        # Glue the previous line on so that an "IEEE" or
                        # "MIL-" left at its end can be recognised.
                        std_context = lines[i - 1].rstrip() + line
                if not_our_std_ref.search(std_context):
                    continue
            refs[series + number.lstrip('0')] = refType

    # References to BCP78 and BCP79 in boilerplate will appear as "unk".
    # Remove them.
    for boilerplate in ('bcp78', 'bcp79'):
        if refs.get(boilerplate) == 'unk':
            del refs[boilerplate]

    return refs
def old_get_refs( self ):
refs = []
normrefs = []
rfcrefs = []