fix: Also extract document names from XML seriesInfo attributes and XInclude URLs (#5037)

* fix: Also extract document names from XML seriesInfo attributes

The old code only looked in the anchor string for the document names of
references, which doesn't work if the anchor uses a mnemonic. This caused lots
of missed references for many documents.

* No need to import lxml anymore

* Add tests

* Handle xinclude to bibxml URLs

* Wrap line

* Apply suggestion from @rjsparks

* Undo erroneous additions

* Address suggestion from @rjsparks
This commit is contained in:
Lars Eggert 2023-02-15 01:07:54 +02:00 committed by GitHub
parent d96c8f7b75
commit 182158b5c0
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 111 additions and 18 deletions

View file

@ -763,9 +763,11 @@ def rebuild_reference_relations(doc, filenames):
errors = []
unfound = set()
for ( ref, refType ) in refs.items():
# As of Dec 2021, DocAlias has a unique constraint on the name field, so count > 1 should not occur
refdoc = DocAlias.objects.filter( name=ref )
refdoc = DocAlias.objects.filter(name=ref)
if not refdoc and re.match(r"^draft-.*-\d{2}$", ref):
refdoc = DocAlias.objects.filter(name=ref[:-3])
count = refdoc.count()
# As of Dec 2021, DocAlias has a unique constraint on the name field, so count > 1 should not occur
if count == 0:
unfound.add( "%s" % ref )
continue

View file

@ -1,6 +1,6 @@
<?xml version='1.0'?>
<?xml-stylesheet type='text/xsl' href='rfc2629.xslt' ?>
<rfc category="exp" submissionType="independent" ipr="trust200902" docName="draft-test-references-00" version="3">
<rfc xmlns:xi="http://www.w3.org/2001/XInclude" category="exp" submissionType="independent" ipr="trust200902" docName="draft-test-references-00" version="3">
<front>
<title>Test Draft with References</title>
<author fullname="Alfred Person" initials="A." surname="Person" role="editor">
@ -37,9 +37,25 @@
<seriesInfo name="RFC" value="1"/>
<seriesInfo name="DOI" value="10.17487/RFC0001"/>
</reference>
<reference anchor="MNEMONIC" target="https://www.rfc-editor.org/info/rfc2">
<front>
<title>Cloud Software</title>
<author initials="D." surname="Crocker" fullname="D. Crocker">
<organization/>
</author>
<date year="1969" month="April"/>
</front>
<seriesInfo name="RFC" value="2"/>
<seriesInfo name="DOI" value="10.17487/RFC0002"/>
</reference>
</references>
<references>
<name>Informative References</name>
<xi:include href='https://xml.resource.org/public/rfc/bibxml3/reference.I-D.ietf-teas-pcecc-use-cases-00.xml'/>
<xi:include href='https://xml.resource.org/public/rfc/bibxml3/reference.I-D.ietf-teas-pcecc-use-cases.xml'/>
<xi:include href="https://bib.ietf.org/public/rfc/bibxml3/reference.I-D.draft-ietf-sipcore-multiple-reasons-00.xml" />
<xi:include href="https://bib.ietf.org/public/rfc/bibxml3/reference.I-D.draft-ietf-sipcore-multiple-reasons.xml" />
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml9/reference.BCP.0014.xml" />
<reference anchor='RFC0255' target='https://www.rfc-editor.org/info/rfc255'>
<front>
<title>Status of network hosts</title>
@ -51,6 +67,34 @@
<seriesInfo name='RFC' value='255'/>
<seriesInfo name='DOI' value='10.17487/RFC0255'/>
</reference>
<reference anchor="CONSISTENCY">
<front>
<title>Key Consistency and Discovery</title>
<author fullname="Alex Davidson" initials="A." surname="Davidson">
<organization>Brave Software</organization>
</author>
<author fullname="Matthew Finkel" initials="M." surname="Finkel">
<organization>The Tor Project</organization>
</author>
<author fullname="Martin Thomson" initials="M." surname="Thomson">
<organization>Mozilla</organization>
</author>
<author fullname="Christopher A. Wood" initials="C. A." surname="Wood">
<organization>Cloudflare</organization>
</author>
<date day="17" month="August" year="2022"/>
<abstract>
<t> This document describes the key consistency and correctness
requirements of protocols such as Privacy Pass, Oblivious DoH, and
Oblivious HTTP for user privacy. It discusses several mechanisms and
proposals for enabling user privacy in varying threat models. It
concludes with discussion of open problems in this area.
</t>
</abstract>
</front>
<seriesInfo name="Internet-Draft" value="draft-wood-key-consistency-03"/>
</reference>
<referencegroup anchor="bcp6">
<reference anchor='RFC1930' target='https://www.rfc-editor.org/info/rfc1930'>
<front>
@ -191,4 +235,4 @@
</reference>
</references>
</back>
</rfc>
</rfc>

View file

@ -374,10 +374,17 @@ class XMLDraftTests(TestCase):
draft.get_refs(),
{
'rfc1': XMLDraft.REF_TYPE_NORMATIVE,
'rfc2': XMLDraft.REF_TYPE_NORMATIVE,
'draft-wood-key-consistency-03': XMLDraft.REF_TYPE_INFORMATIVE,
'rfc255': XMLDraft.REF_TYPE_INFORMATIVE,
'bcp6': XMLDraft.REF_TYPE_INFORMATIVE,
'bcp14': XMLDraft.REF_TYPE_INFORMATIVE,
'rfc1207': XMLDraft.REF_TYPE_UNKNOWN,
'rfc4086': XMLDraft.REF_TYPE_NORMATIVE,
'draft-ietf-teas-pcecc-use-cases-00': XMLDraft.REF_TYPE_INFORMATIVE,
'draft-ietf-teas-pcecc-use-cases': XMLDraft.REF_TYPE_INFORMATIVE,
'draft-ietf-sipcore-multiple-reasons-00': XMLDraft.REF_TYPE_INFORMATIVE,
'draft-ietf-sipcore-multiple-reasons': XMLDraft.REF_TYPE_INFORMATIVE,
}
)

View file

@ -60,17 +60,47 @@ class XMLDraft(Draft):
tree.tree = v2v3.convert2to3()
return tree, xml_version
def _document_name(self, ref):
    """Get document name from reference.

    ``ref`` is an XML element: a ``<reference>``, a ``<referencegroup>`` or an
    XInclude ``<include>`` element. The name is taken from the first of these
    sources that yields one:

    1. For an XInclude, the ``href`` attribute, when it looks like a bibxml
       file name (``reference.RFC.0001.xml``, ``reference.I-D.foo.xml``).
    2. The ``anchor`` attribute, when it is a series label (rfc, bcp, fyi,
       std) followed by a number; leading zeros are removed from the number.
    3. The first direct ``<seriesInfo>`` child that has a non-empty ``value``
       and whose ``name`` is one of the series or ``Internet-Draft``.

    Returns the document name, or an empty string when none can be extracted.
    """
    series = ["rfc", "bcp", "fyi", "std"]

    # handle xinclude first
    # FIXME: this assumes the xinclude is a bibxml href; if it isn't, there can
    # still be false negatives. it would be better to expand the xinclude and parse
    # its seriesInfo.
    if ref.tag.endswith("}include"):
        # an include without an href simply cannot match anything below
        href = ref.get("href", "")
        name = re.search(
            rf"reference\.({'|'.join(series).upper()})\.(\d{{4}})\.xml",
            href,
        )
        if name:
            return f"{name.group(1)}{int(name.group(2))}".lower()
        name = re.search(r"reference\.I-D\.(?:draft-)?(.*)\.xml", href)
        if name:
            return f"draft-{name.group(1)}"
        # can't extract the name, give up
        return ""

    # check the anchor next
    anchor = (ref.get("anchor") or "").lower()  # always give back lowercase
    label = anchor.rstrip("0123456789")  # remove trailing digits
    digits = anchor[len(label):]
    # an anchor that is only the series label (e.g. "RFC") has no digits;
    # calling int("") on it would raise, so fall through to seriesInfo instead
    if label in series and digits:
        return f"{label}{int(digits)}"

    # if we couldn't find a match so far, try the seriesInfo
    wanted = {x.upper() for x in series} | {"Internet-Draft"}
    for info in ref.findall("./seriesInfo"):
        info_name = info.get("name")
        value = info.get("value")
        if info_name not in wanted or not value:
            continue
        if info_name == "Internet-Draft":
            return value
        return f"{info_name.lower()}{value}"
    return ""
def _reference_section_type(self, section_name):
"""Determine reference type from name of references section"""
@ -154,10 +184,20 @@ class XMLDraft(Draft):
"""Extract references from the draft"""
refs = {}
# accept nested <references> sections
for section in self.xmlroot.findall('back//references'):
ref_type = self._reference_section_type(self._reference_section_name(section))
for ref in (section.findall('./reference') + section.findall('./referencegroup')):
refs[self._document_name(ref.get('anchor'))] = ref_type
for section in self.xmlroot.findall("back//references"):
ref_type = self._reference_section_type(
self._reference_section_name(section)
)
for ref in (
section.findall("./reference")
+ section.findall("./referencegroup")
+ section.findall(
"./xi:include", {"xi": "http://www.w3.org/2001/XInclude"}
)
):
name = self._document_name(ref)
if name:
refs[name] = ref_type
return refs