fix: Also extract document names from XML seriesInfo attributes and XInclude URLs (#5037)
* fix: Also extract document names from XML seriesInfo attributes

  The old code only looked in the anchor string for the document names of references, which doesn't work if the anchor uses a mnemonic. This caused lots of missed references for many documents.
* No need to import lxml anymore
* Add tests
* Handle xinclude to bibxml URLs
* Wrap line
* Apply suggestion from @rjsparks
* Undo erroneous additions
* Address suggestion from @rjsparks
This commit is contained in:
parent
d96c8f7b75
commit
182158b5c0
|
@ -763,9 +763,11 @@ def rebuild_reference_relations(doc, filenames):
|
||||||
errors = []
|
errors = []
|
||||||
unfound = set()
|
unfound = set()
|
||||||
for ( ref, refType ) in refs.items():
|
for ( ref, refType ) in refs.items():
|
||||||
# As of Dec 2021, DocAlias has a unique constraint on the name field, so count > 1 should not occur
|
|
||||||
refdoc = DocAlias.objects.filter(name=ref)
|
refdoc = DocAlias.objects.filter(name=ref)
|
||||||
|
if not refdoc and re.match(r"^draft-.*-\d{2}$", ref):
|
||||||
|
refdoc = DocAlias.objects.filter(name=ref[:-3])
|
||||||
count = refdoc.count()
|
count = refdoc.count()
|
||||||
|
# As of Dec 2021, DocAlias has a unique constraint on the name field, so count > 1 should not occur
|
||||||
if count == 0:
|
if count == 0:
|
||||||
unfound.add( "%s" % ref )
|
unfound.add( "%s" % ref )
|
||||||
continue
|
continue
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
<?xml version='1.0'?>
|
<?xml version='1.0'?>
|
||||||
<?xml-stylesheet type='text/xsl' href='rfc2629.xslt' ?>
|
<?xml-stylesheet type='text/xsl' href='rfc2629.xslt' ?>
|
||||||
<rfc category="exp" submissionType="independent" ipr="trust200902" docName="draft-test-references-00" version="3">
|
<rfc xmlns:xi="http://www.w3.org/2001/XInclude" category="exp" submissionType="independent" ipr="trust200902" docName="draft-test-references-00" version="3">
|
||||||
<front>
|
<front>
|
||||||
<title>Test Draft with References</title>
|
<title>Test Draft with References</title>
|
||||||
<author fullname="Alfred Person" initials="A." surname="Person" role="editor">
|
<author fullname="Alfred Person" initials="A." surname="Person" role="editor">
|
||||||
|
@ -37,9 +37,25 @@
|
||||||
<seriesInfo name="RFC" value="1"/>
|
<seriesInfo name="RFC" value="1"/>
|
||||||
<seriesInfo name="DOI" value="10.17487/RFC0001"/>
|
<seriesInfo name="DOI" value="10.17487/RFC0001"/>
|
||||||
</reference>
|
</reference>
|
||||||
|
<reference anchor="MNEMONIC" target="https://www.rfc-editor.org/info/rfc2">
|
||||||
|
<front>
|
||||||
|
<title>Cloud Software</title>
|
||||||
|
<author initials="D." surname="Crocker" fullname="D. Crocker">
|
||||||
|
<organization/>
|
||||||
|
</author>
|
||||||
|
<date year="1969" month="April"/>
|
||||||
|
</front>
|
||||||
|
<seriesInfo name="RFC" value="2"/>
|
||||||
|
<seriesInfo name="DOI" value="10.17487/RFC0002"/>
|
||||||
|
</reference>
|
||||||
</references>
|
</references>
|
||||||
<references>
|
<references>
|
||||||
<name>Informative References</name>
|
<name>Informative References</name>
|
||||||
|
<xi:include href='https://xml.resource.org/public/rfc/bibxml3/reference.I-D.ietf-teas-pcecc-use-cases-00.xml'/>
|
||||||
|
<xi:include href='https://xml.resource.org/public/rfc/bibxml3/reference.I-D.ietf-teas-pcecc-use-cases.xml'/>
|
||||||
|
<xi:include href="https://bib.ietf.org/public/rfc/bibxml3/reference.I-D.draft-ietf-sipcore-multiple-reasons-00.xml" />
|
||||||
|
<xi:include href="https://bib.ietf.org/public/rfc/bibxml3/reference.I-D.draft-ietf-sipcore-multiple-reasons.xml" />
|
||||||
|
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml9/reference.BCP.0014.xml" />
|
||||||
<reference anchor='RFC0255' target='https://www.rfc-editor.org/info/rfc255'>
|
<reference anchor='RFC0255' target='https://www.rfc-editor.org/info/rfc255'>
|
||||||
<front>
|
<front>
|
||||||
<title>Status of network hosts</title>
|
<title>Status of network hosts</title>
|
||||||
|
@ -51,6 +67,34 @@
|
||||||
<seriesInfo name='RFC' value='255'/>
|
<seriesInfo name='RFC' value='255'/>
|
||||||
<seriesInfo name='DOI' value='10.17487/RFC0255'/>
|
<seriesInfo name='DOI' value='10.17487/RFC0255'/>
|
||||||
</reference>
|
</reference>
|
||||||
|
<reference anchor="CONSISTENCY">
|
||||||
|
<front>
|
||||||
|
<title>Key Consistency and Discovery</title>
|
||||||
|
<author fullname="Alex Davidson" initials="A." surname="Davidson">
|
||||||
|
<organization>Brave Software</organization>
|
||||||
|
</author>
|
||||||
|
<author fullname="Matthew Finkel" initials="M." surname="Finkel">
|
||||||
|
<organization>The Tor Project</organization>
|
||||||
|
</author>
|
||||||
|
<author fullname="Martin Thomson" initials="M." surname="Thomson">
|
||||||
|
<organization>Mozilla</organization>
|
||||||
|
</author>
|
||||||
|
<author fullname="Christopher A. Wood" initials="C. A." surname="Wood">
|
||||||
|
<organization>Cloudflare</organization>
|
||||||
|
</author>
|
||||||
|
<date day="17" month="August" year="2022"/>
|
||||||
|
<abstract>
|
||||||
|
<t> This document describes the key consistency and correctness
|
||||||
|
requirements of protocols such as Privacy Pass, Oblivious DoH, and
|
||||||
|
Oblivious HTTP for user privacy. It discusses several mechanisms and
|
||||||
|
proposals for enabling user privacy in varying threat models. In
|
||||||
|
concludes with discussion of open problems in this area.
|
||||||
|
|
||||||
|
</t>
|
||||||
|
</abstract>
|
||||||
|
</front>
|
||||||
|
<seriesInfo name="Internet-Draft" value="draft-wood-key-consistency-03"/>
|
||||||
|
</reference>
|
||||||
<referencegroup anchor="bcp6">
|
<referencegroup anchor="bcp6">
|
||||||
<reference anchor='RFC1930' target='https://www.rfc-editor.org/info/rfc1930'>
|
<reference anchor='RFC1930' target='https://www.rfc-editor.org/info/rfc1930'>
|
||||||
<front>
|
<front>
|
||||||
|
|
|
@ -374,10 +374,17 @@ class XMLDraftTests(TestCase):
|
||||||
draft.get_refs(),
|
draft.get_refs(),
|
||||||
{
|
{
|
||||||
'rfc1': XMLDraft.REF_TYPE_NORMATIVE,
|
'rfc1': XMLDraft.REF_TYPE_NORMATIVE,
|
||||||
|
'rfc2': XMLDraft.REF_TYPE_NORMATIVE,
|
||||||
|
'draft-wood-key-consistency-03': XMLDraft.REF_TYPE_INFORMATIVE,
|
||||||
'rfc255': XMLDraft.REF_TYPE_INFORMATIVE,
|
'rfc255': XMLDraft.REF_TYPE_INFORMATIVE,
|
||||||
'bcp6': XMLDraft.REF_TYPE_INFORMATIVE,
|
'bcp6': XMLDraft.REF_TYPE_INFORMATIVE,
|
||||||
|
'bcp14': XMLDraft.REF_TYPE_INFORMATIVE,
|
||||||
'rfc1207': XMLDraft.REF_TYPE_UNKNOWN,
|
'rfc1207': XMLDraft.REF_TYPE_UNKNOWN,
|
||||||
'rfc4086': XMLDraft.REF_TYPE_NORMATIVE,
|
'rfc4086': XMLDraft.REF_TYPE_NORMATIVE,
|
||||||
|
'draft-ietf-teas-pcecc-use-cases-00': XMLDraft.REF_TYPE_INFORMATIVE,
|
||||||
|
'draft-ietf-teas-pcecc-use-cases': XMLDraft.REF_TYPE_INFORMATIVE,
|
||||||
|
'draft-ietf-sipcore-multiple-reasons-00': XMLDraft.REF_TYPE_INFORMATIVE,
|
||||||
|
'draft-ietf-sipcore-multiple-reasons': XMLDraft.REF_TYPE_INFORMATIVE,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
@ -60,17 +60,47 @@ class XMLDraft(Draft):
|
||||||
tree.tree = v2v3.convert2to3()
|
tree.tree = v2v3.convert2to3()
|
||||||
return tree, xml_version
|
return tree, xml_version
|
||||||
|
|
||||||
def _document_name(self, anchor):
|
def _document_name(self, ref):
|
||||||
"""Guess document name from reference anchor
|
"""Get document name from reference."""
|
||||||
|
series = ["rfc", "bcp", "fyi", "std"]
|
||||||
|
# handle xinclude first
|
||||||
|
# FIXME: this assumes the xinclude is a bibxml href; if it isn't, there can
|
||||||
|
# still be false negatives. it would be better to expand the xinclude and parse
|
||||||
|
# its seriesInfo.
|
||||||
|
if ref.tag.endswith("}include"):
|
||||||
|
name = re.search(
|
||||||
|
rf"reference\.({'|'.join(series).upper()})\.(\d{{4}})\.xml",
|
||||||
|
ref.attrib["href"],
|
||||||
|
)
|
||||||
|
if name:
|
||||||
|
return f"{name.group(1)}{int(name.group(2))}".lower()
|
||||||
|
name = re.search(
|
||||||
|
r"reference\.I-D\.(?:draft-)?(.*)\.xml", ref.attrib["href"]
|
||||||
|
)
|
||||||
|
if name:
|
||||||
|
return f"draft-{name.group(1)}"
|
||||||
|
# can't extract the name, give up
|
||||||
|
return ""
|
||||||
|
|
||||||
Looks for series numbers and removes leading 0s from the number.
|
# check the anchor next
|
||||||
"""
|
anchor = ref.get("anchor").lower() # always give back lowercase
|
||||||
anchor = anchor.lower() # always give back lowercase
|
label = anchor.rstrip("0123456789") # remove trailing digits
|
||||||
label = anchor.rstrip('0123456789') # remove trailing digits
|
if label in series:
|
||||||
if label in ['rfc', 'bcp', 'fyi', 'std']:
|
|
||||||
number = int(anchor[len(label) :])
|
number = int(anchor[len(label) :])
|
||||||
return f'{label}{number}'
|
return f"{label}{number}"
|
||||||
return anchor
|
|
||||||
|
# if we couldn't find a match so far, try the seriesInfo
|
||||||
|
series_query = " or ".join(f"@name='{x.upper()}'" for x in series)
|
||||||
|
for info in ref.xpath(
|
||||||
|
f"./seriesInfo[{series_query} or @name='Internet-Draft']"
|
||||||
|
):
|
||||||
|
if not info.attrib["value"]:
|
||||||
|
continue
|
||||||
|
if info.attrib["name"] == "Internet-Draft":
|
||||||
|
return info.attrib["value"]
|
||||||
|
else:
|
||||||
|
return f'{info.attrib["name"].lower()}{info.attrib["value"]}'
|
||||||
|
return ""
|
||||||
|
|
||||||
def _reference_section_type(self, section_name):
|
def _reference_section_type(self, section_name):
|
||||||
"""Determine reference type from name of references section"""
|
"""Determine reference type from name of references section"""
|
||||||
|
@ -154,10 +184,20 @@ class XMLDraft(Draft):
|
||||||
"""Extract references from the draft"""
|
"""Extract references from the draft"""
|
||||||
refs = {}
|
refs = {}
|
||||||
# accept nested <references> sections
|
# accept nested <references> sections
|
||||||
for section in self.xmlroot.findall('back//references'):
|
for section in self.xmlroot.findall("back//references"):
|
||||||
ref_type = self._reference_section_type(self._reference_section_name(section))
|
ref_type = self._reference_section_type(
|
||||||
for ref in (section.findall('./reference') + section.findall('./referencegroup')):
|
self._reference_section_name(section)
|
||||||
refs[self._document_name(ref.get('anchor'))] = ref_type
|
)
|
||||||
|
for ref in (
|
||||||
|
section.findall("./reference")
|
||||||
|
+ section.findall("./referencegroup")
|
||||||
|
+ section.findall(
|
||||||
|
"./xi:include", {"xi": "http://www.w3.org/2001/XInclude"}
|
||||||
|
)
|
||||||
|
):
|
||||||
|
name = self._document_name(ref)
|
||||||
|
if name:
|
||||||
|
refs[name] = ref_type
|
||||||
return refs
|
return refs
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue