fix: Also extract document names from XML seriesInfo attributes and XInclude URLs (#5037)
* fix: Also extract document names from XML seriesInfo attributes

  The old code only looked in the anchor string for the document names of references, which doesn't work if the anchor uses a mnemonic. This caused lots of missed references for many documents.

* No need to import lxml anymore
* Add tests
* Handle xinclude to bibxml URLs
* Wrap line
* Apply suggestion from @rjsparks
* Undo erroneous additions
* Address suggestion from @rjsparks
This commit is contained in:
parent
d96c8f7b75
commit
182158b5c0
|
@ -763,9 +763,11 @@ def rebuild_reference_relations(doc, filenames):
|
|||
errors = []
|
||||
unfound = set()
|
||||
for ( ref, refType ) in refs.items():
|
||||
# As of Dec 2021, DocAlias has a unique constraint on the name field, so count > 1 should not occur
|
||||
refdoc = DocAlias.objects.filter( name=ref )
|
||||
refdoc = DocAlias.objects.filter(name=ref)
|
||||
if not refdoc and re.match(r"^draft-.*-\d{2}$", ref):
|
||||
refdoc = DocAlias.objects.filter(name=ref[:-3])
|
||||
count = refdoc.count()
|
||||
# As of Dec 2021, DocAlias has a unique constraint on the name field, so count > 1 should not occur
|
||||
if count == 0:
|
||||
unfound.add( "%s" % ref )
|
||||
continue
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
<?xml version='1.0'?>
|
||||
<?xml-stylesheet type='text/xsl' href='rfc2629.xslt' ?>
|
||||
<rfc category="exp" submissionType="independent" ipr="trust200902" docName="draft-test-references-00" version="3">
|
||||
<rfc xmlns:xi="http://www.w3.org/2001/XInclude" category="exp" submissionType="independent" ipr="trust200902" docName="draft-test-references-00" version="3">
|
||||
<front>
|
||||
<title>Test Draft with References</title>
|
||||
<author fullname="Alfred Person" initials="A." surname="Person" role="editor">
|
||||
|
@ -37,9 +37,25 @@
|
|||
<seriesInfo name="RFC" value="1"/>
|
||||
<seriesInfo name="DOI" value="10.17487/RFC0001"/>
|
||||
</reference>
|
||||
<reference anchor="MNEMONIC" target="https://www.rfc-editor.org/info/rfc2">
|
||||
<front>
|
||||
<title>Cloud Software</title>
|
||||
<author initials="D." surname="Crocker" fullname="D. Crocker">
|
||||
<organization/>
|
||||
</author>
|
||||
<date year="1969" month="April"/>
|
||||
</front>
|
||||
<seriesInfo name="RFC" value="2"/>
|
||||
<seriesInfo name="DOI" value="10.17487/RFC0002"/>
|
||||
</reference>
|
||||
</references>
|
||||
<references>
|
||||
<name>Informative References</name>
|
||||
<xi:include href='https://xml.resource.org/public/rfc/bibxml3/reference.I-D.ietf-teas-pcecc-use-cases-00.xml'/>
|
||||
<xi:include href='https://xml.resource.org/public/rfc/bibxml3/reference.I-D.ietf-teas-pcecc-use-cases.xml'/>
|
||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml3/reference.I-D.draft-ietf-sipcore-multiple-reasons-00.xml" />
|
||||
<xi:include href="https://bib.ietf.org/public/rfc/bibxml3/reference.I-D.draft-ietf-sipcore-multiple-reasons.xml" />
|
||||
<xi:include href="https://xml2rfc.ietf.org/public/rfc/bibxml9/reference.BCP.0014.xml" />
|
||||
<reference anchor='RFC0255' target='https://www.rfc-editor.org/info/rfc255'>
|
||||
<front>
|
||||
<title>Status of network hosts</title>
|
||||
|
@ -51,6 +67,34 @@
|
|||
<seriesInfo name='RFC' value='255'/>
|
||||
<seriesInfo name='DOI' value='10.17487/RFC0255'/>
|
||||
</reference>
|
||||
<reference anchor="CONSISTENCY">
|
||||
<front>
|
||||
<title>Key Consistency and Discovery</title>
|
||||
<author fullname="Alex Davidson" initials="A." surname="Davidson">
|
||||
<organization>Brave Software</organization>
|
||||
</author>
|
||||
<author fullname="Matthew Finkel" initials="M." surname="Finkel">
|
||||
<organization>The Tor Project</organization>
|
||||
</author>
|
||||
<author fullname="Martin Thomson" initials="M." surname="Thomson">
|
||||
<organization>Mozilla</organization>
|
||||
</author>
|
||||
<author fullname="Christopher A. Wood" initials="C. A." surname="Wood">
|
||||
<organization>Cloudflare</organization>
|
||||
</author>
|
||||
<date day="17" month="August" year="2022"/>
|
||||
<abstract>
|
||||
<t> This document describes the key consistency and correctness
|
||||
requirements of protocols such as Privacy Pass, Oblivious DoH, and
|
||||
Oblivious HTTP for user privacy. It discusses several mechanisms and
|
||||
proposals for enabling user privacy in varying threat models. In
|
||||
concludes with discussion of open problems in this area.
|
||||
|
||||
</t>
|
||||
</abstract>
|
||||
</front>
|
||||
<seriesInfo name="Internet-Draft" value="draft-wood-key-consistency-03"/>
|
||||
</reference>
|
||||
<referencegroup anchor="bcp6">
|
||||
<reference anchor='RFC1930' target='https://www.rfc-editor.org/info/rfc1930'>
|
||||
<front>
|
||||
|
@ -191,4 +235,4 @@
|
|||
</reference>
|
||||
</references>
|
||||
</back>
|
||||
</rfc>
|
||||
</rfc>
|
||||
|
|
|
@ -374,10 +374,17 @@ class XMLDraftTests(TestCase):
|
|||
draft.get_refs(),
|
||||
{
|
||||
'rfc1': XMLDraft.REF_TYPE_NORMATIVE,
|
||||
'rfc2': XMLDraft.REF_TYPE_NORMATIVE,
|
||||
'draft-wood-key-consistency-03': XMLDraft.REF_TYPE_INFORMATIVE,
|
||||
'rfc255': XMLDraft.REF_TYPE_INFORMATIVE,
|
||||
'bcp6': XMLDraft.REF_TYPE_INFORMATIVE,
|
||||
'bcp14': XMLDraft.REF_TYPE_INFORMATIVE,
|
||||
'rfc1207': XMLDraft.REF_TYPE_UNKNOWN,
|
||||
'rfc4086': XMLDraft.REF_TYPE_NORMATIVE,
|
||||
'draft-ietf-teas-pcecc-use-cases-00': XMLDraft.REF_TYPE_INFORMATIVE,
|
||||
'draft-ietf-teas-pcecc-use-cases': XMLDraft.REF_TYPE_INFORMATIVE,
|
||||
'draft-ietf-sipcore-multiple-reasons-00': XMLDraft.REF_TYPE_INFORMATIVE,
|
||||
'draft-ietf-sipcore-multiple-reasons': XMLDraft.REF_TYPE_INFORMATIVE,
|
||||
}
|
||||
)
|
||||
|
||||
|
|
|
@ -60,17 +60,47 @@ class XMLDraft(Draft):
|
|||
tree.tree = v2v3.convert2to3()
|
||||
return tree, xml_version
|
||||
|
||||
def _document_name(self, anchor):
|
||||
"""Guess document name from reference anchor
|
||||
def _document_name(self, ref):
|
||||
"""Get document name from reference."""
|
||||
series = ["rfc", "bcp", "fyi", "std"]
|
||||
# handle xinclude first
|
||||
# FIXME: this assumes the xinclude is a bibxml href; if it isn't, there can
|
||||
# still be false negatives. it would be better to expand the xinclude and parse
|
||||
# its seriesInfo.
|
||||
if ref.tag.endswith("}include"):
|
||||
name = re.search(
|
||||
rf"reference\.({'|'.join(series).upper()})\.(\d{{4}})\.xml",
|
||||
ref.attrib["href"],
|
||||
)
|
||||
if name:
|
||||
return f"{name.group(1)}{int(name.group(2))}".lower()
|
||||
name = re.search(
|
||||
r"reference\.I-D\.(?:draft-)?(.*)\.xml", ref.attrib["href"]
|
||||
)
|
||||
if name:
|
||||
return f"draft-{name.group(1)}"
|
||||
# can't extract the name, give up
|
||||
return ""
|
||||
|
||||
Looks for series numbers and removes leading 0s from the number.
|
||||
"""
|
||||
anchor = anchor.lower() # always give back lowercase
|
||||
label = anchor.rstrip('0123456789') # remove trailing digits
|
||||
if label in ['rfc', 'bcp', 'fyi', 'std']:
|
||||
number = int(anchor[len(label):])
|
||||
return f'{label}{number}'
|
||||
return anchor
|
||||
# check the anchor next
|
||||
anchor = ref.get("anchor").lower() # always give back lowercase
|
||||
label = anchor.rstrip("0123456789") # remove trailing digits
|
||||
if label in series:
|
||||
number = int(anchor[len(label) :])
|
||||
return f"{label}{number}"
|
||||
|
||||
# if we couldn't find a match so far, try the seriesInfo
|
||||
series_query = " or ".join(f"@name='{x.upper()}'" for x in series)
|
||||
for info in ref.xpath(
|
||||
f"./seriesInfo[{series_query} or @name='Internet-Draft']"
|
||||
):
|
||||
if not info.attrib["value"]:
|
||||
continue
|
||||
if info.attrib["name"] == "Internet-Draft":
|
||||
return info.attrib["value"]
|
||||
else:
|
||||
return f'{info.attrib["name"].lower()}{info.attrib["value"]}'
|
||||
return ""
|
||||
|
||||
def _reference_section_type(self, section_name):
|
||||
"""Determine reference type from name of references section"""
|
||||
|
@ -154,10 +184,20 @@ class XMLDraft(Draft):
|
|||
"""Extract references from the draft"""
|
||||
refs = {}
|
||||
# accept nested <references> sections
|
||||
for section in self.xmlroot.findall('back//references'):
|
||||
ref_type = self._reference_section_type(self._reference_section_name(section))
|
||||
for ref in (section.findall('./reference') + section.findall('./referencegroup')):
|
||||
refs[self._document_name(ref.get('anchor'))] = ref_type
|
||||
for section in self.xmlroot.findall("back//references"):
|
||||
ref_type = self._reference_section_type(
|
||||
self._reference_section_name(section)
|
||||
)
|
||||
for ref in (
|
||||
section.findall("./reference")
|
||||
+ section.findall("./referencegroup")
|
||||
+ section.findall(
|
||||
"./xi:include", {"xi": "http://www.w3.org/2001/XInclude"}
|
||||
)
|
||||
):
|
||||
name = self._document_name(ref)
|
||||
if name:
|
||||
refs[name] = ref_type
|
||||
return refs
|
||||
|
||||
|
||||
|
|
Loading…
Reference in a new issue