datatracker/ietf/utils/xmldraft.py
Jennifer Richards 5a2708283b
feat: Extract document creation date from XML draft (#5733)
* fix: Extract document creation date from XML draft

* test: Fix test
2023-06-01 09:58:55 -05:00

224 lines
8.2 KiB
Python

# Copyright The IETF Trust 2022, All Rights Reserved
# -*- coding: utf-8 -*-
import io
import re
import xml2rfc
import debug # pyflakes: ignore
from contextlib import ExitStack
from .draft import Draft
class XMLDraft(Draft):
    """Draft from XML source

    Not all methods from the superclass are implemented yet.
    """

    def __init__(self, xml_file):
        """Initialize XMLDraft instance

        :parameter xml_file: path to file containing XML source
        :raises XMLParseError: if the XML parser rejects the file
        :raises ValueError: if the root element's docName is missing or unparseable
        """
        super().__init__()
        # cast xml_file to str so, e.g., this will work with a Path
        self.xmltree, self.xml_version = self.parse_xml(str(xml_file))
        self.xmlroot = self.xmltree.getroot()
        self.filename, self.revision = self._parse_docname()

    @staticmethod
    def parse_xml(filename):
        """Parse XML draft

        Converts to xml2rfc v3 schema if the source declares (or defaults to)
        version 2, then returns the parsed tree and the original xml version.

        While parsing, xml2rfc's log writers are pointed at in-memory buffers so
        the parser's messages can be attached to the XMLParseError raised on failure.

        :parameter filename: path of the XML file, as a str
        :returns: tuple of (tree, xml_version); call getroot() on the tree for the root
        :raises XMLParseError: if parser.parse() raises; the original exception is chained
        """
        orig_write_out = xml2rfc.log.write_out
        orig_write_err = xml2rfc.log.write_err
        parser_out = io.StringIO()
        parser_err = io.StringIO()

        with ExitStack() as stack:
            @stack.callback
            def cleanup():  # called when context exited, even if there's an exception
                xml2rfc.log.write_out = orig_write_out
                xml2rfc.log.write_err = orig_write_err

            xml2rfc.log.write_out = parser_out
            xml2rfc.log.write_err = parser_err

            parser = xml2rfc.XmlRfcParser(filename, quiet=True)
            try:
                tree = parser.parse()
            except Exception as e:
                raise XMLParseError(parser_out.getvalue(), parser_err.getvalue()) from e

            # a missing version attribute is treated as v2
            xml_version = tree.getroot().get('version', '2')
            if xml_version == '2':
                v2v3 = xml2rfc.V2v3XmlWriter(tree)
                tree.tree = v2v3.convert2to3()
        return tree, xml_version

    def _document_name(self, ref):
        """Get document name from reference.

        Tries, in order: the href of an xinclude element, the anchor attribute,
        and finally the seriesInfo children of the reference.

        :parameter ref: a reference, referencegroup, or xi:include element
        :returns: lowercase series name such as "rfc2119", a draft name, or ""
            if no name can be determined
        """
        series = ["rfc", "bcp", "fyi", "std"]

        # handle xinclude first
        # FIXME: this assumes the xinclude is a bibxml href; if it isn't, there can
        # still be false negatives. it would be better to expand the xinclude and parse
        # its seriesInfo.
        if ref.tag.endswith("}include"):
            # a missing href is treated like one we cannot interpret
            href = ref.get("href", "")
            name = re.search(
                rf"reference\.({'|'.join(series).upper()})\.(\d{{4}})\.xml",
                href,
            )
            if name:
                # int() drops the zero padding used in bibxml file names
                return f"{name.group(1)}{int(name.group(2))}".lower()
            name = re.search(r"reference\.I-D\.(?:draft-)?(.*)\.xml", href)
            if name:
                return f"draft-{name.group(1)}"
            # can't extract the name, give up
            return ""

        # check the anchor next
        anchor = (ref.get("anchor") or "").lower()  # always give back lowercase
        label = anchor.rstrip("0123456789")  # remove trailing digits
        digits = anchor[len(label):]
        # an anchor that is only a series label (e.g., "RFC") carries no number,
        # so it cannot name a document; fall through to seriesInfo in that case
        if label in series and digits:
            return f"{label}{int(digits)}"

        # if we couldn't find a match so far, try the seriesInfo
        series_query = " or ".join(f"@name='{x.upper()}'" for x in series)
        for info in ref.xpath(
            f"./seriesInfo[{series_query} or @name='Internet-Draft']"
        ):
            value = info.get("value")
            if not value:
                continue  # absent or empty value, try the next seriesInfo
            series_name = info.get("name")  # always present, the xpath matched on it
            if series_name == "Internet-Draft":
                return value
            return f"{series_name.lower()}{value}"
        return ""

    def _reference_section_type(self, section_name):
        """Determine reference type from name of references section

        The match is a case-insensitive substring test; 'normative' is checked
        before 'informative'.

        :parameter section_name: name of the section, may be None or empty
        :returns: one of self.REF_TYPE_NORMATIVE, self.REF_TYPE_INFORMATIVE,
            or self.REF_TYPE_UNKNOWN
        """
        if section_name:
            section_name = section_name.lower()
            if 'normative' in section_name:
                return self.REF_TYPE_NORMATIVE
            elif 'informative' in section_name:
                return self.REF_TYPE_INFORMATIVE
        return self.REF_TYPE_UNKNOWN

    def _reference_section_name(self, section_elt):
        """Get the name of a references section

        :parameter section_elt: a references element
        :returns: text of the name child, else the title attribute, else None
        """
        section_name = section_elt.findtext('name')
        if section_name is None and 'title' in section_elt.keys():
            section_name = section_elt.get('title')  # fall back to title if we have it
        return section_name

    def _parse_docname(self):
        """Split the root element's docName into file name and revision

        :returns: tuple of (filename, rev); rev is None when docName does not end
            in a dash followed by exactly two digits
        :raises ValueError: if docName is missing or cannot be parsed
        """
        docname = self.xmlroot.attrib.get('docName')
        if docname is None:
            # re.match() would raise TypeError on None; report this the same
            # way as any other unusable docName
            raise ValueError('Missing docName attribute in the XML root element')
        revmatch = re.match(
            r'^(?P<filename>.+?)(?:-(?P<rev>[0-9][0-9]))?$',
            docname,
        )
        if revmatch is None:
            raise ValueError('Unable to parse docName')
        # If a group had no match it is None
        return revmatch.group('filename'), revmatch.group('rev')

    def get_title(self):
        """Get the text of front/title with surrounding whitespace removed

        NOTE(review): if front/title is absent, findtext() gives None and this fails
        with AttributeError; left as is because callers may rely on it.
        """
        return self.xmlroot.findtext('front/title').strip()

    def get_creation_date(self):
        """Get the document date from front/date

        The year, month, and day attributes are handed to
        self._construct_creation_date(), which is not defined in this class
        (presumably inherited from Draft).

        :returns: whatever _construct_creation_date() returns, or None if there
            is no date element or it raised ValueError
        """
        date_elt = self.xmlroot.find("front/date")
        if date_elt is None:
            return None
        try:
            return self._construct_creation_date(
                date_elt.get("year"),
                date_elt.get("month"),
                date_elt.get("day", None),
            )
        except ValueError:
            # an unusable date is treated the same as no date at all
            return None

    # todo fix the implementation of XMLDraft.get_abstract()
    #
    # This code was pulled from ietf.submit.forms where it existed for some time.
    # It does not work, at least with modern xml2rfc. This assumes that the abstract
    # is simply text in the front/abstract node, but the XML schema wraps the actual
    # abstract text in <t> elements (and allows <dl>, <ol>, and <ul> as well). As a
    # result, this method normally returns an empty string, which is later replaced by
    # the abstract parsed from the rendered text. For now, I am commenting this out
    # and making it explicit that the abstract always comes from the text format.
    #
    # def get_abstract(self):
    #     """Extract the abstract"""
    #     abstract = self.xmlroot.findtext('front/abstract')
    #     return abstract.strip() if abstract else ''

    def get_author_list(self):
        """Get detailed author list

        Returns a list of dicts with the following keys:
            name, first_name, middle_initial, last_name,
            name_suffix, email, country, affiliation
        Values will be None if not available

        Only name, email, country, and affiliation are ever filled in here; the
        country prefers the element's ascii attribute over its text.
        """
        result = []
        empty_author = {
            k: None for k in [
                'name', 'first_name', 'middle_initial', 'last_name',
                'name_suffix', 'email', 'country', 'affiliation',
            ]
        }

        for author in self.xmlroot.findall('front/author'):
            info = {
                'name': author.attrib.get('fullname'),
                'email': author.findtext('address/email'),
                'affiliation': author.findtext('organization'),
            }
            elem = author.find('address/postal/country')
            if elem is not None:
                ascii_country = elem.get('ascii', None)
                info['country'] = ascii_country if ascii_country else elem.text
            # strip whitespace from whatever was found; None and '' are left alone
            for item in info:
                if info[item]:
                    info[item] = info[item].strip()
            result.append(empty_author | info)  # merge, preferring info
        return result

    def get_refs(self):
        """Extract references from the draft

        :returns: dict mapping document name to the reference type of the section
            it appears in; entries whose name cannot be determined are skipped, and
            a name seen in more than one section keeps the type of the last one
        """
        refs = {}
        # accept nested <references> sections
        for section in self.xmlroot.findall("back//references"):
            ref_type = self._reference_section_type(
                self._reference_section_name(section)
            )
            for ref in (
                section.findall("./reference")
                + section.findall("./referencegroup")
                + section.findall(
                    "./xi:include", {"xi": "http://www.w3.org/2001/XInclude"}
                )
            ):
                name = self._document_name(ref)
                if name:
                    refs[name] = ref_type
        return refs
class XMLParseError(Exception):
    """An error occurred while parsing

    Holds two blocks of parser output text, which parser_msgs() returns as a
    single list of lines.
    """

    def __init__(self, out: str, err: str, *args):
        """Initialize XMLParseError instance

        :parameter out: text the parser wrote to its regular output
        :parameter err: text the parser wrote to its error output
        :parameter args: any further arguments, passed on to Exception
        """
        super().__init__(*args)
        self._out = out
        self._err = err

    def __reduce__(self):
        """Support pickling and copying

        The default Exception.__reduce__ rebuilds the instance from self.args only,
        which omits the required out and err arguments and so fails with TypeError.
        """
        return type(self), (self._out, self._err, *self.args), dict(self.__dict__)

    def parser_msgs(self):
        """Get the parser output as a list of lines, regular output first"""
        return self._out.splitlines() + self._err.splitlines()