feat: Extract document creation date from XML draft (#5733)

* fix: Extract document creation date from XML draft

* test: Fix test
This commit is contained in:
Jennifer Richards 2023-06-01 11:58:55 -03:00 committed by GitHub
parent 8d4780d304
commit 5a2708283b
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 60 additions and 24 deletions

View file

@ -3354,7 +3354,7 @@ class AsyncSubmissionTests(BaseSubmitTestCase):
self.assertEqual(output["title"], "Correct Draft Title") self.assertEqual(output["title"], "Correct Draft Title")
self.assertIsNone(output["abstract"]) self.assertIsNone(output["abstract"])
self.assertEqual(len(output["authors"]), 1) # not checking in detail, parsing is unreliable self.assertEqual(len(output["authors"]), 1) # not checking in detail, parsing is unreliable
self.assertIsNone(output["document_date"]) self.assertEqual(output["document_date"], date_today())
self.assertIsNone(output["pages"]) self.assertIsNone(output["pages"])
self.assertIsNone(output["words"]) self.assertIsNone(output["words"])
self.assertIsNone(output["first_two_pages"]) self.assertIsNone(output["first_two_pages"])

View file

@ -1159,7 +1159,7 @@ def process_submission_xml(filename, revision):
for auth in xml_draft.get_author_list() for auth in xml_draft.get_author_list()
], ],
"abstract": None, # not supported from XML "abstract": None, # not supported from XML
"document_date": None, # not supported from XML "document_date": xml_draft.get_creation_date(),
"pages": None, # not supported from XML "pages": None, # not supported from XML
"words": None, # not supported from XML "words": None, # not supported from XML
"first_two_pages": None, # not supported from XML "first_two_pages": None, # not supported from XML
@ -1287,9 +1287,14 @@ def process_and_validate_submission(submission):
if not submission.title: if not submission.title:
raise SubmissionError("Could not determine the title of the draft") raise SubmissionError("Could not determine the title of the draft")
# Items to get from text only when not available from XML
if xml_metadata and xml_metadata.get("document_date", None) is not None:
submission.document_date = xml_metadata["document_date"]
else:
submission.document_date = text_metadata["document_date"]
# Items always to get from text, even when XML is available # Items always to get from text, even when XML is available
submission.abstract = text_metadata["abstract"] submission.abstract = text_metadata["abstract"]
submission.document_date = text_metadata["document_date"]
submission.pages = text_metadata["pages"] submission.pages = text_metadata["pages"]
submission.words = text_metadata["words"] submission.words = text_metadata["words"]
submission.first_two_pages = text_metadata["first_two_pages"] submission.first_two_pages = text_metadata["first_two_pages"]

View file

@ -189,6 +189,46 @@ class Draft:
def get_wordcount(self): def get_wordcount(self):
raise NotImplementedError raise NotImplementedError
@staticmethod
def _construct_creation_date(year, month, day=None):
    """Construct a date for the document

    Roughly follows RFC 7991 section 2.17, but only allows a missing day.
    When no day is given, today's day is used if month/year are the current
    month/year; otherwise the 15th is assumed.

    year: integer or string with 4-digit year
    month: integer or string with numeric or English month. Some abbreviations recognized.
    day: integer or string with numeric day of month. Optional; None or 0
         both mean "not specified".

    Returns a datetime.date.

    Raises ValueError if there is a problem interpreting the data
    """
    try:
        year = int(year)
        # None is the default "no day" value; int(None) would raise
        # TypeError, so map it to 0, the falsy marker tested below.
        day = 0 if day is None else int(day)
    except TypeError as err:
        # Callers only handle ValueError, so report missing or non-numeric
        # components through the documented exception type.
        raise ValueError("Missing or non-numeric year/day") from err

    if isinstance(month, str):
        month = month.lower()
        # Full names take precedence, then 3- and 4-letter abbreviations,
        # and finally a plain month number given as a string.
        for names in (month_names, month_names_abbrev3, month_names_abbrev4):
            if month in names:
                month = names.index(month) + 1
                break
        else:
            if month.isdigit() and 1 <= int(month) <= 12:
                month = int(month)
            else:
                raise ValueError("Unrecognized month")

    if not day:
        # if the date was given with only month and year, use
        # today's date if month and year is today's month and
        # year, otherwise pick the middle of the month.
        # Don't use today's day for month and year in the past
        today = date_today()
        if month == today.month and year == today.year:
            day = today.day
        else:
            day = 15

    try:
        return datetime.date(year, month, day)
    except TypeError as err:
        # e.g. month is None or another non-integer object
        raise ValueError("Unrecognized month") from err
# ---------------------------------------------------------------------- # ----------------------------------------------------------------------
@ -460,27 +500,7 @@ class PlaintextDraft(Draft):
day = int( md.get( 'day', 0 ) ) day = int( md.get( 'day', 0 ) )
year = int( md['year'] ) year = int( md['year'] )
try: try:
if mon in month_names: self._creation_date = self._construct_creation_date(year, mon, day)
month = month_names.index( mon ) + 1
elif mon in month_names_abbrev3:
month = month_names_abbrev3.index( mon ) + 1
elif mon in month_names_abbrev4:
month = month_names_abbrev4.index( mon ) + 1
elif mon.isdigit() and int(mon) in range(1,13):
month = int(mon)
else:
continue
today = date_today()
if day==0:
# if the date was given with only month and year, use
# today's date if month and year is today's month and
# year, otherwise pick the middle of the month.
# Don't use today's day for month and year in the past
if month==today.month and year==today.year:
day = today.day
else:
day = 15
self._creation_date = datetime.date(year, month, day)
return self._creation_date return self._creation_date
except ValueError: except ValueError:
# mon abbreviation not in _MONTH_NAMES # mon abbreviation not in _MONTH_NAMES

View file

@ -133,6 +133,17 @@ class XMLDraft(Draft):
def get_title(self): def get_title(self):
return self.xmlroot.findtext('front/title').strip() return self.xmlroot.findtext('front/title').strip()
def get_creation_date(self):
    """Return the document date taken from the XML front/date element

    Reads the year, month and (optional) day attributes of the element and
    builds the date with _construct_creation_date().

    Returns None when there is no front/date element, when the year or
    month attribute is absent, or when the attribute values cannot be
    interpreted as a date.
    """
    date_elt = self.xmlroot.find("front/date")
    if date_elt is None:
        return None

    year = date_elt.get("year")
    month = date_elt.get("month")
    if year is None or month is None:
        # Only a missing day can be filled in; without a year or month
        # there is nothing to build a date from.
        return None

    # 0 is the "no day given" marker (the same value the plaintext parser
    # passes); it stays falsy after numeric conversion. An empty day
    # attribute is treated like an absent one.
    day = date_elt.get("day") or 0
    try:
        return self._construct_creation_date(year, month, day)
    except ValueError:
        # Unparseable date attributes: report "no date available"
        return None
# todo fix the implementation of XMLDraft.get_abstract() # todo fix the implementation of XMLDraft.get_abstract()
# #
# This code was pulled from ietf.submit.forms where it existed for some time. # This code was pulled from ietf.submit.forms where it existed for some time.