fix: Fix spurious author extraction errors (#4799)
* Handle single-word author names
* Some i18n names, e.g., "शिला के.सी." have a dot at the end that is also part of the ASCII, e.g., "Shilaa Kesii."
  That trailing dot breaks extract_authors(). Avoid this issue by stripping the dot from the ASCII.
* Honorifics need to be part of the extracted ASCII name (e.g., "Lady Garcia")
* feat: stop supporting pre-tzaware migration database dumps. (#4782)
* feat: stop supporting pre-tzaware migration database dumps.
* chore: remove unnecessary env variable
* chore: Use `codespell` to fix typos in comments. (#4794)
  First part of replacement of #4651
* feat: Only show IPR search form when not showing search results (#4793)
* feat: Only show IPR search form when not showing search results
  Put it into a collapsible that is only expanded by default when not showing search results.
  Fixes #4569
* Don't use example target name
* fix: Don't show reorder UI fixtures unless user can reorder (#4785)
  Fixes #4773
  Co-authored-by: Robert Sparks <rjsparks@nostrum.com>
* chore: Update deps and fix resulting HTML validation issues (#4790)
* ci: add missing build matrix config for test-playwright-legacy step
* Single-letter last names exist (e.g., "Carolina de la O")
* Align regex with others
* Fix extraction of very long author names
* Need to be more general
* Add comment
* Also handle i18n names with trailing semicolons
* Name suffixes need to be part of the extracted author names
* Handle i18n names with embedded commas

Co-authored-by: Robert Sparks <rjsparks@nostrum.com>
Co-authored-by: Nicolas Giard <github@ngpixel.com>
This commit is contained in:
parent
3af68b6fbb
commit
d59c64943d
|
@ -65,7 +65,10 @@ class PersonFactory(factory.django.DjangoModelFactory):
|
|||
|
||||
user = factory.SubFactory(UserFactory)
|
||||
name = factory.LazyAttribute(lambda p: normalize_name('%s %s'%(p.user.first_name, p.user.last_name)))
|
||||
ascii = factory.LazyAttribute(lambda p: force_text(unidecode_name(p.name)))
|
||||
# Some i18n names, e.g., "शिला के.सी." have a dot at the end that is also part of the ASCII, e.g., "Shilaa Kesii."
|
||||
# That trailing dot breaks extract_authors(). Avoid this issue by stripping the dot from the ASCII.
|
||||
# Some others have a trailing semicolon (e.g., "உயிரோவியம் தங்கராஐ;") - strip those, too.
|
||||
ascii = factory.LazyAttribute(lambda p: force_text(unidecode_name(p.name)).rstrip(".;"))
|
||||
|
||||
class Params:
|
||||
with_bio = factory.Trait(biography = "\n\n".join(fake.paragraphs())) # type: ignore
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
|
||||
|
||||
Network Working Group %(initials)s %(surname)s
|
||||
Network Working Group %(firstpagename)37s
|
||||
Internet-Draft Test Centre Inc.
|
||||
Intended status: Informational %(month)s %(year)s
|
||||
Expires: %(expiration)s
|
||||
|
@ -10,7 +10,6 @@ Expires: %(expiration)s
|
|||
|
||||
%(title)s
|
||||
%(name)s
|
||||
|
||||
Abstract
|
||||
|
||||
This document describes how to test tests.
|
||||
|
|
|
@ -107,6 +107,10 @@ def submission_file(name_in_doc, name_in_post, group, templatename, author=None,
|
|||
if year is None:
|
||||
year = _today.strftime("%Y")
|
||||
|
||||
# extract_authors() cuts the author line off at the first space past 80 characters
|
||||
# very long factory-generated names can hence be truncated, causing a failure
|
||||
# ietf/submit/test_submission.txt was changed so that 37-character names and shorter will work
|
||||
# this may need further adjustment if longer names still cause failures
|
||||
submission_text = template % dict(
|
||||
date=_today.strftime("%d %B %Y"),
|
||||
expiration=(_today + datetime.timedelta(days=100)).strftime("%d %B, %Y"),
|
||||
|
@ -119,6 +123,7 @@ def submission_file(name_in_doc, name_in_post, group, templatename, author=None,
|
|||
asciiAuthor=author.ascii,
|
||||
initials=author.initials(),
|
||||
surname=author.ascii_parts()[3] if ascii else author.name_parts()[3],
|
||||
firstpagename=f"{author.initials()} {author.ascii_parts()[3] if ascii else author.name_parts()[3]}",
|
||||
asciiSurname=author.ascii_parts()[3],
|
||||
email=email,
|
||||
title=title,
|
||||
|
|
|
@ -592,8 +592,8 @@ class PlaintextDraft(Draft):
|
|||
"honor" : r"(?:[A-Z]\.|Dr\.?|Dr\.-Ing\.|Prof(?:\.?|essor)|Sir|Lady|Dame|Sri)",
|
||||
"prefix": r"([Dd]e|Hadi|van|van de|van der|Ver|von|[Ee]l)",
|
||||
"suffix": r"(jr.?|Jr.?|II|2nd|III|3rd|IV|4th)",
|
||||
"first" : r"([A-Z][-A-Za-z'`~]*)(( ?\([A-Z][-A-Za-z'`~]*\))?(\.?[- ]{1,2}[A-Za-z'`~]+)*)",
|
||||
"last" : r"([-A-Za-z'`~]{2,})",
|
||||
"first" : r"([A-Z][-A-Za-z'`~,]*)(( ?\([A-Z][-A-Za-z'`~,]*\))?(\.?[- ]{1,2}[A-Za-z'`~]+)*)",
|
||||
"last" : r"([-A-Za-z'`~,]+)", # single-letter last names exist
|
||||
"months": r"(January|February|March|April|May|June|July|August|September|October|November|December)",
|
||||
"mabbr" : r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\.?",
|
||||
}
|
||||
|
@ -658,7 +658,12 @@ class PlaintextDraft(Draft):
|
|||
|
||||
# permit insertion of middle names between first and last, and
|
||||
# add possible honorific and suffix information
|
||||
authpat = r"(?:^| and )(?:%(hon)s ?)?(['`]*%(first)s\S*( +[^ ]+)* +%(last)s)( *\(.*|,( [A-Z][-A-Za-z0-9]*)?| %(suffix)s| [A-Z][a-z]+)?" % {"hon":hon, "first":first, "last":last, "suffix":suffix,}
|
||||
if last:
|
||||
authpat = r"(?:^| and )((?:%(hon)s ?)?['`]*%(first)s\S*( +[^ ]+)* +%(last)s(?: %(suffix)s)?)( *\(.*|,( [A-Z][-A-Za-z0-9]*)?| [A-Z][a-z]+)?" % {"hon":hon, "first":first, "last":last, "suffix":suffix,}
|
||||
else:
|
||||
# handle single-word names
|
||||
authpat = r"(?:^| and )((?:%(hon)s ?)?['`]*%(first)s\S*( +[^ ]+)*(?: %(suffix)s)?)( *\(.*|,( [A-Z][-A-Za-z0-9]*)?| [A-Z][a-z]+)?" % {"hon":hon, "first":first, "suffix":suffix,}
|
||||
|
||||
return authpat
|
||||
|
||||
authors = []
|
||||
|
@ -812,7 +817,7 @@ class PlaintextDraft(Draft):
|
|||
author = author[:-len(suffix)].strip()
|
||||
else:
|
||||
suffix = None
|
||||
if "," in author:
|
||||
if ", " in author:
|
||||
last, first = author.split(",",1)
|
||||
author = "%s %s" % (first.strip(), last.strip())
|
||||
if not " " in author:
|
||||
|
@ -820,8 +825,9 @@ class PlaintextDraft(Draft):
|
|||
first, last = author.rsplit(".", 1)
|
||||
first += "."
|
||||
else:
|
||||
author = "[A-Z].+ " + author
|
||||
first, last = author.rsplit(" ", 1)
|
||||
# handle single-word names
|
||||
first = author
|
||||
last = ""
|
||||
else:
|
||||
if "." in author:
|
||||
first, last = author.rsplit(".", 1)
|
||||
|
@ -899,10 +905,14 @@ class PlaintextDraft(Draft):
|
|||
#else:
|
||||
# fullname = author_match
|
||||
fullname = re.sub(" +", " ", fullname)
|
||||
if left == firstname:
|
||||
given_names, surname = fullname.rsplit(None, 1)
|
||||
if re.search(r"\s", fullname):
|
||||
if left == firstname:
|
||||
given_names, surname = fullname.rsplit(None, 1)
|
||||
else:
|
||||
surname, given_names = fullname.split(None, 1)
|
||||
else:
|
||||
surname, given_names = fullname.split(None, 1)
|
||||
# handle single-word names
|
||||
given_names, surname = (fullname, "")
|
||||
if " " in given_names:
|
||||
first, middle = given_names.split(None, 1)
|
||||
else:
|
||||
|
|
Loading…
Reference in a new issue