fix: Fix spurious author extraction errors (#4799)

* Handle single-word author names

* Some i18n names, e.g., "शिला के.सी.", have a dot at the end that is
also part of the ASCII transliteration, e.g., "Shilaa Kesii." That
trailing dot breaks extract_authors(). Avoid this issue by stripping
the dot from the ASCII name.

* Honorifics need to be part of the extracted ASCII name (e.g., "Lady Garcia")

* feat: stop supporting pre-tzaware migration database dumps. (#4782)

* feat: stop supporting pre-tzaware migration database dumps.

* chore: remove unnecessary env variable

* chore: Use `codespell` to fix typos in comments. (#4794)

First part of replacement of #4651

* feat: Only show IPR search form when not showing search results (#4793)

* feat: Only show IPR search form when not showing search results

Put the form into a collapsible section that is expanded by default
only when search results are not being shown.

Fixes #4569

* Don't use example target name

* fix: Don't show reorder UI fixtures unless user can reorder (#4785)

Fixes #4773

Co-authored-by: Robert Sparks <rjsparks@nostrum.com>

* chore: Update deps and fix resulting HTML validation issues (#4790)

* ci: add missing build matrix config for test-playwright-legacy step

* Single-letter last names exist (e.g., "Carolina de la O")

* Align regex with others

* Fix extraction of very long author names

* Need to be more general

* Add comment

* Also handle i18n names with trailing semicolons

* Name suffixes need to be part of the extracted author names

* Handle i18n names with embedded commas

Co-authored-by: Robert Sparks <rjsparks@nostrum.com>
Co-authored-by: Nicolas Giard <github@ngpixel.com>
This commit is contained in:
Lars Eggert 2022-12-03 00:41:21 +03:00 committed by GitHub
parent 3af68b6fbb
commit d59c64943d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 29 additions and 12 deletions

View file

@ -65,7 +65,10 @@ class PersonFactory(factory.django.DjangoModelFactory):
user = factory.SubFactory(UserFactory)
name = factory.LazyAttribute(lambda p: normalize_name('%s %s'%(p.user.first_name, p.user.last_name)))
ascii = factory.LazyAttribute(lambda p: force_text(unidecode_name(p.name)))
# Some i18n names, e.g., "शिला के.सी." have a dot at the end that is also part of the ASCII, e.g., "Shilaa Kesii."
# That trailing dot breaks extract_authors(). Avoid this issue by stripping the dot from the ASCII.
# Some others have a trailing semicolon (e.g., "உயிரோவியம் தங்கராஐ;") - strip those, too.
ascii = factory.LazyAttribute(lambda p: force_text(unidecode_name(p.name)).rstrip(".;"))
class Params:
with_bio = factory.Trait(biography = "\n\n".join(fake.paragraphs())) # type: ignore

View file

@ -2,7 +2,7 @@
Network Working Group %(initials)s %(surname)s
Network Working Group %(firstpagename)37s
Internet-Draft Test Centre Inc.
Intended status: Informational %(month)s %(year)s
Expires: %(expiration)s
@ -10,7 +10,6 @@ Expires: %(expiration)s
%(title)s
%(name)s
Abstract
This document describes how to test tests.

View file

@ -107,6 +107,10 @@ def submission_file(name_in_doc, name_in_post, group, templatename, author=None,
if year is None:
year = _today.strftime("%Y")
# extract_authors() cuts the author line off at the first space past 80 characters.
# Very long factory-generated names can hence be truncated, causing a failure.
# ietf/submit/test_submission.txt was changed so that names of 37 characters or fewer will work;
# this may need further adjustment if longer names still cause failures.
submission_text = template % dict(
date=_today.strftime("%d %B %Y"),
expiration=(_today + datetime.timedelta(days=100)).strftime("%d %B, %Y"),
@ -119,6 +123,7 @@ def submission_file(name_in_doc, name_in_post, group, templatename, author=None,
asciiAuthor=author.ascii,
initials=author.initials(),
surname=author.ascii_parts()[3] if ascii else author.name_parts()[3],
firstpagename=f"{author.initials()} {author.ascii_parts()[3] if ascii else author.name_parts()[3]}",
asciiSurname=author.ascii_parts()[3],
email=email,
title=title,

View file

@ -592,8 +592,8 @@ class PlaintextDraft(Draft):
"honor" : r"(?:[A-Z]\.|Dr\.?|Dr\.-Ing\.|Prof(?:\.?|essor)|Sir|Lady|Dame|Sri)",
"prefix": r"([Dd]e|Hadi|van|van de|van der|Ver|von|[Ee]l)",
"suffix": r"(jr.?|Jr.?|II|2nd|III|3rd|IV|4th)",
"first" : r"([A-Z][-A-Za-z'`~]*)(( ?\([A-Z][-A-Za-z'`~]*\))?(\.?[- ]{1,2}[A-Za-z'`~]+)*)",
"last" : r"([-A-Za-z'`~]{2,})",
"first" : r"([A-Z][-A-Za-z'`~,]*)(( ?\([A-Z][-A-Za-z'`~,]*\))?(\.?[- ]{1,2}[A-Za-z'`~]+)*)",
"last" : r"([-A-Za-z'`~,]+)", # single-letter last names exist
"months": r"(January|February|March|April|May|June|July|August|September|October|November|December)",
"mabbr" : r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\.?",
}
@ -658,7 +658,12 @@ class PlaintextDraft(Draft):
# permit insertion of middle names between first and last, and
# add possible honorific and suffix information
authpat = r"(?:^| and )(?:%(hon)s ?)?(['`]*%(first)s\S*( +[^ ]+)* +%(last)s)( *\(.*|,( [A-Z][-A-Za-z0-9]*)?| %(suffix)s| [A-Z][a-z]+)?" % {"hon":hon, "first":first, "last":last, "suffix":suffix,}
if last:
authpat = r"(?:^| and )((?:%(hon)s ?)?['`]*%(first)s\S*( +[^ ]+)* +%(last)s(?: %(suffix)s)?)( *\(.*|,( [A-Z][-A-Za-z0-9]*)?| [A-Z][a-z]+)?" % {"hon":hon, "first":first, "last":last, "suffix":suffix,}
else:
# handle single-word names
authpat = r"(?:^| and )((?:%(hon)s ?)?['`]*%(first)s\S*( +[^ ]+)*(?: %(suffix)s)?)( *\(.*|,( [A-Z][-A-Za-z0-9]*)?| [A-Z][a-z]+)?" % {"hon":hon, "first":first, "suffix":suffix,}
return authpat
authors = []
@ -812,7 +817,7 @@ class PlaintextDraft(Draft):
author = author[:-len(suffix)].strip()
else:
suffix = None
if "," in author:
if ", " in author:
last, first = author.split(",",1)
author = "%s %s" % (first.strip(), last.strip())
if not " " in author:
@ -820,8 +825,9 @@ class PlaintextDraft(Draft):
first, last = author.rsplit(".", 1)
first += "."
else:
author = "[A-Z].+ " + author
first, last = author.rsplit(" ", 1)
# handle single-word names
first = author
last = ""
else:
if "." in author:
first, last = author.rsplit(".", 1)
@ -899,10 +905,14 @@ class PlaintextDraft(Draft):
#else:
# fullname = author_match
fullname = re.sub(" +", " ", fullname)
if left == firstname:
given_names, surname = fullname.rsplit(None, 1)
if re.search(r"\s", fullname):
if left == firstname:
given_names, surname = fullname.rsplit(None, 1)
else:
surname, given_names = fullname.split(None, 1)
else:
surname, given_names = fullname.split(None, 1)
# handle single-word names
given_names, surname = (fullname, "")
if " " in given_names:
first, middle = given_names.split(None, 1)
else: