From d59c64943d70297ba487741a9306966e91b613ee Mon Sep 17 00:00:00 2001 From: Lars Eggert Date: Sat, 3 Dec 2022 00:41:21 +0300 Subject: [PATCH] fix: Fix spurious author extraction errors (#4799) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Handle single-word author names * Some i18n names, e.g., "शिला के.सी." have a dot at the end that is also part of the ASCII, e.g., "Shilaa Kesii." That trailing dot breaks extract_authors(). Avoid this issue by stripping the dot from the ASCII. * Honorifics need to be part of the extracted ASCII name (e.g., "Lady Garcia") * feat: stop supporting pre-tzaware migration database dumps. (#4782) * feat: stop supporting pre-tzaware migration database dumps. * chore: remove unnecessary env variable * chore: Use `codespell` to fix typos in comments. (#4794) First part of replacement of #4651 * feat: Only show IPR search form when not showing search results (#4793) * feat: Only show IPR search form when not showing search results Put it into a collapsible that is only expanded by default when not showing search results. Fixes #4569 * Don't use example target name * fix: Don't show reorder UI fixtures unless user can reorder (#4785) Fixes #4773 Co-authored-by: Robert Sparks * chore: Update deps and fix resulting HTML validation issues (#4790) * ci: add missing build matrix config for test-playwright-legacy step * Single-letter last names exist (e.g., "Carolina de la O") * Align regex with others * Fix extraction of very long author names * Need to be more general * Add comment * Also handle i18n names with trailing semicolons * Name suffixes need to be part of the extracted author names * Handle i18n names with embedded commas Co-authored-by: Robert Sparks Co-authored-by: Nicolas Giard --- ietf/person/factories.py | 5 ++++- ietf/submit/test_submission.txt | 3 +-- ietf/submit/tests.py | 5 +++++ ietf/utils/draft.py | 28 +++++++++++++++++++--------- 4 files changed, 29 insertions(+), 12 deletions(-) diff --git a/ietf/person/factories.py b/ietf/person/factories.py index 580c0bffe..8e80932c9 100644 --- a/ietf/person/factories.py +++ b/ietf/person/factories.py @@ -65,7 +65,10 @@ class PersonFactory(factory.django.DjangoModelFactory): user = factory.SubFactory(UserFactory) name = factory.LazyAttribute(lambda p: normalize_name('%s %s'%(p.user.first_name, p.user.last_name))) - ascii = factory.LazyAttribute(lambda p: force_text(unidecode_name(p.name))) + # Some i18n names, e.g., "शिला के.सी." have a dot at the end that is also part of the ASCII, e.g., "Shilaa Kesii." + # That trailing dot breaks extract_authors(). Avoid this issue by stripping the dot from the ASCII. + # Some others have a trailing semicolon (e.g., "உயிரோவியம் தங்கராஐ;") - strip those, too. + ascii = factory.LazyAttribute(lambda p: force_text(unidecode_name(p.name)).rstrip(".;")) class Params: with_bio = factory.Trait(biography = "\n\n".join(fake.paragraphs())) # type: ignore diff --git a/ietf/submit/test_submission.txt b/ietf/submit/test_submission.txt index 37a433ef8..5e828e53f 100644 --- a/ietf/submit/test_submission.txt +++ b/ietf/submit/test_submission.txt @@ -2,7 +2,7 @@ -Network Working Group %(initials)s %(surname)s +Network Working Group %(firstpagename)37s Internet-Draft Test Centre Inc. Intended status: Informational %(month)s %(year)s Expires: %(expiration)s @@ -10,7 +10,6 @@ Expires: %(expiration)s %(title)s %(name)s - Abstract This document describes how to test tests. diff --git a/ietf/submit/tests.py b/ietf/submit/tests.py index 03606b23f..ad1891c95 100644 --- a/ietf/submit/tests.py +++ b/ietf/submit/tests.py @@ -107,6 +107,10 @@ def submission_file(name_in_doc, name_in_post, group, templatename, author=None, if year is None: year = _today.strftime("%Y") + # extract_authors() cuts the author line off at the first space past 80 characters + # very long factory-generated names can hence be truncated, causing a failure + # ietf/submit/test_submission.txt was changed so that 37-character names and shorter will work + # this may need further adjustment if longer names still cause failures submission_text = template % dict( date=_today.strftime("%d %B %Y"), expiration=(_today + datetime.timedelta(days=100)).strftime("%d %B, %Y"), @@ -119,6 +123,7 @@ def submission_file(name_in_doc, name_in_post, group, templatename, author=None, asciiAuthor=author.ascii, initials=author.initials(), surname=author.ascii_parts()[3] if ascii else author.name_parts()[3], + firstpagename=f"{author.initials()} {author.ascii_parts()[3] if ascii else author.name_parts()[3]}", asciiSurname=author.ascii_parts()[3], email=email, title=title, diff --git a/ietf/utils/draft.py b/ietf/utils/draft.py index 0a379b0e9..b89ce1bd7 100755 --- a/ietf/utils/draft.py +++ b/ietf/utils/draft.py @@ -592,8 +592,8 @@ class PlaintextDraft(Draft): "honor" : r"(?:[A-Z]\.|Dr\.?|Dr\.-Ing\.|Prof(?:\.?|essor)|Sir|Lady|Dame|Sri)", "prefix": r"([Dd]e|Hadi|van|van de|van der|Ver|von|[Ee]l)", "suffix": r"(jr.?|Jr.?|II|2nd|III|3rd|IV|4th)", - "first" : r"([A-Z][-A-Za-z'`~]*)(( ?\([A-Z][-A-Za-z'`~]*\))?(\.?[- ]{1,2}[A-Za-z'`~]+)*)", - "last" : r"([-A-Za-z'`~]{2,})", + "first" : r"([A-Z][-A-Za-z'`~,]*)(( ?\([A-Z][-A-Za-z'`~,]*\))?(\.?[- ]{1,2}[A-Za-z'`~]+)*)", + "last" : r"([-A-Za-z'`~,]+)", # single-letter last names exist "months": r"(January|February|March|April|May|June|July|August|September|October|November|December)", "mabbr" : r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\.?", } @@ -658,7 +658,12 @@ class PlaintextDraft(Draft): # permit insertion of middle names between first and last, and # add possible honorific and suffix information - authpat = r"(?:^| and )(?:%(hon)s ?)?(['`]*%(first)s\S*( +[^ ]+)* +%(last)s)( *\(.*|,( [A-Z][-A-Za-z0-9]*)?| %(suffix)s| [A-Z][a-z]+)?" % {"hon":hon, "first":first, "last":last, "suffix":suffix,} + if last: + authpat = r"(?:^| and )((?:%(hon)s ?)?['`]*%(first)s\S*( +[^ ]+)* +%(last)s(?: %(suffix)s)?)( *\(.*|,( [A-Z][-A-Za-z0-9]*)?| [A-Z][a-z]+)?" % {"hon":hon, "first":first, "last":last, "suffix":suffix,} + else: + # handle single-word names + authpat = r"(?:^| and )((?:%(hon)s ?)?['`]*%(first)s\S*( +[^ ]+)*(?: %(suffix)s)?)( *\(.*|,( [A-Z][-A-Za-z0-9]*)?| [A-Z][a-z]+)?" % {"hon":hon, "first":first, "suffix":suffix,} + return authpat authors = [] @@ -812,7 +817,7 @@ class PlaintextDraft(Draft): author = author[:-len(suffix)].strip() else: suffix = None - if "," in author: + if ", " in author: last, first = author.split(",",1) author = "%s %s" % (first.strip(), last.strip()) if not " " in author: @@ -820,8 +825,9 @@ class PlaintextDraft(Draft): first, last = author.rsplit(".", 1) first += "." else: - author = "[A-Z].+ " + author - first, last = author.rsplit(" ", 1) + # handle single-word names + first = author + last = "" else: if "." in author: first, last = author.rsplit(".", 1) @@ -899,10 +905,14 @@ class PlaintextDraft(Draft): #else: # fullname = author_match fullname = re.sub(" +", " ", fullname) - if left == firstname: - given_names, surname = fullname.rsplit(None, 1) + if re.search(r"\s", fullname): + if left == firstname: + given_names, surname = fullname.rsplit(None, 1) + else: + surname, given_names = fullname.split(None, 1) else: - surname, given_names = fullname.split(None, 1) + # handle single-word names + given_names, surname = (fullname, "") if " " in given_names: first, middle = given_names.split(None, 1) else: