From 2c1438c2401b5f2666764eb2543f19c3f62ce0f4 Mon Sep 17 00:00:00 2001 From: Henrik Levkowetz Date: Wed, 20 Sep 2017 15:36:30 +0000 Subject: [PATCH] Moved unidecode_name from utils.text to person.name. Modified UserFactory to use a new locale for each new user, instead of the same locale for a whole test run. This (almost) ensures the exercise of code to deal with non-ascii names, something which would not happen if a locale with ascii names was chosen at the start of a run. Modified name.initials() to not use non-word characters as initials. Modified unidecode_name() to do more normalization, to conform to the conventions used in internet-drafts. Added saving of the factory-boy random state in order to be able to re-run a test suite with the same pseudo-random sequence as in a previous failed run. Fixed an issue with email formatting in test_api_submit_ok(). Modified the draft author extraction code to deal better with names with embedded apostrophes. - Legacy-Id: 14141 --- ietf/nomcom/utils.py | 2 +- ietf/person/factories.py | 13 +++--- ietf/person/models.py | 2 +- ietf/person/name.py | 55 ++++++++++++++++++++++++-- ietf/review/import_from_review_tool.py | 2 +- ietf/settings.py | 3 ++ ietf/stats/utils.py | 2 +- ietf/submit/tests.py | 2 +- ietf/submit/utils.py | 2 +- ietf/utils/draft.py | 6 +-- ietf/utils/test_data.py | 2 +- ietf/utils/test_runner.py | 16 ++++++++ ietf/utils/text.py | 9 ----- 13 files changed, 89 insertions(+), 27 deletions(-) diff --git a/ietf/nomcom/utils.py b/ietf/nomcom/utils.py index 5929d90d9..6e9c5e05a 100644 --- a/ietf/nomcom/utils.py +++ b/ietf/nomcom/utils.py @@ -22,7 +22,7 @@ from ietf.mailtrigger.utils import gather_address_lists from ietf.utils.pipe import pipe from ietf.utils.mail import send_mail_text, send_mail from ietf.utils.log import log -from ietf.utils.text import unidecode_name +from ietf.person.name import unidecode_name import debug # pyflakes:ignore diff --git a/ietf/person/factories.py b/ietf/person/factories.py index 
20be62f14..6f64c7967 100644 --- a/ietf/person/factories.py +++ b/ietf/person/factories.py @@ -13,20 +13,23 @@ from django.utils.text import slugify import debug # pyflakes:ignore from ietf.person.models import Person, Alias, Email -from ietf.utils.text import unidecode_name +from ietf.person.name import unidecode_name fake = faker.Factory.create() +def random_faker(): + return faker.Faker(random.sample(faker.config.AVAILABLE_LOCALES, 1)[0]) + class UserFactory(factory.DjangoModelFactory): class Meta: model = User django_get_or_create = ('username',) - exclude = ['locale', ] + exclude = ['faker', ] - locale = random.sample(faker.config.AVAILABLE_LOCALES, 1)[0] - first_name = factory.Faker('first_name', locale) - last_name = factory.Faker('last_name', locale) + faker = factory.LazyFunction(random_faker) + first_name = factory.LazyAttribute(lambda o: o.faker.first_name()) + last_name = factory.LazyAttribute(lambda o: o.faker.last_name()) email = factory.LazyAttributeSequence(lambda u, n: '%s.%s_%d@%s'%( slugify(unidecode(u.first_name)), slugify(unidecode(u.last_name)), n, fake.domain_name())) username = factory.LazyAttribute(lambda u: u.email) diff --git a/ietf/person/models.py b/ietf/person/models.py index 8988e458a..217720fbe 100644 --- a/ietf/person/models.py +++ b/ietf/person/models.py @@ -20,7 +20,7 @@ from ietf.person.name import name_parts, initials, plain_name from ietf.utils.mail import send_mail_preformatted from ietf.utils.storage import NoLocationMigrationFileSystemStorage from ietf.utils.mail import formataddr -from ietf.utils.text import unidecode_name +from ietf.person.name import unidecode_name class PersonInfo(models.Model): diff --git a/ietf/person/name.py b/ietf/person/name.py index b10b1bc6d..a6ff5d7a6 100644 --- a/ietf/person/name.py +++ b/ietf/person/name.py @@ -1,7 +1,12 @@ import re +import unidecode import debug # pyflakes:ignore + +def name_particle_match(name): + return re.search(r" (af|al|Al|de|der|di|Di|du|el|El|Hadi|in 
't|Le|st\.?|St\.?|ten|ter|van|van der|Van|von|von der|Von|zu) ", name) + def name_parts(name): prefix, first, middle, last, suffix = u"", u"", u"", u"", u"" @@ -36,7 +41,7 @@ def name_parts(name): full = full.lower() # adjust case for all-uppercase input # This is an incomplete list. Adjust as needed to handle known ietf # participant names correctly: - particle = re.search(r" (af|al|Al|de|der|di|Di|du|el|El|Hadi|in 't|Le|st\.?|St\.?|ten|ter|van|van der|Van|von|von der|Von|zu) ", full) + particle = name_particle_match(full) if particle: pos = particle.start() parts = full[:pos].split() + [full[pos+1:]] @@ -52,19 +57,63 @@ def name_parts(name): else: last = parts[0] return prefix, first, middle, last, suffix - + def initials(name): prefix, first, middle, last, suffix = name_parts(name) given = first if middle: given += u" "+middle - initials = u" ".join([ n[0]+'.' for n in given.split() ]) + # Don't use non-word characters as initials. + # Example: The Bulgarian transcribed name "'Rnest Balkanska" should not have an initial of "'". + given = re.sub('[^ .\w]', '', given) + initials = u" ".join([ n[0].upper()+'.' for n in given.split() ]) return initials def plain_name(name): prefix, first, middle, last, suffix = name_parts(name) return u" ".join([first, last]) +def capfirst(s): + # Capitalize the first word character, skipping non-word characters and + # leaving following word characters untouched: + letters = list(s) + for i,l in enumerate(letters): + if l.isalpha(): + letters[i] = l.capitalize() + break + return ''.join(letters) + +def unidecode_name(uname): + """ + unidecode() of cjk ideograms can produce strings which contain spaces. + Strip leading and trailing spaces, and reduce double-spaces to single. + + For some other ranges, unidecode returns all-lowercase names; fix these + up with capitalization. 
+ """ + # Fix double spacing + name = unidecode.unidecode(uname) + if name == uname: + return name + name = name.strip().replace(' ', ' ') + # Fix all-upper and all-lower names: + # Check for name particles -- don't capitalize those + m = name_particle_match(name) + particle = m.group(1) if m else None + # Get the name parts + prefix, first, middle, last, suffix = name_parts(name) + # Capitalize names + first = capfirst(first) + middle = ' '.join([ capfirst(p) for p in middle.split() ]) + last = ' '.join([ capfirst(p) for p in last.split() ]) + # Restore the particle, if any + if particle and last.startswith(capfirst(particle)+' '): + last = ' '.join([ particle, last[len(particle)+1:] ]) + # Recombine the parts + parts = prefix, first, middle, last, suffix + name = ' '.join([ p for p in parts if p and p.strip() != '' ]) + return name + if __name__ == "__main__": import sys name = u" ".join(sys.argv[1:]) diff --git a/ietf/review/import_from_review_tool.py b/ietf/review/import_from_review_tool.py index f91aa50ce..93635d813 100755 --- a/ietf/review/import_from_review_tool.py +++ b/ietf/review/import_from_review_tool.py @@ -25,7 +25,7 @@ from ietf.person.models import Person, Email, Alias from ietf.doc.models import Document, DocAlias, ReviewRequestDocEvent, NewRevisionDocEvent, DocTypeName, State from ietf.utils.text import strip_prefix, xslugify from ietf.review.utils import possibly_advance_next_reviewer_for_team -from ietf.utils.text import unidecode_name +from ietf.person.name import unidecode_name parser = argparse.ArgumentParser() parser.add_argument("database", help="database must be included in settings") diff --git a/ietf/settings.py b/ietf/settings.py index 3dc5814f1..abcaa6010 100644 --- a/ietf/settings.py +++ b/ietf/settings.py @@ -920,6 +920,9 @@ SILENCED_SYSTEM_CHECKS = [ STATS_NAMES_LIMIT = 25 +UTILS_TEST_RANDOM_STATE_FILE = '.factoryboy_random_state' + + # Put the production SECRET_KEY in settings_local.py, and also any other # sensitive or 
site-specific changes. DO NOT commit settings_local.py to svn. from settings_local import * # pyflakes:ignore pylint: disable=wildcard-import diff --git a/ietf/stats/utils.py b/ietf/stats/utils.py index d48e8d2d0..fbf0fbd2b 100644 --- a/ietf/stats/utils.py +++ b/ietf/stats/utils.py @@ -8,7 +8,7 @@ from django.contrib.auth.models import User from ietf.stats.models import AffiliationAlias, AffiliationIgnoredEnding, CountryAlias, MeetingRegistration from ietf.name.models import CountryName from ietf.person.models import Person, Email, Alias -from ietf.utils.text import unidecode_name +from ietf.person.name import unidecode_name def compile_affiliation_ending_stripping_regexp(): diff --git a/ietf/submit/tests.py b/ietf/submit/tests.py index f0da29f83..d936bed06 100644 --- a/ietf/submit/tests.py +++ b/ietf/submit/tests.py @@ -1588,7 +1588,7 @@ class ApiSubmitTests(TestCase): def test_api_submit_ok(self): r, author, name = self.post_submission('00') - expected = "Upload of %s OK, confirmation requests sent to:\n %s" % (name, author.formatted_email()) + expected = "Upload of %s OK, confirmation requests sent to:\n %s" % (name, author.formatted_email().replace('\n','')) self.assertContains(r, expected, status_code=200) def test_api_submit_no_user(self): diff --git a/ietf/submit/utils.py b/ietf/submit/utils.py index 4f7b013de..ccabac8df 100644 --- a/ietf/submit/utils.py +++ b/ietf/submit/utils.py @@ -30,7 +30,7 @@ from ietf.utils import log from ietf.utils.accesstoken import generate_random_key from ietf.utils.draft import Draft from ietf.utils.mail import is_valid_email -from ietf.utils.text import unidecode_name +from ietf.person.name import unidecode_name def validate_submission(submission): diff --git a/ietf/utils/draft.py b/ietf/utils/draft.py index aa0815d6d..cc9f2b2c0 100755 --- a/ietf/utils/draft.py +++ b/ietf/utils/draft.py @@ -509,8 +509,8 @@ class Draft(): "honor" : r"(?:[A-Z]\.|Dr\.?|Dr\.-Ing\.|Prof(?:\.?|essor)|Sir|Lady|Dame|Sri)", "prefix": 
r"([Dd]e|Hadi|van|van de|van der|Ver|von|[Ee]l)", "suffix": r"(jr.?|Jr.?|II|2nd|III|3rd|IV|4th)", - "first" : r"([A-Z][-A-Za-z]*)(( ?\([A-Z][-A-Za-z]*\))?(\.?[- ]{1,2}[A-Za-z]+)*)", - "last" : r"([-A-Za-z']{2,})", + "first" : r"([A-Z][-A-Za-z'`]*)(( ?\([A-Z][-A-Za-z'`]*\))?(\.?[- ]{1,2}[A-Za-z'`]+)*)", + "last" : r"([-A-Za-z'`]{2,})", "months": r"(January|February|March|April|May|June|July|August|September|October|November|December)", "mabbr" : r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\.?", } @@ -575,7 +575,7 @@ class Draft(): # permit insertion of middle names between first and last, and # add possible honorific and suffix information - authpat = r"(?:^| and )(?:%(hon)s ?)?(%(first)s\S*( +[^ ]+)* +%(last)s)( *\(.*|,( [A-Z][-A-Za-z0-9]*)?| %(suffix)s| [A-Z][a-z]+)?" % {"hon":hon, "first":first, "last":last, "suffix":suffix,} + authpat = r"(?:^| and )(?:%(hon)s ?)?([`']?%(first)s\S*( +[^ ]+)* +%(last)s)( *\(.*|,( [A-Z][-A-Za-z0-9]*)?| %(suffix)s| [A-Z][a-z]+)?" % {"hon":hon, "first":first, "last":last, "suffix":suffix,} return authpat authors = [] diff --git a/ietf/utils/test_data.py b/ietf/utils/test_data.py index 0b34188fe..264eb9763 100644 --- a/ietf/utils/test_data.py +++ b/ietf/utils/test_data.py @@ -20,7 +20,7 @@ from ietf.name.models import StreamName, DocRelationshipName, RoomResourceName from ietf.person.models import Person, Email from ietf.group.utils import setup_default_community_list_for_group from ietf.review.models import (ReviewRequest, ReviewerSettings, ReviewResultName, ReviewTypeName, ReviewTeamSettings ) -from ietf.utils.text import unidecode_name +from ietf.person.name import unidecode_name def create_person(group, role_name, name=None, username=None, email_address=None, password=None, is_staff=False, is_superuser=False): diff --git a/ietf/utils/test_runner.py b/ietf/utils/test_runner.py index bb6f8912a..a6d6e76af 100644 --- a/ietf/utils/test_runner.py +++ b/ietf/utils/test_runner.py @@ -45,6 +45,7 @@ import datetime import codecs 
import gzip import unittest +import factory.random from fnmatch import fnmatch from coverage.report import Reporter @@ -557,6 +558,18 @@ class IetfTestRunner(DiscoverRunner): maybe_create_svn_symlinks(settings) + if os.path.exists(settings.UTILS_TEST_RANDOM_STATE_FILE): + print " Loading factory-boy random state from .random-state" + with open(settings.UTILS_TEST_RANDOM_STATE_FILE) as f: + s = json.load(f) + s[1] = tuple(s[1]) # random.setstate() won't accept a list in lieu of a tuple + factory.random.set_random_state(s) + else: + print " Saving factory-boy random state to .random-state" + with open(settings.UTILS_TEST_RANDOM_STATE_FILE, 'w') as f: + s = factory.random.get_random_state() + json.dump(s, f) + super(IetfTestRunner, self).setup_test_environment(**kwargs) def teardown_test_environment(self, **kwargs): @@ -683,4 +696,7 @@ class IetfTestRunner(DiscoverRunner): save_test_results(failures, test_labels) + if not failures and os.path.exists(settings.UTILS_TEST_RANDOM_STATE_FILE): + os.unlink(settings.UTILS_TEST_RANDOM_STATE_FILE) + return failures diff --git a/ietf/utils/text.py b/ietf/utils/text.py index d7a6bb3ca..0d074f158 100644 --- a/ietf/utils/text.py +++ b/ietf/utils/text.py @@ -4,7 +4,6 @@ import re import textwrap import types import unicodedata -import unidecode from django.utils.functional import allow_lazy from django.utils import six @@ -125,11 +124,3 @@ def isascii(text): return True except UnicodeEncodeError: return False - -def unidecode_name(name): - """ - unidecode() of cjk ideograms can produce strings which contain spaces. - Strip leading and trailing spaces, and reduce double-spaces to single. - """ - return unidecode.unidecode(name).strip().replace(' ', ' ') -