From 33b275b04f7f70140448f69827a73446bb7f40fc Mon Sep 17 00:00:00 2001 From: Henrik Levkowetz Date: Sun, 17 Sep 2017 15:12:18 +0000 Subject: [PATCH] Added ietf.utils.text.unidecode_name() and replaced various uses of unidecode() with it, in order to normalize the generation of ascii versions of names, to avoid different practices in space stripping and space normalization in different parts of the code. - Legacy-Id: 14128 --- ietf/nomcom/utils.py | 4 ++-- ietf/person/factories.py | 4 +++- ietf/person/models.py | 10 +++++----- ietf/review/import_from_review_tool.py | 6 +++--- ietf/stats/utils.py | 6 +++--- ietf/submit/utils.py | 4 ++-- ietf/utils/test_data.py | 5 +++-- ietf/utils/text.py | 10 +++++++++- 8 files changed, 30 insertions(+), 19 deletions(-) diff --git a/ietf/nomcom/utils.py b/ietf/nomcom/utils.py index 21d2b17cc..5929d90d9 100644 --- a/ietf/nomcom/utils.py +++ b/ietf/nomcom/utils.py @@ -20,9 +20,9 @@ from ietf.dbtemplate.models import DBTemplate from ietf.person.models import Email, Person from ietf.mailtrigger.utils import gather_address_lists from ietf.utils.pipe import pipe -from unidecode import unidecode from ietf.utils.mail import send_mail_text, send_mail from ietf.utils.log import log +from ietf.utils.text import unidecode_name import debug # pyflakes:ignore @@ -365,7 +365,7 @@ def make_nomineeposition_for_newperson(nomcom, candidate_name, candidate_email, # This is expected to fail if called with an existing email address email = Email.objects.create(address=candidate_email) person = Person.objects.create(name=candidate_name, - ascii=unidecode(candidate_name), + ascii=unidecode_name(candidate_name), address=candidate_email) email.person = person email.save() diff --git a/ietf/person/factories.py b/ietf/person/factories.py index 014c741b9..20be62f14 100644 --- a/ietf/person/factories.py +++ b/ietf/person/factories.py @@ -13,6 +13,8 @@ from django.utils.text import slugify import debug # pyflakes:ignore from ietf.person.models import Person, 
Alias, Email +from ietf.utils.text import unidecode_name + fake = faker.Factory.create() @@ -39,7 +41,7 @@ class PersonFactory(factory.DjangoModelFactory): user = factory.SubFactory(UserFactory) name = factory.LazyAttribute(lambda p: u'%s %s'%(p.user.first_name,p.user.last_name)) - ascii = factory.LazyAttribute(lambda p: unicode(unidecode(p.name).strip())) + ascii = factory.LazyAttribute(lambda p: unicode(unidecode_name(p.name))) class Params: with_bio = factory.Trait(biography = u"\n\n".join(fake.paragraphs())) diff --git a/ietf/person/models.py b/ietf/person/models.py index d4e3c6924..8988e458a 100644 --- a/ietf/person/models.py +++ b/ietf/person/models.py @@ -4,7 +4,6 @@ import datetime import email.utils import email.header from hashids import Hashids -from unidecode import unidecode from urlparse import urljoin from django.conf import settings @@ -21,6 +20,7 @@ from ietf.person.name import name_parts, initials, plain_name from ietf.utils.mail import send_mail_preformatted from ietf.utils.storage import NoLocationMigrationFileSystemStorage from ietf.utils.mail import formataddr +from ietf.utils.text import unidecode_name class PersonInfo(models.Model): @@ -61,18 +61,18 @@ class PersonInfo(models.Model): # we're validating the content of the ascii field, and have # verified that the field is ascii clean in the database: if not all(ord(c) < 128 for c in self.ascii): - self._cached_ascii_name = unidecode(self.ascii).strip() + self._cached_ascii_name = unidecode_name(self.ascii) else: self._cached_ascii_name = self.ascii else: - self._cached_ascii_name = unidecode(self.plain_name()).strip() + self._cached_ascii_name = unidecode_name(self.plain_name()) return self._cached_ascii_name def plain_ascii(self): if not hasattr(self, '_cached_plain_ascii'): if self.ascii: - ascii = unidecode(self.ascii).strip() + ascii = unidecode_name(self.ascii) else: - ascii = unidecode(self.name).strip() + ascii = unidecode_name(self.name) prefix, first, middle, last, suffix = 
name_parts(ascii) self._cached_plain_ascii = u" ".join([first, last]) return self._cached_plain_ascii diff --git a/ietf/review/import_from_review_tool.py b/ietf/review/import_from_review_tool.py index 6f8f931aa..f91aa50ce 100755 --- a/ietf/review/import_from_review_tool.py +++ b/ietf/review/import_from_review_tool.py @@ -1,6 +1,7 @@ #!/usr/bin/env python import sys, os +import argparse # boilerplate basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) @@ -24,8 +25,7 @@ from ietf.person.models import Person, Email, Alias from ietf.doc.models import Document, DocAlias, ReviewRequestDocEvent, NewRevisionDocEvent, DocTypeName, State from ietf.utils.text import strip_prefix, xslugify from ietf.review.utils import possibly_advance_next_reviewer_for_team -import argparse -from unidecode import unidecode +from ietf.utils.text import unidecode_name parser = argparse.ArgumentParser() parser.add_argument("database", help="database must be included in settings") @@ -92,7 +92,7 @@ with db_con.cursor() as c: if not email: person = Person.objects.filter(alias__name=row.name).first() if not person: - person, created = Person.objects.get_or_create(name=row.name, ascii=unidecode(row.name)) + person, created = Person.objects.get_or_create(name=row.name, ascii=unidecode_name(row.name)) if created: print "created person", unicode(person).encode("utf-8") existing_aliases = set(Alias.objects.filter(person=person).values_list("name", flat=True)) diff --git a/ietf/stats/utils.py b/ietf/stats/utils.py index d30171eff..d48e8d2d0 100644 --- a/ietf/stats/utils.py +++ b/ietf/stats/utils.py @@ -3,12 +3,12 @@ import requests from collections import defaultdict from django.conf import settings +from django.contrib.auth.models import User from ietf.stats.models import AffiliationAlias, AffiliationIgnoredEnding, CountryAlias, MeetingRegistration from ietf.name.models import CountryName from ietf.person.models import Person, Email, Alias -from django.contrib.auth.models import 
User -from unidecode import unidecode +from ietf.utils.text import unidecode_name def compile_affiliation_ending_stripping_regexp(): @@ -269,7 +269,7 @@ def get_meeting_registration_data(meeting): last_name = last_name.capitalize() regname = "%s %s" % (first_name, last_name) # if there are any unicode characters decode the string to ascii - ascii_name = unidecode(regname).strip() + ascii_name = unidecode_name(regname) # Create a new user object if it does not exist already # if the user already exists do not try to create a new one diff --git a/ietf/submit/utils.py b/ietf/submit/utils.py index 66c9fec57..4f7b013de 100644 --- a/ietf/submit/utils.py +++ b/ietf/submit/utils.py @@ -4,7 +4,6 @@ import os import datetime import six # pyflakes:ignore import xml2rfc -from unidecode import unidecode from django.conf import settings from django.core.validators import validate_email, ValidationError @@ -31,6 +30,7 @@ from ietf.utils import log from ietf.utils.accesstoken import generate_random_key from ietf.utils.draft import Draft from ietf.utils.mail import is_valid_email +from ietf.utils.text import unidecode_name def validate_submission(submission): @@ -407,7 +407,7 @@ def ensure_person_email_info_exists(name, email): person = Person() person.name = name log.assertion('isinstance(person.name, six.text_type)') - person.ascii = unidecode(person.name).decode('ascii') + person.ascii = unidecode_name(person.name).decode('ascii') person.save() # make sure we have an email address diff --git a/ietf/utils/test_data.py b/ietf/utils/test_data.py index 5342da8a4..0b34188fe 100644 --- a/ietf/utils/test_data.py +++ b/ietf/utils/test_data.py @@ -4,7 +4,6 @@ from __future__ import unicode_literals import datetime -from unidecode import unidecode from django.conf import settings from django.contrib.auth.models import User @@ -21,6 +20,8 @@ from ietf.name.models import StreamName, DocRelationshipName, RoomResourceName from ietf.person.models import Person, Email from ietf.group.utils 
def unidecode_name(name):
    """
    Return an ascii transliteration of a name with normalized spacing.

    unidecode() of cjk ideograms can produce strings which contain leading,
    trailing and repeated spaces.  Strip leading and trailing whitespace, and
    reduce every run of two or more spaces to a single space, so that all
    callers generate ascii names the same way.
    """
    ascii_name = unidecode.unidecode(name).strip()
    # A single str.replace() pass only shortens each run of spaces (three
    # spaces would still leave two); a regex collapses runs of any length.
    return re.sub(r' {2,}', ' ', ascii_name)