Replaced the use of unaccent.asciify() with unidecode.unidecode(), which provides similar functionality. Changed the draft parser to work exclusively with unicode text, which both simplifies the removal of unaccent and takes us closer to Py35 compatibility. Adjusted callers of the draft parser to pass in unicode.
- Legacy-Id: 13673
This commit is contained in:
parent: ad57b107f2
commit: b42f1cbeb5
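The core substitution is essentially a drop-in one at the call sites changed here: both functions transliterate accented text to plain ASCII. A minimal standalone sketch (the sample name comes from the test text in the removed unaccent module below; unidecode is the PyPI package of the same name):

    # Old call, removed in this commit:
    #   from ietf.utils import unaccent
    #   ascii_name = unaccent.asciify(name)
    #
    # New call:
    from unidecode import unidecode

    name = u"Gustaf Fr\xf6ding"     # u"Gustaf Fröding"
    ascii_name = unidecode(name)    # -> "Gustaf Froding"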
@@ -498,9 +498,10 @@ def rebuild_reference_relations(doc,filename=None):
         filename=os.path.join(settings.INTERNET_DRAFT_PATH,doc.filename_with_rev())
 
     try:
-        refs = draft.Draft(draft._gettext(filename), filename).get_refs()
+        with open(filename, 'rb') as file:
+            refs = draft.Draft(file.read().decode('utf8'), filename).get_refs()
     except IOError as e:
         return { 'errors': ["%s :%s" % (e.strerror, filename)] }
 
     doc.relateddocument_set.filter(relationship__slug__in=['refnorm','refinfo','refold','refunk']).delete()
 
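The hunk above shows the caller pattern repeated throughout this commit: open the draft as bytes, decode explicitly, and hand unicode text to Draft(), which now asserts it was given text rather than bytes. A standalone sketch, assuming a UTF-8 file on disk and that the parser module is importable as shown elsewhere in this tree (the path below is purely illustrative):

    import io

    from ietf.utils import draft

    path = "/tmp/draft-example-00.txt"      # illustrative path, not a real draft
    with io.open(path, 'rb') as f:
        text = f.read().decode('utf8')      # decode before constructing Draft
    d = draft.Draft(text, path)
    refs = d.get_refs()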
@@ -20,7 +20,7 @@ from ietf.dbtemplate.models import DBTemplate
 from ietf.person.models import Email, Person
 from ietf.mailtrigger.utils import gather_address_lists
 from ietf.utils.pipe import pipe
-from ietf.utils import unaccent
+from unidecode import unidecode
 from ietf.utils.mail import send_mail_text, send_mail
 from ietf.utils.log import log
@@ -365,7 +365,7 @@ def make_nomineeposition_for_newperson(nomcom, candidate_name, candidate_email,
     # This is expected to fail if called with an existing email address
     email = Email.objects.create(address=candidate_email)
     person = Person.objects.create(name=candidate_name,
-                                   ascii=unaccent.asciify(candidate_name),
+                                   ascii=unidecode(candidate_name),
                                    address=candidate_email)
     email.person = person
     email.save()
@@ -108,7 +108,7 @@ def process_files(request,draft):
         file_type_list.append(extension)
         if extension == '.txt':
             txt_size = file.size
-            wrapper = Draft(file.read(),file.name)
+            wrapper = Draft(file.read().decode('utf8'),file.name)
         handle_uploaded_file(file)
 
     # create Submission record, leaved unsaved
@@ -6,6 +6,7 @@ import sys
 import os
 import os.path
 import argparse
+import six
 import time
 
 basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
@@ -59,13 +60,14 @@ def unicode(text):
     for encoding in ['ascii', 'utf8', 'latin1', ]:
         try:
             utext = text.decode(encoding)
-            if encoding == 'latin1':
-                say("Warning: falling back to latin1 decoding for %s" % utext)
+            # if encoding == 'latin1':
+            #     say("Warning: falling back to latin1 decoding for %s ..." % utext[:216]])
             return utext
         except UnicodeDecodeError:
             pass
 
 start = time.time()
 say("Running query for documents to process ...")
 for doc in docs_qs.prefetch_related("docalias_set", "formal_languages", "documentauthor_set", "documentauthor_set__person", "documentauthor_set__person__alias_set"):
     canonical_name = doc.name
     for n in doc.docalias_set.all():
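The fallback helper above tries ASCII first, then UTF-8, then Latin-1; since Latin-1 assigns a character to every byte value, the loop is guaranteed to return text for legacy files. A small standalone illustration of that ordering (sample bytes only, not the script's data):

    samples = [
        b"plain ascii",
        u"Gustaf Fr\xf6ding".encode("utf8"),     # valid UTF-8
        u"Gustaf Fr\xf6ding".encode("latin1"),   # not valid UTF-8, decodes as Latin-1
    ]
    for raw in samples:
        for encoding in ["ascii", "utf8", "latin1"]:
            try:
                print("%s: %r" % (encoding, raw.decode(encoding)))
                break
            except UnicodeDecodeError:
                pass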
|
@@ -81,10 +83,10 @@ for doc in docs_qs.prefetch_related("docalias_set", "formal_languages", "documentauthor_set", "documentauthor_set__person", "documentauthor_set__person__alias_set"):
         say("Skipping %s, no txt file found at %s" % (doc.name, path))
         continue
 
-    with open(path, 'r') as f:
+    with open(path, 'rb') as f:
         say("\nProcessing %s" % doc.name)
         sys.stdout.flush()
-        d = Draft(f.read(), path)
+        d = Draft(unicode(f.read()), path)
 
         updated = False
 
@@ -126,7 +128,11 @@ for doc in docs_qs.prefetch_related("docalias_set", "formal_languages", "documentauthor_set", "documentauthor_set__person", "documentauthor_set__person__alias_set"):
             # it's an extra author - skip those extra authors
             seen = set()
             for full, _, _, _, _, email, country, company in d.get_author_list():
-                full, email, country, company = [ unicode(s) for s in [full, email, country, company, ] ]
+                assert full is None or isinstance(full, six.text_type)
+                assert email is None or isinstance(email, six.text_type)
+                assert country is None or isinstance(country, six.text_type)
+                assert company is None or isinstance(company, six.text_type)
+                #full, email, country, company = [ unicode(s) for s in [full, email, country, company, ] ]
                 if email in seen:
                     continue
                 seen.add(email)
@@ -203,7 +203,7 @@ class SubmissionUploadForm(forms.Form):
             # try to parse it
             txt_file = self.cleaned_data['txt']
             txt_file.seek(0)
-            self.parsed_draft = Draft(txt_file.read(), txt_file.name)
+            self.parsed_draft = Draft(txt_file.read().decode('utf8'), txt_file.name)
             self.filename = self.parsed_draft.filename
             self.revision = self.parsed_draft.revision
             self.title = self.parsed_draft.get_title()
@@ -1,4 +1,6 @@
+# -*- coding: utf-8 -*-
+
 
 import datetime
 import os
 import shutil
@@ -302,11 +304,11 @@ class SubmitTests(TestCase):
         draft.save_with_history([DocEvent.objects.create(doc=draft, rev=draft.rev, type="added_comment", by=Person.objects.get(user__username="secretary"), desc="Test")])
         if not change_authors:
             draft.documentauthor_set.all().delete()
-            author_person, author_email = ensure_person_email_info_exists('Author Name','author@example.com')
+            author_person, author_email = ensure_person_email_info_exists(u'Author Name',u'author@example.com')
             draft.documentauthor_set.create(person=author_person, email=author_email)
         else:
             # Make it such that one of the previous authors has an invalid email address
-            bogus_person, bogus_email = ensure_person_email_info_exists('Bogus Person',None)
+            bogus_person, bogus_email = ensure_person_email_info_exists(u'Bogus Person',None)
             DocumentAuthor.objects.create(document=draft, person=bogus_person, email=bogus_email, order=draft.documentauthor_set.latest('order').order+1)
 
         prev_author = draft.documentauthor_set.all()[0]
@@ -2,7 +2,7 @@
 
 import os
 import datetime
-import six
+import six # pyflakes:ignore
 from unidecode import unidecode
 
 from django.conf import settings
@@ -24,7 +24,6 @@ from ietf.community.utils import update_name_contains_indexes_with_new_doc
 from ietf.submit.mail import announce_to_lists, announce_new_version, announce_to_authors
 from ietf.submit.models import Submission, SubmissionEvent, Preapproval, DraftSubmissionStateName
 from ietf.utils import log
-from ietf.utils import unaccent
 from ietf.utils.mail import is_valid_email
 
 
@@ -401,10 +400,8 @@ def ensure_person_email_info_exists(name, email):
     if not person:
         person = Person()
         person.name = name
-        if isinstance(person.name, six.text_type):
-            person.ascii = unidecode(person.name).decode('ascii')
-        else:
-            person.ascii = unaccent.asciify(person.name).decode('ascii')
+        log.assertion('isinstance(person.name, six.text_type)')
+        person.ascii = unidecode(person.name).decode('ascii')
         person.save()
 
     # make sure we have an email address
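With the branch that guarded against byte-string names gone, Person.name is expected to be text and the ASCII fallback always comes from unidecode. A hedged sketch of that derivation (ascii_fallback is a hypothetical helper, not datatracker code; the isinstance check mirrors the .decode('ascii') above, which treats unidecode's result as a byte string on Python 2):

    from unidecode import unidecode

    def ascii_fallback(name):
        # Hypothetical helper mirroring the hunk above.
        ascii_name = unidecode(name)
        if isinstance(ascii_name, bytes):   # keeps the sketch working on Python 2 and 3
            ascii_name = ascii_name.decode('ascii')
        return ascii_name

    # ascii_fallback(u"Gustaf Fr\xf6ding") == u"Gustaf Froding"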
@@ -71,7 +71,7 @@ def upload_submission(request):
             # be retrieved from the generated text file. Provide a
             # parsed draft object to get at that kind of information.
             with open(file_name['txt']) as txt_file:
-                form.parsed_draft = Draft(txt_file.read(), txt_file.name)
+                form.parsed_draft = Draft(txt_file.read().decode('utf8'), txt_file.name)
 
         else:
             file_size = form.cleaned_data['txt'].size
@@ -1,6 +1,6 @@
 #!/usr/bin/python
 # -*- python -*-
-
+from __future__ import unicode_literals
 """
 NAME
   %(program)s - Extract meta-information from an IETF draft.
@@ -37,9 +37,11 @@ import os
 import os.path
 import re
 import stat
+import six
 import sys
 import time
 
 
 version = "0.35"
 program = os.path.basename(sys.argv[0])
 progdir = os.path.dirname(sys.argv[0])
@@ -124,6 +126,7 @@ def acronym_match(s, l):
 class Draft():
 
     def __init__(self, text, source):
+        assert isinstance(text, six.text_type)
         self.source = source
         self.rawtext = text
 
@@ -1168,8 +1171,8 @@ def getmeta(fn):
         return
 
     timestamp = time.strftime("%Y-%m-%dT%H:%M:%S+00:00", time.gmtime(os.stat(filename)[stat.ST_MTIME]))
-    text = _gettext(filename)
-    draft = Draft(text, filename)
+    with open(filename, 'rb') as file:
+        draft = Draft(file.read().decode('utf8'), filename)
     #_debug("\n".join(draft.lines))
 
     fields["eventdate"] = timestamp
@@ -1,146 +0,0 @@
-# -*- coding: utf-8 -*-
-# use a dynamically populated translation dictionary to remove accents
-# from a string
-# (by Chris Mulligan, http://chmullig.com/2009/12/python-unicode-ascii-ifier/)
-
-import unicodedata, sys
-from ietf.utils.log import unreachable
-
-class unaccented_map(dict):
-    # Translation dictionary. Translation entries are added to this dictionary as needed.
-    CHAR_REPLACEMENT = {
-        0xc6: u"AE", # Æ LATIN CAPITAL LETTER AE
-        0xd0: u"D",  # Ð LATIN CAPITAL LETTER ETH
-        0xd8: u"OE", # Ø LATIN CAPITAL LETTER O WITH STROKE
-        0xde: u"Th", # Þ LATIN CAPITAL LETTER THORN
-        0xc4: u'Ae', # Ä LATIN CAPITAL LETTER A WITH DIAERESIS
-        0xd6: u'Oe', # Ö LATIN CAPITAL LETTER O WITH DIAERESIS
-        0xdc: u'Ue', # Ü LATIN CAPITAL LETTER U WITH DIAERESIS
-
-        0xc0: u"A", # À LATIN CAPITAL LETTER A WITH GRAVE
-        0xc1: u"A", # Á LATIN CAPITAL LETTER A WITH ACUTE
-        0xc3: u"A", # Ã LATIN CAPITAL LETTER A WITH TILDE
-        0xc7: u"C", # Ç LATIN CAPITAL LETTER C WITH CEDILLA
-        0xc8: u"E", # È LATIN CAPITAL LETTER E WITH GRAVE
-        0xc9: u"E", # É LATIN CAPITAL LETTER E WITH ACUTE
-        0xca: u"E", # Ê LATIN CAPITAL LETTER E WITH CIRCUMFLEX
-        0xcc: u"I", # Ì LATIN CAPITAL LETTER I WITH GRAVE
-        0xcd: u"I", # Í LATIN CAPITAL LETTER I WITH ACUTE
-        0xd2: u"O", # Ò LATIN CAPITAL LETTER O WITH GRAVE
-        0xd3: u"O", # Ó LATIN CAPITAL LETTER O WITH ACUTE
-        0xd5: u"O", # Õ LATIN CAPITAL LETTER O WITH TILDE
-        0xd9: u"U", # Ù LATIN CAPITAL LETTER U WITH GRAVE
-        0xda: u"U", # Ú LATIN CAPITAL LETTER U WITH ACUTE
-
-        0xdf: u"ss", # ß LATIN SMALL LETTER SHARP S
-        0xe6: u"ae", # æ LATIN SMALL LETTER AE
-        0xf0: u"d",  # ð LATIN SMALL LETTER ETH
-        0xf8: u"oe", # ø LATIN SMALL LETTER O WITH STROKE
-        0xfe: u"th", # þ LATIN SMALL LETTER THORN,
-        0xe4: u'ae', # ä LATIN SMALL LETTER A WITH DIAERESIS
-        0xf6: u'oe', # ö LATIN SMALL LETTER O WITH DIAERESIS
-        0xfc: u'ue', # ü LATIN SMALL LETTER U WITH DIAERESIS
-
-        0xe0: u"a", # à LATIN SMALL LETTER A WITH GRAVE
-        0xe1: u"a", # á LATIN SMALL LETTER A WITH ACUTE
-        0xe3: u"a", # ã LATIN SMALL LETTER A WITH TILDE
-        0xe7: u"c", # ç LATIN SMALL LETTER C WITH CEDILLA
-        0xe8: u"e", # è LATIN SMALL LETTER E WITH GRAVE
-        0xe9: u"e", # é LATIN SMALL LETTER E WITH ACUTE
-        0xea: u"e", # ê LATIN SMALL LETTER E WITH CIRCUMFLEX
-        0xec: u"i", # ì LATIN SMALL LETTER I WITH GRAVE
-        0xed: u"i", # í LATIN SMALL LETTER I WITH ACUTE
-        0xf2: u"o", # ò LATIN SMALL LETTER O WITH GRAVE
-        0xf3: u"o", # ó LATIN SMALL LETTER O WITH ACUTE
-        0xf5: u"o", # õ LATIN SMALL LETTER O WITH TILDE
-        0xf9: u"u", # ù LATIN SMALL LETTER U WITH GRAVE
-        0xfa: u"u", # ú LATIN SMALL LETTER U WITH ACUTE
-
-        0x2018: u"'", # ‘ LEFT SINGLE QUOTATION MARK
-        0x2019: u"'", # ’ RIGHT SINGLE QUOTATION MARK
-        0x201c: u'"', # “ LEFT DOUBLE QUOTATION MARK
-        0x201d: u'"', # ” RIGHT DOUBLE QUOTATION MARK
-
-    }
-
-    # Maps a unicode character code (the key) to a replacement code
-    # (either a character code or a unicode string).
-    def mapchar(self, key):
-        ch = self.get(key)
-        if ch is not None:
-            return ch
-        try:
-            de = unicodedata.decomposition(unichr(key))
-            p1, p2 = [int(x, 16) for x in de.split(None, 1)]
-            if p2 == 0x308:
-                ch = self.CHAR_REPLACEMENT.get(key)
-            else:
-                ch = int(p1)
-
-        except (IndexError, ValueError):
-            ch = self.CHAR_REPLACEMENT.get(key, key)
-        self[key] = ch
-        return ch
-
-    if sys.version <= "2.5":
-        # use __missing__ where available
-        __missing__ = mapchar
-    else:
-        # otherwise, use standard __getitem__ hook (this is slower,
-        # since it's called for each character)
-        __getitem__ = mapchar
-
-map = unaccented_map()
-
-def asciify(input):
-    unreachable("18 Jun 2017")
-    try:
-        return input.encode('ascii')
-    except AttributeError:
-        return str(input).encode('ascii')
-    except UnicodeEncodeError:
-        return unicodedata.normalize('NFKD', input.translate(map)).encode('ascii', 'replace')
-
-text = u"""
-
-##Norwegian
-"Jo, når'n da ha gått ett støck te, så kommer'n te e å,
-å i åa æ e ø."
-"Vasa", sa'n.
-"Å i åa æ e ø", sa ja.
-"Men va i all ti æ dæ ni sæjer, a, o?", sa'n.
-"D'æ e å, vett ja", skrek ja, før ja ble rasen, "å i åa
-æ e ø, hører han lite, d'æ e å, å i åa æ e ø."
-"A, o, ø", sa'n å dæmmæ geck'en.
-Jo, den va nôe te dum den.
-
-(taken from the short story "Dumt fôlk" in Gustaf Fröding's
-"Räggler å paschaser på våra mål tå en bonne" (1895).
-
-##Danish
-
-Nu bliver Mølleren sikkert sur, og dog, han er stadig den største på verdensplan.
-
-Userneeds A/S er en dansk virksomhed, der udfører statistiske undersøgelser på internettet. Den blev etableret i 2001 som et anpartsselskab af David Jensen og Henrik Vincentz.
-Frem til 2004 var det primære fokus på at forbedre hjemmesiderne for andre virksomheder. Herefter blev fokus omlagt, så man også beskæftigede sig med statistiske målinger. Ledelsen vurderede, at dette marked ville vokse betragteligt i de kommende år, hvilket man ønskede at udnytte.
-Siden omlægningen er der blevet fokuseret på at etablere meget store forbrugerpaneler. Således udgjorde det danske panel i 2005 65.000 personer og omfatter per 2008 100.000 personer.
-I 2007 blev Userneeds ApS konverteret til aktieselskabet Userneeds A/S
-Efterhånden er aktiviteterne blevet udvidet til de nordiske lande (med undtagelse af Island) og besidder i 2009 et forbrugerpanel med i alt mere end 250.000 personer bosat i de fire store nordiske lande.
-Selskabet tegnes udadtil af en direktion på tre personer, der foruden Henrik Vincentz tæller Palle Viby Morgen og Simon Andersen.
-De primære konkurrenter er andre analysebureauer som AC Nielsen, Analysedanmark, Gallup, Norstat, Synnovate og Zapera.
-
-##Finnish
-Titus Aurelius Fulvus Boionius Arrius Antoninus eli Antoninus Pius (19. syyskuuta 86 – 7. maaliskuuta 161) oli Rooman keisari vuosina 138–161. Antoninus sai lisänimensä Pius (suom. velvollisuudentuntoinen) noustuaan valtaan vuonna 138. Hän kuului Nerva–Antoninusten hallitsijasukuun ja oli suosittu ja kunnioitettu keisari, joka tunnettiin lempeydestään ja oikeamielisyydestään. Hänen valtakauttaan on usein sanottu Rooman valtakunnan kultakaudeksi, jolloin talous kukoisti, poliittinen tilanne oli vakaa ja armeija vahva. Hän hallitsi pitempään kuin yksikään Rooman keisari Augustuksen jälkeen, ja hänen kautensa tunnetaan erityisen rauhallisena, joskaan ei sodattomana. Antoninus adoptoi Marcus Aureliuksen ja Lucius Veruksen vallanperijöikseen. Hän kuoli vuonna 161.
-
-#German
-So heißt ein altes Märchen: "Der Ehre Dornenpfad", und es handelt von einem Schützen mit Namen Bryde, der wohl zu großen Ehren und Würden kam, aber nicht ohne lange und vielfältige Widerwärtigkeiten und Fährnisse des Lebens durchzumachen. Manch einer von uns hat es gewiß als Kind gehört oder es vielleicht später gelesen und dabei an seinen eigenen stillen Dornenweg und die vielen Widerwärtigkeiten gedacht. Märchen und Wirklichkeit liegen einander so nahe, aber das Märchen hat seine harmonische Lösung hier auf Erden, während die Wirklichkeit sie meist aus dem Erdenleben hinaus in Zeit und Ewigkeit verlegt.
-
-12\xbd inch
-"""
-
-if __name__ == "__main__":
-    for i, line in enumerate(text.splitlines()):
-        line = line.strip()
-        print line
-        if line and not line.startswith('#'):
-            print '\tTrans: ', asciify(line).strip()
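The removed module combined the hand-maintained CHAR_REPLACEMENT table above with NFKD decomposition; unidecode handles the same classes of input, including the quotation-mark entries at the end of the table. A small standalone comparison against a few of the old table's entries (the unidecode output is printed rather than asserted, since the two transliterations are similar but not guaranteed to be identical):

    from unidecode import unidecode

    old_table_samples = [
        (u"\xd8",   u"OE"),   # Ø
        (u"\xdf",   u"ss"),   # ß
        (u"\u2018", u"'"),    # left single quotation mark
        (u"\u201c", u'"'),    # left double quotation mark
    ]
    for ch, old in old_table_samples:
        print("U+%04X: old table %r, unidecode %r" % (ord(ch), old, unidecode(ch)))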