Made the stats backfill script slightly more verbose, including some blank lines for readability. Changed output to go both to screen and to file. Made the unicode/latin1/ascii decoding more general.

- Legacy-Id: 13535
This commit is contained in:
Henrik Levkowetz 2017-06-05 22:37:14 +00:00
parent 20b4250ea5
commit 8b72b6a29a

63
ietf/stats/backfill_data.py Normal file → Executable file
View file

@ -1,6 +1,12 @@
#!/usr/bin/env python
import sys, os, argparse
from __future__ import print_function, unicode_literals
import sys
import os
import os.path
import argparse
import time
basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
sys.path = [ basedir ] + sys.path
@ -15,6 +21,8 @@ django.setup()
from django.conf import settings
import debug # pyflakes:ignore
from ietf.doc.models import Document
from ietf.name.models import FormalLanguageName
from ietf.utils.draft import Draft
@ -33,6 +41,31 @@ docs_qs = Document.objects.filter(type="draft")
if args.document:
docs_qs = docs_qs.filter(docalias__name=args.document)
ts = time.strftime("%Y-%m-%d_%H:%M%z")
logfile = open('backfill-authorstats-%s.log'%ts, 'w')
print("Writing log to %s" % os.path.abspath(logfile.name))
def say(msg):
msg = msg.encode('utf8')
sys.stderr.write(msg)
sys.stderr.write('\n')
logfile.write(msg)
logfile.write('\n')
def unicode(text):
if text is None:
return text
# order matters here:
for encoding in ['ascii', 'utf8', 'latin1', ]:
try:
utext = text.decode(encoding)
if encoding == 'latin1':
say("Warning: falling back to latin1 decoding for %s" % utext)
return utext
except UnicodeDecodeError:
pass
start = time.time()
for doc in docs_qs.prefetch_related("docalias_set", "formal_languages", "documentauthor_set", "documentauthor_set__person", "documentauthor_set__person__alias_set"):
canonical_name = doc.name
for n in doc.docalias_set.all():
@ -45,11 +78,12 @@ for doc in docs_qs.prefetch_related("docalias_set", "formal_languages", "documen
path = os.path.join(settings.INTERNET_ALL_DRAFTS_ARCHIVE_DIR, canonical_name + "-" + doc.rev + ".txt")
if not os.path.exists(path):
print "skipping", doc.name, "no txt file found at", path
say("Skipping %s, no txt file found at %s" % (doc.name, path))
continue
with open(path, 'r') as f:
print "\nProcessing %s" % doc.name
say("\nProcessing %s" % doc.name)
sys.stdout.flush()
d = Draft(f.read(), path)
updated = False
@ -92,6 +126,7 @@ for doc in docs_qs.prefetch_related("docalias_set", "formal_languages", "documen
# it's an extra author - skip those extra authors
seen = set()
for full, _, _, _, _, email, country, company in d.get_author_list():
full, email, country, company = [ unicode(s) for s in [full, email, country, company, ] ]
if email in seen:
continue
seen.add(email)
@ -103,11 +138,11 @@ for doc in docs_qs.prefetch_related("docalias_set", "formal_languages", "documen
old_author = old_authors_by_name.get(full)
if not old_author:
print "UNKNOWN AUTHOR", doc.name, full, email, country, company
say("UNKNOWN AUTHOR: %s, %s, %s, %s, %s" % (doc.name, full, email, country, company))
continue
if old_author.affiliation != company:
print "new affiliation", canonical_name, "[", full, "]", old_author.affiliation, "->", company
say("new affiliation: %s [ %s <%s> ] %s -> %s" % (canonical_name, full, email, old_author.affiliation, company))
old_author.affiliation = company
old_author.save(update_fields=["affiliation"])
updated = True
@ -115,13 +150,8 @@ for doc in docs_qs.prefetch_related("docalias_set", "formal_languages", "documen
if country is None:
country = ""
try:
country = country.decode("utf-8")
except UnicodeDecodeError:
country = country.decode("latin-1")
if old_author.country != country:
print "new country", canonical_name ,"[", full, email, "]", old_author.country.encode("utf-8"), "->", country.encode("utf-8")
say("new country: %s [ %s <%s> ] %s -> %s" % (canonical_name , full, email, old_author.country, country))
old_author.country = country
old_author.save(update_fields=["country"])
updated = True
@ -132,5 +162,14 @@ for doc in docs_qs.prefetch_related("docalias_set", "formal_languages", "documen
updated = True
if updated:
print "updated", canonical_name
say("updated: %s" % canonical_name)
stop = time.time()
dur = stop-start
sec = dur%60
min = dur//60
say("Processing time %d:%02d" % (min, sec))
print("\n\nWrote log to %s" % os.path.abspath(logfile.name))
logfile.close()