datatracker/ietf/stats/backfill_data.py
Ole Laursen b2ff10b0f2 Add support for extracting the country line from the author addresses
to the draft parser (incorporating patch from trunk), store the
extracted country instead of trying to turn it into an ISO country
code, add country and continent name models and add initial data for
those, add helper function for cleaning the countries, add author
country and continent charts, move the affiliation models to
stats/models.py, fix a bunch of bugs.
 - Legacy-Id: 12846
2017-02-15 18:43:57 +00:00

134 lines
4.9 KiB
Python

# Backfill derived metadata on draft documents by re-parsing their text
# files with the draft parser.
#
# Usage: backfill_data.py [--document NAME] [--words] [--formlang] [--authors]
#
#   --document NAME  restrict the run to the document with this alias name
#   --words          fill in the word count
#   --formlang       fill in the formal languages
#   --authors        fill in author affiliation and country
#
# NOTE: the script is written for Python 2 (``execfile``, byte strings from
# the parser).  The print function is enabled so the output statements
# behave the same on Python 2 and are also valid Python 3 syntax.
from __future__ import print_function

import argparse
import os
import sys

# Make the project root importable and select the settings module before
# anything from Django or ietf is imported.
basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
sys.path = [ basedir ] + sys.path
os.environ["DJANGO_SETTINGS_MODULE"] = "ietf.settings"

# Activate the project's virtualenv if one is present under <basedir>/env.
virtualenv_activation = os.path.join(basedir, "env", "bin", "activate_this.py")
if os.path.exists(virtualenv_activation):
    execfile(virtualenv_activation, dict(__file__=virtualenv_activation))

import django
django.setup()

# These imports must come after django.setup().
from django.conf import settings

from ietf.doc.models import Document
from ietf.name.models import FormalLanguageName
from ietf.utils.draft import Draft

parser = argparse.ArgumentParser()
parser.add_argument("--document", help="specific document name")
parser.add_argument("--words", action="store_true", help="fill in word count")
parser.add_argument("--formlang", action="store_true", help="fill in formal languages")
parser.add_argument("--authors", action="store_true", help="fill in author info")
args = parser.parse_args()

# Lookup table from primary key to FormalLanguageName, used to map the
# values returned by Draft.get_formal_languages() to model instances.
formal_language_dict = { lang.pk: lang for lang in FormalLanguageName.objects.all() }

docs_qs = Document.objects.filter(type="draft")

if args.document:
    docs_qs = docs_qs.filter(docalias__name=args.document)

# Prefetch every relation that is read inside the loop body.
for doc in docs_qs.prefetch_related("docalias_set", "formal_languages", "documentauthor_set", "documentauthor_set__person", "documentauthor_set__person__alias_set"):
    # Prefer an "rfc..." alias over the draft name when one exists.
    canonical_name = doc.name
    for n in doc.docalias_set.all():
        if n.name.startswith("rfc"):
            canonical_name = n.name

    if canonical_name.startswith("rfc"):
        path = os.path.join(settings.RFC_PATH, canonical_name + ".txt")
    else:
        path = os.path.join(settings.INTERNET_ALL_DRAFTS_ARCHIVE_DIR, canonical_name + "-" + doc.rev + ".txt")

    if not os.path.exists(path):
        print("skipping", doc.name, "no txt file found at", path)
        continue

    # The file only needs to be open while its text is read.
    with open(path, 'r') as f:
        d = Draft(f.read(), path)

    # `updated` records that anything was written for this document;
    # `updates` collects Document column changes for one queryset update.
    updated = False
    updates = {}

    if args.words:
        words = d.get_wordcount()
        if words != doc.words:
            updates["words"] = words

    if args.formlang:
        langs = d.get_formal_languages()

        new_formal_languages = set(formal_language_dict[lang] for lang in langs)
        old_formal_languages = set(doc.formal_languages.all())

        if new_formal_languages != old_formal_languages:
            for lang in new_formal_languages - old_formal_languages:
                doc.formal_languages.add(lang)
            for lang in old_formal_languages - new_formal_languages:
                doc.formal_languages.remove(lang)
            # The sets differ, so at least one add/remove happened above.
            updated = True

    if args.authors:
        # Index the stored authors by every alias name, by plain name and
        # by email so parsed authors can be matched to them.
        old_authors_by_name = {}
        old_authors_by_email = {}
        for author in doc.documentauthor_set.all():
            for alias in author.person.alias_set.all():
                old_authors_by_name[alias.name] = author
            old_authors_by_name[author.person.plain_name()] = author

            if author.email_id:
                old_authors_by_email[author.email_id] = author

        # the draft parser sometimes has a problem if affiliation
        # isn't in the second line, then it will report an extra
        # author - skip those
        seen_emails = set()
        for full, _, _, _, _, email, country, company in d.get_author_list():
            # Only de-duplicate on a real address: distinct authors that
            # have no email at all must not be treated as duplicates of
            # each other, they are matched by name below instead.
            if email:
                if email in seen_emails:
                    continue
                seen_emails.add(email)

            # Match on email first, then fall back to the full name.
            old_author = None
            if email:
                old_author = old_authors_by_email.get(email)
            if not old_author:
                old_author = old_authors_by_name.get(full)

            if not old_author:
                print("UNKNOWN AUTHOR", doc.name, full, email, country, company)
                continue

            changed_fields = []

            if old_author.affiliation != company:
                print("new affiliation", canonical_name, "[", full, "]", old_author.affiliation, "->", company)
                old_author.affiliation = company
                changed_fields.append("affiliation")

            if country is None:
                country = ""

            # Decode byte strings only (utf-8 first, latin-1 as fallback);
            # calling .decode() on text that is already unicode would
            # raise for non-ASCII values.
            if isinstance(country, bytes):
                try:
                    country = country.decode("utf-8")
                except UnicodeDecodeError:
                    country = country.decode("latin-1")

            if old_author.country != country:
                print("new country", canonical_name, "[", full, "]", old_author.country.encode("utf-8"), "->", country.encode("utf-8"))
                old_author.country = country
                changed_fields.append("country")

            # Persist all changed columns of this author in a single save.
            if changed_fields:
                old_author.save(update_fields=changed_fields)
                updated = True

    if updates:
        # Written with a queryset update restricted to this document.
        Document.objects.filter(pk=doc.pk).update(**updates)
        updated = True

    if updated:
        print("updated", canonical_name)