datatracker/ietf/stats/backfill_data.py
Ole Laursen 5b8ac15c0e Improve debug output
- Legacy-Id: 13143
2017-03-27 08:35:34 +00:00

134 lines
4.9 KiB
Python

import sys, os, argparse
basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
sys.path = [ basedir ] + sys.path
os.environ["DJANGO_SETTINGS_MODULE"] = "ietf.settings"
virtualenv_activation = os.path.join(basedir, "env", "bin", "activate_this.py")
if os.path.exists(virtualenv_activation):
execfile(virtualenv_activation, dict(__file__=virtualenv_activation))
import django
django.setup()
from django.conf import settings
from ietf.doc.models import Document
from ietf.name.models import FormalLanguageName
from ietf.utils.draft import Draft
parser = argparse.ArgumentParser()
parser.add_argument("--document", help="specific document name")
parser.add_argument("--words", action="store_true", help="fill in word count")
parser.add_argument("--formlang", action="store_true", help="fill in formal languages")
parser.add_argument("--authors", action="store_true", help="fill in author info")
args = parser.parse_args()
formal_language_dict = { l.pk: l for l in FormalLanguageName.objects.all() }
docs_qs = Document.objects.filter(type="draft")
if args.document:
docs_qs = docs_qs.filter(docalias__name=args.document)
for doc in docs_qs.prefetch_related("docalias_set", "formal_languages", "documentauthor_set", "documentauthor_set__person", "documentauthor_set__person__alias_set"):
canonical_name = doc.name
for n in doc.docalias_set.all():
if n.name.startswith("rfc"):
canonical_name = n.name
if canonical_name.startswith("rfc"):
path = os.path.join(settings.RFC_PATH, canonical_name + ".txt")
else:
path = os.path.join(settings.INTERNET_ALL_DRAFTS_ARCHIVE_DIR, canonical_name + "-" + doc.rev + ".txt")
if not os.path.exists(path):
print "skipping", doc.name, "no txt file found at", path
continue
with open(path, 'r') as f:
d = Draft(f.read(), path)
updated = False
updates = {}
if args.words:
words = d.get_wordcount()
if words != doc.words:
updates["words"] = words
if args.formlang:
langs = d.get_formal_languages()
new_formal_languages = set(formal_language_dict[l] for l in langs)
old_formal_languages = set(doc.formal_languages.all())
if new_formal_languages != old_formal_languages:
for l in new_formal_languages - old_formal_languages:
doc.formal_languages.add(l)
updated = True
for l in old_formal_languages - new_formal_languages:
doc.formal_languages.remove(l)
updated = True
if args.authors:
old_authors = doc.documentauthor_set.all()
old_authors_by_name = {}
old_authors_by_email = {}
for author in old_authors:
for alias in author.person.alias_set.all():
old_authors_by_name[alias.name] = author
old_authors_by_name[author.person.plain_name()] = author
if author.email_id:
old_authors_by_email[author.email_id] = author
# the draft parser sometimes has a problem when
# affiliation isn't in the second line and it then thinks
# it's an extra author - skip those extra authors
seen = set()
for full, _, _, _, _, email, country, company in d.get_author_list():
if email in seen:
continue
seen.add(email)
old_author = None
if email:
old_author = old_authors_by_email.get(email)
if not old_author:
old_author = old_authors_by_name.get(full)
if not old_author:
print "UNKNOWN AUTHOR", doc.name, full, email, country, company
continue
if old_author.affiliation != company:
print "new affiliation", canonical_name, "[", full, "]", old_author.affiliation, "->", company
old_author.affiliation = company
old_author.save(update_fields=["affiliation"])
updated = True
if country is None:
country = ""
try:
country = country.decode("utf-8")
except UnicodeDecodeError:
country = country.decode("latin-1")
if old_author.country != country:
print "new country", canonical_name ,"[", full, email, "]", old_author.country.encode("utf-8"), "->", country.encode("utf-8")
old_author.country = country
old_author.save(update_fields=["country"])
updated = True
if updates:
Document.objects.filter(pk=doc.pk).update(**updates)
updated = True
if updated:
print "updated", canonical_name