From 63785940339726dc335f42e64430e954c5ada788 Mon Sep 17 00:00:00 2001 From: Ole Laursen Date: Mon, 16 Jan 2017 11:36:38 +0000 Subject: [PATCH] Add word count and submit format statistics - Legacy-Id: 12656 --- .../doc/migrations/0020_auto_20170112_0753.py | 24 +++ ietf/doc/models.py | 1 + ietf/settings.py | 2 + ietf/stats/backfill_data.py | 58 +++++++ ietf/stats/tests.py | 2 +- ietf/stats/urls.py | 2 +- ietf/stats/views.py | 148 +++++++++++++++--- ietf/templates/stats/document_stats.html | 6 +- .../stats/document_stats_format.html | 60 +++++++ .../templates/stats/document_stats_pages.html | 4 +- .../templates/stats/document_stats_words.html | 58 +++++++ 11 files changed, 337 insertions(+), 28 deletions(-) create mode 100644 ietf/doc/migrations/0020_auto_20170112_0753.py create mode 100644 ietf/stats/backfill_data.py create mode 100644 ietf/templates/stats/document_stats_format.html create mode 100644 ietf/templates/stats/document_stats_words.html diff --git a/ietf/doc/migrations/0020_auto_20170112_0753.py b/ietf/doc/migrations/0020_auto_20170112_0753.py new file mode 100644 index 000000000..da22265f7 --- /dev/null +++ b/ietf/doc/migrations/0020_auto_20170112_0753.py @@ -0,0 +1,24 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('doc', '0019_auto_20161207_1036'), + ] + + operations = [ + migrations.AddField( + model_name='dochistory', + name='words', + field=models.IntegerField(null=True, blank=True), + ), + migrations.AddField( + model_name='document', + name='words', + field=models.IntegerField(null=True, blank=True), + ), + ] diff --git a/ietf/doc/models.py b/ietf/doc/models.py index 6b850e98a..6ef127dc3 100644 --- a/ietf/doc/models.py +++ b/ietf/doc/models.py @@ -75,6 +75,7 @@ class DocumentInfo(models.Model): abstract = models.TextField(blank=True) rev = models.CharField(verbose_name="revision", max_length=16, blank=True) pages = models.IntegerField(blank=True, null=True) + words = models.IntegerField(blank=True, null=True) order = models.IntegerField(default=1, blank=True) # This is probably obviated by SessionPresentaion.order intended_std_level = models.ForeignKey(IntendedStdLevelName, verbose_name="Intended standardization level", blank=True, null=True) std_level = models.ForeignKey(StdLevelName, verbose_name="Standardization level", blank=True, null=True) diff --git a/ietf/settings.py b/ietf/settings.py index 321029f0f..38ab48e83 100644 --- a/ietf/settings.py +++ b/ietf/settings.py @@ -468,6 +468,8 @@ INTERNET_DRAFT_ARCHIVE_DIR = '/a/www/www6s/draft-archive' INTERNET_ALL_DRAFTS_ARCHIVE_DIR = '/a/www/www6s/archive/id' MEETING_RECORDINGS_DIR = '/a/www/audio' +DOCUMENT_FORMAT_BLACKLIST = ["tar", "dtd", "p7s"] + # Mailing list info URL for lists hosted on the IETF servers MAILING_LIST_INFO_URL = "https://www.ietf.org/mailman/listinfo/%(list_addr)s" MAILING_LIST_ARCHIVE_URL = "https://mailarchive.ietf.org" diff --git a/ietf/stats/backfill_data.py b/ietf/stats/backfill_data.py new file mode 100644 index 000000000..f088de081 --- /dev/null +++ b/ietf/stats/backfill_data.py @@ -0,0 +1,58 @@ +import sys, os, argparse + +basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +sys.path = [ basedir ] + sys.path +os.environ["DJANGO_SETTINGS_MODULE"] = "ietf.settings" + +virtualenv_activation = os.path.join(basedir, "env", "bin", "activate_this.py") +if os.path.exists(virtualenv_activation): + execfile(virtualenv_activation, 
dict(__file__=virtualenv_activation))
+
+import django
+django.setup()
+
+from django.conf import settings
+
+from ietf.doc.models import Document
+from ietf.utils.draft import Draft
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--document", help="specific document name")
+parser.add_argument("--words", action="store_true", help="fill in word count")
+args = parser.parse_args()
+
+
+docs_qs = Document.objects.filter(type="draft")
+
+if args.document:
+    docs_qs = docs_qs.filter(docalias__name=args.document)
+
+for doc in docs_qs.prefetch_related("docalias_set"):
+    canonical_name = doc.name
+    for n in doc.docalias_set.all():
+        if n.name.startswith("rfc"):
+            canonical_name = n.name
+
+    if canonical_name.startswith("rfc"):
+        path = os.path.join(settings.RFC_PATH, canonical_name + ".txt")
+    else:
+        path = os.path.join(settings.INTERNET_ALL_DRAFTS_ARCHIVE_DIR, canonical_name + "-" + doc.rev + ".txt")
+
+    if not os.path.exists(path):
+        print "skipping", doc.name, "no txt file found at", path
+        continue
+
+    with open(path, 'r') as f:
+        d = Draft(f.read(), path)
+
+    updates = {}
+
+    if args.words:
+        words = d.get_wordcount()
+        if words != doc.words:
+            updates["words"] = words
+
+    if updates:
+        Document.objects.filter(pk=doc.pk).update(**updates)
+        print "updated", canonical_name
+
diff --git a/ietf/stats/tests.py b/ietf/stats/tests.py
index 33c76409f..54017208e 100644
--- a/ietf/stats/tests.py
+++ b/ietf/stats/tests.py
@@ -31,7 +31,7 @@ class StatisticsTests(TestCase):
         self.assertTrue(authors_all_url in r["Location"])
 
         # check various stats types
-        for stats_type in ["authors", "pages"]:
+        for stats_type in ["authors", "pages", "words", "format"]:
             for document_type in ["all", "rfc", "draft"]:
                 url = urlreverse(ietf.stats.views.document_stats, kwargs={ "stats_type": stats_type, "document_type": document_type })
                 r = self.client.get(url)
diff --git a/ietf/stats/urls.py b/ietf/stats/urls.py
index 82fcd5743..6616d8fdd 100644
--- a/ietf/stats/urls.py
+++ b/ietf/stats/urls.py
@@ -5,6 +5,6 @@ import ietf.stats.views
 
 urlpatterns = patterns('',
     url("^$", ietf.stats.views.stats_index),
-    url("^document/(?:(?P<stats_type>authors|pages|format|spectech)/)?(?:(?P<document_type>all|rfc|draft)/)?$", ietf.stats.views.document_stats),
+    url("^document/(?:(?P<stats_type>authors|pages|words|format|formlang)/)?(?:(?P<document_type>all|rfc|draft)/)?$", ietf.stats.views.document_stats),
     url("^review/(?:(?P<stats_type>completion|results|states|time)/)?(?:%(acronym)s/)?$" % settings.URL_REGEXPS, ietf.stats.views.review_stats),
 )
diff --git a/ietf/stats/views.py b/ietf/stats/views.py
index 5e1211daa..6601d0266 100644
--- a/ietf/stats/views.py
+++ b/ietf/stats/views.py
@@ -1,4 +1,9 @@
-import datetime, itertools, json, calendar
+import datetime
+import itertools
+import json
+import calendar
+import os
+import re
 from collections import defaultdict
 
 from django.shortcuts import render
@@ -7,6 +12,7 @@ from django.core.urlresolvers import reverse as urlreverse
 from django.http import HttpResponseRedirect, HttpResponseForbidden
 from django.db.models import Count
 from django.utils.safestring import mark_safe
+from django.conf import settings
 
 import dateutil.relativedelta
 
@@ -15,10 +21,11 @@ from ietf.review.utils import (extract_review_request_data,
                                ReviewRequestData,
                                compute_review_request_stats,
                                sum_raw_review_request_aggregations)
+from ietf.submit.models import Submission
 from ietf.group.models import Role, Group
 from ietf.person.models import Person
 from ietf.name.models import ReviewRequestStateName, ReviewResultName
-from ietf.doc.models import Document
+from ietf.doc.models
import DocAlias from ietf.ietfauth.utils import has_role def stats_index(request): @@ -48,7 +55,6 @@ def generate_query_string(query_dict, overrides): return query_part - def document_stats(request, stats_type=None, document_type=None): def build_document_stats_url(stats_type_override=Ellipsis, document_type_override=Ellipsis, get_overrides={}): kwargs = { @@ -60,10 +66,11 @@ def document_stats(request, stats_type=None, document_type=None): # statistics type - one of the tables or the chart possible_stats_types = [ - ("authors", "Number of authors"), + ("authors", "Authors"), ("pages", "Pages"), -# ("format", "Format"), -# ("spectech", "Specification techniques"), + ("words", "Words"), + ("format", "Format"), + ("formlang", "Formal languages"), ] possible_stats_types = [ (slug, label, build_document_stats_url(stats_type_override=slug)) @@ -85,13 +92,34 @@ def document_stats(request, stats_type=None, document_type=None): return HttpResponseRedirect(build_document_stats_url(document_type_override=possible_document_types[0][0])) + def put_into_bin(value, bin_size): + if value is None: + return (value, value) + + v = (value // bin_size) * bin_size + return (v, "{} - {}".format(v, v + bin_size - 1)) + + def generate_canonical_names(docalias_qs): + for doc_id, ts in itertools.groupby(docalias_qs.order_by("document"), lambda t: t[0]): + chosen = None + for t in ts: + if chosen is None: + chosen = t + else: + if t[0].startswith("rfc"): + chosen = t + elif t[0].startswith("draft") and not chosen[0].startswith("rfc"): + chosen = t + + yield chosen + # filter documents - doc_qs = Document.objects.filter(type="draft") + docalias_qs = DocAlias.objects.filter(document__type="draft") if document_type == "rfc": - doc_qs = doc_qs.filter(states__type="draft", states__slug="rfc") + docalias_qs = docalias_qs.filter(document__states__type="draft", document__states__slug="rfc") elif document_type == "draft": - doc_qs = doc_qs.exclude(states__type="draft", states__slug="rfc") + docalias_qs = docalias_qs.exclude(document__states__type="draft", document__states__slug="rfc") chart_data = [] table_data = [] @@ -104,19 +132,20 @@ def document_stats(request, stats_type=None, document_type=None): doc_label = "draft" stats_title = "" + bin_size = 1 if stats_type == "authors": stats_title = "Number of authors for each {}".format(doc_label) - groups = defaultdict(list) + bins = defaultdict(list) - for name, author_count in doc_qs.values_list("name").annotate(Count("authors")).iterator(): - groups[author_count].append(name) + for name, author_count in generate_canonical_names(docalias_qs.values_list("name").annotate(Count("document__authors"))): + bins[author_count].append(name) - total_docs = sum(len(names) for author_count, names in groups.iteritems()) + total_docs = sum(len(names) for author_count, names in bins.iteritems()) series_data = [] - for author_count, names in sorted(groups.iteritems(), key=lambda t: t[0]): + for author_count, names in sorted(bins.iteritems(), key=lambda t: t[0]): percentage = len(names) * 100.0 / total_docs series_data.append((author_count, percentage)) table_data.append((author_count, percentage, names)) @@ -129,15 +158,15 @@ def document_stats(request, stats_type=None, document_type=None): elif stats_type == "pages": stats_title = "Number of pages for each {}".format(doc_label) - groups = defaultdict(list) + bins = defaultdict(list) - for name, pages in doc_qs.values_list("name", "pages"): - groups[pages].append(name) + for name, pages in 
generate_canonical_names(docalias_qs.values_list("name", "document__pages")): + bins[pages].append(name) - total_docs = sum(len(names) for pages, names in groups.iteritems()) + total_docs = sum(len(names) for pages, names in bins.iteritems()) series_data = [] - for pages, names in sorted(groups.iteritems(), key=lambda t: t[0]): + for pages, names in sorted(bins.iteritems(), key=lambda t: t[0]): percentage = len(names) * 100.0 / total_docs if pages is not None: series_data.append((pages, len(names))) @@ -148,7 +177,86 @@ def document_stats(request, stats_type=None, document_type=None): "animation": False, }) + elif stats_type == "words": + stats_title = "Number of words for each {}".format(doc_label) + bin_size = 500 + + bins = defaultdict(list) + + for name, words in generate_canonical_names(docalias_qs.values_list("name", "document__words")): + bins[put_into_bin(words, bin_size)].append(name) + + total_docs = sum(len(names) for words, names in bins.iteritems()) + + series_data = [] + for (value, words), names in sorted(bins.iteritems(), key=lambda t: t[0][0]): + percentage = len(names) * 100.0 / total_docs + if words is not None: + series_data.append((value, len(names))) + + table_data.append((words, percentage, names)) + + chart_data.append({ + "data": series_data, + "animation": False, + }) + + elif stats_type == "format": + stats_title = "Formats for each {}".format(doc_label) + + bins = defaultdict(list) + + # on new documents, we should have a Submission row with the file types + submission_types = {} + + for doc_name, file_types in Submission.objects.values_list("draft", "file_types").order_by("submission_date", "id"): + submission_types[doc_name] = file_types + + doc_names_with_missing_types = {} + for canonical_name, rev, doc_name in generate_canonical_names(docalias_qs.values_list("name", "document__rev", "document__name")): + types = submission_types.get(doc_name) + if types: + for dot_ext in types.split(","): + bins[dot_ext.lstrip(".").upper()].append(canonical_name) + + else: + + if canonical_name.startswith("rfc"): + filename = canonical_name + else: + filename = canonical_name + "-" + rev + + doc_names_with_missing_types[filename] = canonical_name + + # look up the remaining documents on disk + for filename in itertools.chain(os.listdir(settings.INTERNET_ALL_DRAFTS_ARCHIVE_DIR), os.listdir(settings.RFC_PATH)): + t = filename.split(".", 1) + if len(t) != 2: + continue + + basename, ext = t + if any(ext.lower().endswith(blacklisted_ext.lower()) for blacklisted_ext in settings.DOCUMENT_FORMAT_BLACKLIST): + continue + + canonical_name = doc_names_with_missing_types.get(basename) + + if canonical_name: + bins[ext.upper()].append(canonical_name) + + total_docs = sum(len(names) for fmt, names in bins.iteritems()) + + series_data = [] + for fmt, names in sorted(bins.iteritems(), key=lambda t: t[0]): + percentage = len(names) * 100.0 / total_docs + series_data.append((fmt, len(names))) + + table_data.append((fmt, percentage, names)) + + chart_data.append({ + "data": series_data, + "animation": False, + }) return render(request, "stats/document_stats.html", { "chart_data": mark_safe(json.dumps(chart_data)), @@ -159,6 +267,8 @@ def document_stats(request, stats_type=None, document_type=None): "possible_document_types": possible_document_types, "document_type": document_type, "doc_label": doc_label, + "bin_size": bin_size, + "content_template": "stats/document_stats_{}.html".format(stats_type), }) @login_required diff --git a/ietf/templates/stats/document_stats.html 
b/ietf/templates/stats/document_stats.html
index 576a62ed5..532329203 100644
--- a/ietf/templates/stats/document_stats.html
+++ b/ietf/templates/stats/document_stats.html
@@ -35,11 +35,7 @@
 
-    {% if stats_type == "authors" %}
-      {% include "stats/document_stats_authors.html" %}
-    {% elif stats_type == "pages" %}
-      {% include "stats/document_stats_pages.html" %}
-    {% endif %}
+    {% include content_template %}
 
 {% endblock %}
 
 {% block js %}
diff --git a/ietf/templates/stats/document_stats_format.html b/ietf/templates/stats/document_stats_format.html
new file mode 100644
index 000000000..7e701343f
--- /dev/null
+++ b/ietf/templates/stats/document_stats_format.html
@@ -0,0 +1,60 @@

+<h3>{{ stats_title }}</h3>
+
+<div id="chart"></div>
+
+<h3>Data</h3>
+
+<table class="table">
+  <thead>
+    <tr>
+      <th>Format</th>
+      <th>Percentage of {{ doc_label }}s</th>
+      <th>{{ doc_label|capfirst }}s</th>
+    </tr>
+  </thead>
+  <tbody>
+    {% for pages, percentage, names in table_data %}
+    <tr>
+      <td>{{ pages }}</td>
+      <td>{{ percentage|floatformat:2 }}%</td>
+      <td>{% include "stats/includes/docnames_cell.html" %}</td>
+    </tr>
+    {% endfor %}
+  </tbody>
+</table>
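
As an editorial illustration (not part of the patch): the new "format" branch in ietf/stats/views.py resolves a document's formats from the comma-separated Submission.file_types field when a submission record exists, and otherwise falls back to scanning the draft archive and RFC directories for files whose base name matches, skipping blacklisted extensions. The sketch below shows that shape outside Django; the directory paths and helper names are invented for the example.

import os
from collections import defaultdict

# Stand-ins for settings.DOCUMENT_FORMAT_BLACKLIST and the archive
# directories; the real values live in ietf/settings.py.
FORMAT_BLACKLIST = ["tar", "dtd", "p7s"]
ARCHIVE_DIRS = ["/a/www/www6s/archive/id", "/a/www/ietf-ftp/rfc"]

def formats_from_submission(file_types):
    # Submission.file_types is a comma-separated list such as ".txt,.xml".
    return [ext.lstrip(".").upper() for ext in file_types.split(",")]

def formats_from_disk(wanted):
    # wanted maps an expected basename ("rfc1234" or "draft-foo-bar-02")
    # to the canonical document name it belongs to.
    found = defaultdict(set)
    for dirname in ARCHIVE_DIRS:
        for filename in os.listdir(dirname):
            basename, dot, ext = filename.partition(".")
            if not dot:
                continue
            if any(ext.lower().endswith(b) for b in FORMAT_BLACKLIST):
                continue
            if basename in wanted:
                found[wanted[basename]].add(ext.upper())
    return found

print(formats_from_submission(".txt,.xml"))   # ['TXT', 'XML']
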
diff --git a/ietf/templates/stats/document_stats_pages.html b/ietf/templates/stats/document_stats_pages.html
index c9a725d0b..f4c930e46 100644
--- a/ietf/templates/stats/document_stats_pages.html
+++ b/ietf/templates/stats/document_stats_pages.html
@@ -41,9 +41,9 @@
   <table class="table">
     <thead>
       <tr>
-        <th>Authors</th>
+        <th>Pages</th>
         <th>Percentage of {{ doc_label }}s</th>
-        <th>{{ doc_label }}s</th>
+        <th>{{ doc_label|capfirst }}s</th>
       </tr>
     </thead>
     <tbody>
diff --git a/ietf/templates/stats/document_stats_words.html b/ietf/templates/stats/document_stats_words.html
new file mode 100644
index 000000000..d5983f1d6
--- /dev/null
+++ b/ietf/templates/stats/document_stats_words.html
@@ -0,0 +1,58 @@
+<h3>{{ stats_title }}</h3>
+
+<div id="chart"></div>
+
+<h3>Data</h3>
+
+<table class="table">
+  <thead>
+    <tr>
+      <th>Words</th>
+      <th>Percentage of {{ doc_label }}s</th>
+      <th>{{ doc_label|capfirst }}s</th>
+    </tr>
+  </thead>
+  <tbody>
+    {% for pages, percentage, names in table_data %}
+    <tr>
+      <td>{{ pages }}</td>
+      <td>{{ percentage|floatformat:2 }}%</td>
+      <td>{% include "stats/includes/docnames_cell.html" %}</td>
+    </tr>
+    {% endfor %}
+  </tbody>
+</table>
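
For readers skimming the view changes above: the "words" statistic floors each word count to a 500-word bin via put_into_bin, groups document names per bin, and reports each bin as a percentage of all documents. The following is a small self-contained sketch of that flow, outside Django, with invented document names and counts.

from collections import defaultdict

def put_into_bin(value, bin_size):
    # Same behaviour as the helper added to ietf/stats/views.py: None stays
    # in its own bucket, other values map to (lower_bound, "lower - upper").
    if value is None:
        return (value, value)
    v = (value // bin_size) * bin_size
    return (v, "{} - {}".format(v, v + bin_size - 1))

# Invented (document name, word count) pairs, for illustration only.
docs = [
    ("draft-example-foo-03", 4700),
    ("draft-example-bar-01", 4400),
    ("draft-example-baz-00", 12100),
    ("draft-example-qux-00", None),  # no text file on disk, so no word count
]

bin_size = 500
bins = defaultdict(list)
for name, words in docs:
    bins[put_into_bin(words, bin_size)].append(name)

total_docs = sum(len(names) for names in bins.values())

for (lower, label), names in sorted(bins.items(), key=lambda t: (t[0][0] is None, t[0][0])):
    percentage = len(names) * 100.0 / total_docs
    print("{}: {:.2f}% ({})".format(label, percentage, ", ".join(names)))

The words column itself is populated by the backfill script added in this patch; since the script bootstraps Django on its own, it should be runnable directly along the lines of "python ietf/stats/backfill_data.py --words", optionally narrowed with "--document" plus a draft name, provided the draft and RFC text files are present at the configured paths.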