Add word count and submit format statistics

- Legacy-Id: 12656
This commit is contained in:
Ole Laursen 2017-01-16 11:36:38 +00:00
parent 34a9f36534
commit 6378594033
11 changed files with 337 additions and 28 deletions

View file

@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add an optional integer ``words`` field to ``dochistory`` and ``document``.

    The field is declared with ``null=True, blank=True`` on both models.
    """

    dependencies = [('doc', '0019_auto_20161207_1036')]

    operations = [
        migrations.AddField(
            model_name='dochistory',
            name='words',
            field=models.IntegerField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name='document',
            name='words',
            field=models.IntegerField(blank=True, null=True),
        ),
    ]

View file

@ -75,6 +75,7 @@ class DocumentInfo(models.Model):
abstract = models.TextField(blank=True)
rev = models.CharField(verbose_name="revision", max_length=16, blank=True)
pages = models.IntegerField(blank=True, null=True)
words = models.IntegerField(blank=True, null=True)
order = models.IntegerField(default=1, blank=True) # This is probably obviated by SessionPresentation.order
intended_std_level = models.ForeignKey(IntendedStdLevelName, verbose_name="Intended standardization level", blank=True, null=True)
std_level = models.ForeignKey(StdLevelName, verbose_name="Standardization level", blank=True, null=True)

View file

@ -468,6 +468,8 @@ INTERNET_DRAFT_ARCHIVE_DIR = '/a/www/www6s/draft-archive'
INTERNET_ALL_DRAFTS_ARCHIVE_DIR = '/a/www/www6s/archive/id'
MEETING_RECORDINGS_DIR = '/a/www/audio'
DOCUMENT_FORMAT_BLACKLIST = ["tar", "dtd", "p7s"]
# Mailing list info URL for lists hosted on the IETF servers
MAILING_LIST_INFO_URL = "https://www.ietf.org/mailman/listinfo/%(list_addr)s"
MAILING_LIST_ARCHIVE_URL = "https://mailarchive.ietf.org"

View file

@ -0,0 +1,58 @@
import sys, os, argparse
basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
sys.path = [ basedir ] + sys.path
os.environ["DJANGO_SETTINGS_MODULE"] = "ietf.settings"
virtualenv_activation = os.path.join(basedir, "env", "bin", "activate_this.py")
if os.path.exists(virtualenv_activation):
execfile(virtualenv_activation, dict(__file__=virtualenv_activation))
import django
django.setup()
from django.conf import settings
from ietf.doc.models import Document
from ietf.utils.draft import Draft
parser = argparse.ArgumentParser()
parser.add_argument("--document", help="specific document name")
parser.add_argument("--words", action="store_true", help="fill in word count")
args = parser.parse_args()
docs_qs = Document.objects.filter(type="draft")
if args.document:
docs_qs = docs_qs.filter(docalias__name=args.document)
for doc in docs_qs.prefetch_related("docalias_set"):
canonical_name = doc.name
for n in doc.docalias_set.all():
if n.name.startswith("rfc"):
canonical_name = n.name
if canonical_name.startswith("rfc"):
path = os.path.join(settings.RFC_PATH, canonical_name + ".txt")
else:
path = os.path.join(settings.INTERNET_ALL_DRAFTS_ARCHIVE_DIR, canonical_name + "-" + doc.rev + ".txt")
if not os.path.exists(path):
print "skipping", doc.name, "no txt file found at", path
continue
with open(path, 'r') as f:
d = Draft(f.read(), path)
updates = {}
if args.words:
words = d.get_wordcount()
if words != doc.words:
updates["words"] = words
if updates:
Document.objects.filter(pk=doc.pk).update(**updates)
print "updated", canonical_name

View file

@ -31,7 +31,7 @@ class StatisticsTests(TestCase):
self.assertTrue(authors_all_url in r["Location"])
# check various stats types
for stats_type in ["authors", "pages"]:
for stats_type in ["authors", "pages", "words", "format"]:
for document_type in ["all", "rfc", "draft"]:
url = urlreverse(ietf.stats.views.document_stats, kwargs={ "stats_type": stats_type, "document_type": document_type })
r = self.client.get(url)

View file

@ -5,6 +5,6 @@ import ietf.stats.views
urlpatterns = patterns('',
url("^$", ietf.stats.views.stats_index),
url("^document/(?:(?P<stats_type>authors|pages|format|spectech)/)?(?:(?P<document_type>all|rfc|draft)/)?$", ietf.stats.views.document_stats),
url("^document/(?:(?P<stats_type>authors|pages|words|format|formlang)/)?(?:(?P<document_type>all|rfc|draft)/)?$", ietf.stats.views.document_stats),
url("^review/(?:(?P<stats_type>completion|results|states|time)/)?(?:%(acronym)s/)?$" % settings.URL_REGEXPS, ietf.stats.views.review_stats),
)

View file

@ -1,4 +1,9 @@
import datetime, itertools, json, calendar
import datetime
import itertools
import json
import calendar
import os
import re
from collections import defaultdict
from django.shortcuts import render
@ -7,6 +12,7 @@ from django.core.urlresolvers import reverse as urlreverse
from django.http import HttpResponseRedirect, HttpResponseForbidden
from django.db.models import Count
from django.utils.safestring import mark_safe
from django.conf import settings
import dateutil.relativedelta
@ -15,10 +21,11 @@ from ietf.review.utils import (extract_review_request_data,
ReviewRequestData,
compute_review_request_stats,
sum_raw_review_request_aggregations)
from ietf.submit.models import Submission
from ietf.group.models import Role, Group
from ietf.person.models import Person
from ietf.name.models import ReviewRequestStateName, ReviewResultName
from ietf.doc.models import Document
from ietf.doc.models import DocAlias
from ietf.ietfauth.utils import has_role
def stats_index(request):
@ -48,7 +55,6 @@ def generate_query_string(query_dict, overrides):
return query_part
def document_stats(request, stats_type=None, document_type=None):
def build_document_stats_url(stats_type_override=Ellipsis, document_type_override=Ellipsis, get_overrides={}):
kwargs = {
@ -60,10 +66,11 @@ def document_stats(request, stats_type=None, document_type=None):
# statistics type - one of the tables or the chart
possible_stats_types = [
("authors", "Number of authors"),
("authors", "Authors"),
("pages", "Pages"),
# ("format", "Format"),
# ("spectech", "Specification techniques"),
("words", "Words"),
("format", "Format"),
("formlang", "Formal languages"),
]
possible_stats_types = [ (slug, label, build_document_stats_url(stats_type_override=slug))
@ -85,13 +92,34 @@ def document_stats(request, stats_type=None, document_type=None):
return HttpResponseRedirect(build_document_stats_url(document_type_override=possible_document_types[0][0]))
def put_into_bin(value, bin_size):
    """Map ``value`` to the bin of width ``bin_size`` that contains it.

    Returns a ``(lower_bound, label)`` tuple, where ``lower_bound`` is
    ``value`` rounded down to a multiple of ``bin_size`` and ``label`` is the
    inclusive range formatted as ``"<lower> - <upper>"``.

    A ``value`` of ``None`` is passed through as ``(None, None)``.
    """
    if value is not None:
        lower = (value // bin_size) * bin_size
        upper = lower + bin_size - 1
        return (lower, "{} - {}".format(lower, upper))

    return (value, value)
def generate_canonical_names(docalias_qs):
    """Yield one preferred row per group of consecutive rows sharing ``row[0]``.

    ``docalias_qs`` must offer ``order_by("document")`` and produce
    tuple-like rows whose first element is a name string. Within a group the
    first row is the starting choice; a later row replaces it when its name
    starts with "rfc", or when it starts with "draft" and the current choice
    is not an "rfc" name. The chosen row is yielded unchanged.

    NOTE(review): rows are ordered by "document" but grouped on ``row[0]``,
    which the callers visible in this file fill with the alias name. If alias
    names are unique, every alias ends up in its own group and is yielded
    separately, so a document with several aliases would be reported more
    than once. Grouping on the document would need the document id in the
    rows -- confirm against the callers before changing.
    """
    ordered_rows = docalias_qs.order_by("document")

    for _, rows in itertools.groupby(ordered_rows, lambda row: row[0]):
        best = None
        for row in rows:
            name = row[0]
            if best is None or name.startswith("rfc"):
                best = row
            elif name.startswith("draft") and not best[0].startswith("rfc"):
                best = row

        yield best
# filter documents
doc_qs = Document.objects.filter(type="draft")
docalias_qs = DocAlias.objects.filter(document__type="draft")
if document_type == "rfc":
doc_qs = doc_qs.filter(states__type="draft", states__slug="rfc")
docalias_qs = docalias_qs.filter(document__states__type="draft", document__states__slug="rfc")
elif document_type == "draft":
doc_qs = doc_qs.exclude(states__type="draft", states__slug="rfc")
docalias_qs = docalias_qs.exclude(document__states__type="draft", document__states__slug="rfc")
chart_data = []
table_data = []
@ -104,19 +132,20 @@ def document_stats(request, stats_type=None, document_type=None):
doc_label = "draft"
stats_title = ""
bin_size = 1
if stats_type == "authors":
stats_title = "Number of authors for each {}".format(doc_label)
groups = defaultdict(list)
bins = defaultdict(list)
for name, author_count in doc_qs.values_list("name").annotate(Count("authors")).iterator():
groups[author_count].append(name)
for name, author_count in generate_canonical_names(docalias_qs.values_list("name").annotate(Count("document__authors"))):
bins[author_count].append(name)
total_docs = sum(len(names) for author_count, names in groups.iteritems())
total_docs = sum(len(names) for author_count, names in bins.iteritems())
series_data = []
for author_count, names in sorted(groups.iteritems(), key=lambda t: t[0]):
for author_count, names in sorted(bins.iteritems(), key=lambda t: t[0]):
percentage = len(names) * 100.0 / total_docs
series_data.append((author_count, percentage))
table_data.append((author_count, percentage, names))
@ -129,15 +158,15 @@ def document_stats(request, stats_type=None, document_type=None):
elif stats_type == "pages":
stats_title = "Number of pages for each {}".format(doc_label)
groups = defaultdict(list)
bins = defaultdict(list)
for name, pages in doc_qs.values_list("name", "pages"):
groups[pages].append(name)
for name, pages in generate_canonical_names(docalias_qs.values_list("name", "document__pages")):
bins[pages].append(name)
total_docs = sum(len(names) for pages, names in groups.iteritems())
total_docs = sum(len(names) for pages, names in bins.iteritems())
series_data = []
for pages, names in sorted(groups.iteritems(), key=lambda t: t[0]):
for pages, names in sorted(bins.iteritems(), key=lambda t: t[0]):
percentage = len(names) * 100.0 / total_docs
if pages is not None:
series_data.append((pages, len(names)))
@ -148,7 +177,86 @@ def document_stats(request, stats_type=None, document_type=None):
"animation": False,
})
elif stats_type == "words":
stats_title = "Number of words for each {}".format(doc_label)
bin_size = 500
bins = defaultdict(list)
for name, words in generate_canonical_names(docalias_qs.values_list("name", "document__words")):
bins[put_into_bin(words, bin_size)].append(name)
total_docs = sum(len(names) for words, names in bins.iteritems())
series_data = []
for (value, words), names in sorted(bins.iteritems(), key=lambda t: t[0][0]):
percentage = len(names) * 100.0 / total_docs
if words is not None:
series_data.append((value, len(names)))
table_data.append((words, percentage, names))
chart_data.append({
"data": series_data,
"animation": False,
})
elif stats_type == "format":
stats_title = "Formats for each {}".format(doc_label)
bins = defaultdict(list)
# on new documents, we should have a Submission row with the file types
submission_types = {}
for doc_name, file_types in Submission.objects.values_list("draft", "file_types").order_by("submission_date", "id"):
submission_types[doc_name] = file_types
doc_names_with_missing_types = {}
for canonical_name, rev, doc_name in generate_canonical_names(docalias_qs.values_list("name", "document__rev", "document__name")):
types = submission_types.get(doc_name)
if types:
for dot_ext in types.split(","):
bins[dot_ext.lstrip(".").upper()].append(canonical_name)
else:
if canonical_name.startswith("rfc"):
filename = canonical_name
else:
filename = canonical_name + "-" + rev
doc_names_with_missing_types[filename] = canonical_name
# look up the remaining documents on disk
for filename in itertools.chain(os.listdir(settings.INTERNET_ALL_DRAFTS_ARCHIVE_DIR), os.listdir(settings.RFC_PATH)):
t = filename.split(".", 1)
if len(t) != 2:
continue
basename, ext = t
if any(ext.lower().endswith(blacklisted_ext.lower()) for blacklisted_ext in settings.DOCUMENT_FORMAT_BLACKLIST):
continue
canonical_name = doc_names_with_missing_types.get(basename)
if canonical_name:
bins[ext.upper()].append(canonical_name)
total_docs = sum(len(names) for fmt, names in bins.iteritems())
series_data = []
for fmt, names in sorted(bins.iteritems(), key=lambda t: t[0]):
percentage = len(names) * 100.0 / total_docs
series_data.append((fmt, len(names)))
table_data.append((fmt, percentage, names))
chart_data.append({
"data": series_data,
"animation": False,
})
return render(request, "stats/document_stats.html", {
"chart_data": mark_safe(json.dumps(chart_data)),
@ -159,6 +267,8 @@ def document_stats(request, stats_type=None, document_type=None):
"possible_document_types": possible_document_types,
"document_type": document_type,
"doc_label": doc_label,
"bin_size": bin_size,
"content_template": "stats/document_stats_{}.html".format(stats_type),
})
@login_required

View file

@ -35,11 +35,7 @@
</div>
</div>
{% if stats_type == "authors" %}
{% include "stats/document_stats_authors.html" %}
{% elif stats_type == "pages" %}
{% include "stats/document_stats_pages.html" %}
{% endif %}
{% include content_template %}
{% endblock %}
{% block js %}

View file

@ -0,0 +1,60 @@
{# Per-format statistics: a column chart configuration (chartConf) plus a data table. #}
{# Reads stats_title, doc_label, chart_data and table_data from the template context. #}
<h3>{{ stats_title }}</h3>

<div id="chart"></div>

<script>
  var chartConf = {
      chart: {
          type: 'column'
      },
      title: {
          text: '{{ stats_title|escapejs }}'
      },
      xAxis: {
          type: "category",
          title: {
              text: 'Format'
          }
      },
      yAxis: {
          title: {
              text: 'Number of {{ doc_label }}s'
          }
      },
      tooltip: {
          {# header is the category key of the hovered column, followed by one line per point #}
          formatter: function () {
              var s = '<b>' + this.points[0].key + '</b>';

              $.each(this.points, function () {
                  s += '<br/>' + chartConf.yAxis.title.text + ': ' + this.y;
              });

              return s;
          },
          shared: true
      },
      series: {{ chart_data }}
  };
</script>

<h3>Data</h3>

<table class="table table-condensed stats-data">
  <thead>
    <tr>
      <th>Format</th>
      <th>Percentage of {{ doc_label }}s</th>
      <th>{{ doc_label|capfirst }}s</th>
    </tr>
  </thead>
  <tbody>
    {# the first loop variable is named "pages" but holds the format name here #}
    {% for pages, percentage, names in table_data %}
      <tr>
        <td>{{ pages }}</td>
        <td>{{ percentage|floatformat:2 }}%</td>
        <td>{% include "stats/includes/docnames_cell.html" %}</td>
      </tr>
    {% endfor %}
  </tbody>
</table>

View file

@ -41,9 +41,9 @@
<table class="table table-condensed stats-data">
<thead>
<tr>
<th>Authors</th>
<th>Pages</th>
<th>Percentage of {{ doc_label }}s</th>
<th>{{ doc_label }}s</th>
<th>{{ doc_label|capfirst }}s</th>
</tr>
</thead>
<tbody>

View file

@ -0,0 +1,58 @@
{# Word count statistics: a line chart configuration (chartConf) plus a data table. #}
{# Reads stats_title, doc_label, bin_size, chart_data and table_data from the template context. #}
<h3>{{ stats_title }}</h3>
<div id="chart"></div>
<script>
var chartConf = {
chart: {
type: 'line'
},
title: {
text: '{{ stats_title|escapejs }}'
},
xAxis: {
title: {
text: 'Number of words'
}
},
yAxis: {
title: {
text: 'Number of {{ doc_label }}s'
}
},
tooltip: {
{# header shows the range "x - (x + bin_size - 1)", then one line per point with its y value #}
formatter: function () {
var s = '<b>' + this.x + ' - ' + (this.x + {{ bin_size }} - 1) + ' ' + (this.x == 1 ? "word" : 'words') + '</b>';
$.each(this.points, function () {
s += '<br/>' + chartConf.yAxis.title.text + ': ' + this.y;
});
return s;
},
shared: true
},
series: {{ chart_data }}
};
</script>
<h3>Data</h3>
<table class="table table-condensed stats-data">
<thead>
<tr>
<th>Words</th>
<th>Percentage of {{ doc_label }}s</th>
<th>{{ doc_label|capfirst }}s</th>
</tr>
</thead>
<tbody>
{# the first loop variable is named "pages" but is what fills the "Words" column here #}
{% for pages, percentage, names in table_data %}
<tr>
<td>{{ pages }}</td>
<td>{{ percentage|floatformat:2 }}%</td>
<td>{% include "stats/includes/docnames_cell.html" %}</td>
</tr>
{% endfor %}
</tbody>
</table>