Add word count and submit format statistics
- Legacy-Id: 12656
This commit is contained in:
parent
34a9f36534
commit
6378594033
24
ietf/doc/migrations/0020_auto_20170112_0753.py
Normal file
24
ietf/doc/migrations/0020_auto_20170112_0753.py
Normal file
|
@ -0,0 +1,24 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Add a nullable integer ``words`` column to ``dochistory`` and ``document``."""

    # Must be applied after the previous migration of the ``doc`` app.
    dependencies = [
        ('doc', '0019_auto_20161207_1036'),
    ]

    # The same field definition is added to both models; it is nullable so
    # existing rows need no default value.
    operations = [
        migrations.AddField(
            model_name='dochistory',
            name='words',
            field=models.IntegerField(blank=True, null=True),
        ),
        migrations.AddField(
            model_name='document',
            name='words',
            field=models.IntegerField(blank=True, null=True),
        ),
    ]
|
|
@ -75,6 +75,7 @@ class DocumentInfo(models.Model):
|
|||
abstract = models.TextField(blank=True)
|
||||
rev = models.CharField(verbose_name="revision", max_length=16, blank=True)
|
||||
pages = models.IntegerField(blank=True, null=True)
|
||||
words = models.IntegerField(blank=True, null=True)
|
||||
order = models.IntegerField(default=1, blank=True) # This is probably obviated by SessionPresentation.order
|
||||
intended_std_level = models.ForeignKey(IntendedStdLevelName, verbose_name="Intended standardization level", blank=True, null=True)
|
||||
std_level = models.ForeignKey(StdLevelName, verbose_name="Standardization level", blank=True, null=True)
|
||||
|
|
|
@ -468,6 +468,8 @@ INTERNET_DRAFT_ARCHIVE_DIR = '/a/www/www6s/draft-archive'
|
|||
INTERNET_ALL_DRAFTS_ARCHIVE_DIR = '/a/www/www6s/archive/id'
|
||||
MEETING_RECORDINGS_DIR = '/a/www/audio'
|
||||
|
||||
DOCUMENT_FORMAT_BLACKLIST = ["tar", "dtd", "p7s"]
|
||||
|
||||
# Mailing list info URL for lists hosted on the IETF servers
|
||||
MAILING_LIST_INFO_URL = "https://www.ietf.org/mailman/listinfo/%(list_addr)s"
|
||||
MAILING_LIST_ARCHIVE_URL = "https://mailarchive.ietf.org"
|
||||
|
|
58
ietf/stats/backfill_data.py
Normal file
58
ietf/stats/backfill_data.py
Normal file
|
@ -0,0 +1,58 @@
|
|||
import sys, os, argparse
|
||||
|
||||
basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
|
||||
sys.path = [ basedir ] + sys.path
|
||||
os.environ["DJANGO_SETTINGS_MODULE"] = "ietf.settings"
|
||||
|
||||
virtualenv_activation = os.path.join(basedir, "env", "bin", "activate_this.py")
|
||||
if os.path.exists(virtualenv_activation):
|
||||
execfile(virtualenv_activation, dict(__file__=virtualenv_activation))
|
||||
|
||||
import django
|
||||
django.setup()
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from ietf.doc.models import Document
|
||||
from ietf.utils.draft import Draft
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--document", help="specific document name")
|
||||
parser.add_argument("--words", action="store_true", help="fill in word count")
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
docs_qs = Document.objects.filter(type="draft")
|
||||
|
||||
if args.document:
|
||||
docs_qs = docs_qs.filter(docalias__name=args.document)
|
||||
|
||||
for doc in docs_qs.prefetch_related("docalias_set"):
|
||||
canonical_name = doc.name
|
||||
for n in doc.docalias_set.all():
|
||||
if n.name.startswith("rfc"):
|
||||
canonical_name = n.name
|
||||
|
||||
if canonical_name.startswith("rfc"):
|
||||
path = os.path.join(settings.RFC_PATH, canonical_name + ".txt")
|
||||
else:
|
||||
path = os.path.join(settings.INTERNET_ALL_DRAFTS_ARCHIVE_DIR, canonical_name + "-" + doc.rev + ".txt")
|
||||
|
||||
if not os.path.exists(path):
|
||||
print "skipping", doc.name, "no txt file found at", path
|
||||
continue
|
||||
|
||||
with open(path, 'r') as f:
|
||||
d = Draft(f.read(), path)
|
||||
|
||||
updates = {}
|
||||
|
||||
if args.words:
|
||||
words = d.get_wordcount()
|
||||
if words != doc.words:
|
||||
updates["words"] = words
|
||||
|
||||
if updates:
|
||||
Document.objects.filter(pk=doc.pk).update(**updates)
|
||||
print "updated", canonical_name
|
||||
|
|
@ -31,7 +31,7 @@ class StatisticsTests(TestCase):
|
|||
self.assertTrue(authors_all_url in r["Location"])
|
||||
|
||||
# check various stats types
|
||||
for stats_type in ["authors", "pages"]:
|
||||
for stats_type in ["authors", "pages", "words", "format"]:
|
||||
for document_type in ["all", "rfc", "draft"]:
|
||||
url = urlreverse(ietf.stats.views.document_stats, kwargs={ "stats_type": stats_type, "document_type": document_type })
|
||||
r = self.client.get(url)
|
||||
|
|
|
@ -5,6 +5,6 @@ import ietf.stats.views
|
|||
|
||||
urlpatterns = patterns('',
|
||||
url("^$", ietf.stats.views.stats_index),
|
||||
url("^document/(?:(?P<stats_type>authors|pages|format|spectech)/)?(?:(?P<document_type>all|rfc|draft)/)?$", ietf.stats.views.document_stats),
|
||||
url("^document/(?:(?P<stats_type>authors|pages|words|format|formlang)/)?(?:(?P<document_type>all|rfc|draft)/)?$", ietf.stats.views.document_stats),
|
||||
url("^review/(?:(?P<stats_type>completion|results|states|time)/)?(?:%(acronym)s/)?$" % settings.URL_REGEXPS, ietf.stats.views.review_stats),
|
||||
)
|
||||
|
|
|
@ -1,4 +1,9 @@
|
|||
import datetime, itertools, json, calendar
|
||||
import datetime
|
||||
import itertools
|
||||
import json
|
||||
import calendar
|
||||
import os
|
||||
import re
|
||||
from collections import defaultdict
|
||||
|
||||
from django.shortcuts import render
|
||||
|
@ -7,6 +12,7 @@ from django.core.urlresolvers import reverse as urlreverse
|
|||
from django.http import HttpResponseRedirect, HttpResponseForbidden
|
||||
from django.db.models import Count
|
||||
from django.utils.safestring import mark_safe
|
||||
from django.conf import settings
|
||||
|
||||
import dateutil.relativedelta
|
||||
|
||||
|
@ -15,10 +21,11 @@ from ietf.review.utils import (extract_review_request_data,
|
|||
ReviewRequestData,
|
||||
compute_review_request_stats,
|
||||
sum_raw_review_request_aggregations)
|
||||
from ietf.submit.models import Submission
|
||||
from ietf.group.models import Role, Group
|
||||
from ietf.person.models import Person
|
||||
from ietf.name.models import ReviewRequestStateName, ReviewResultName
|
||||
from ietf.doc.models import Document
|
||||
from ietf.doc.models import DocAlias
|
||||
from ietf.ietfauth.utils import has_role
|
||||
|
||||
def stats_index(request):
|
||||
|
@ -48,7 +55,6 @@ def generate_query_string(query_dict, overrides):
|
|||
|
||||
return query_part
|
||||
|
||||
|
||||
def document_stats(request, stats_type=None, document_type=None):
|
||||
def build_document_stats_url(stats_type_override=Ellipsis, document_type_override=Ellipsis, get_overrides={}):
|
||||
kwargs = {
|
||||
|
@ -60,10 +66,11 @@ def document_stats(request, stats_type=None, document_type=None):
|
|||
|
||||
# statistics type - one of the tables or the chart
|
||||
possible_stats_types = [
|
||||
("authors", "Number of authors"),
|
||||
("authors", "Authors"),
|
||||
("pages", "Pages"),
|
||||
# ("format", "Format"),
|
||||
# ("spectech", "Specification techniques"),
|
||||
("words", "Words"),
|
||||
("format", "Format"),
|
||||
("formlang", "Formal languages"),
|
||||
]
|
||||
|
||||
possible_stats_types = [ (slug, label, build_document_stats_url(stats_type_override=slug))
|
||||
|
@ -85,13 +92,34 @@ def document_stats(request, stats_type=None, document_type=None):
|
|||
return HttpResponseRedirect(build_document_stats_url(document_type_override=possible_document_types[0][0]))
|
||||
|
||||
|
||||
def put_into_bin(value, bin_size):
    """Return ``(bin_start, label)`` for the bin of width *bin_size* holding *value*.

    The bin start is *value* rounded down to a multiple of *bin_size*, and the
    label is the inclusive range ``"<start> - <start + bin_size - 1>"``.
    A *value* of ``None`` is passed through as ``(None, None)``.
    """
    if value is not None:
        lower = (value // bin_size) * bin_size
        upper = lower + bin_size - 1
        return (lower, "{} - {}".format(lower, upper))

    return (value, value)
|
||||
|
||||
def generate_canonical_names(docalias_qs):
    """Yield one row for each run of consecutive rows sharing a first element.

    *docalias_qs* is ordered by ``"document"`` and must produce tuples whose
    first element is a name string.  Within a run, the last row whose name
    starts with ``"rfc"`` is chosen; failing that, the last row whose name
    starts with ``"draft"``; failing that, the first row of the run.
    """
    # NOTE(review): runs are keyed on the first tuple element, which the
    # callers in this file fill with the alias name, while the rows are
    # ordered by "document" -- presumably grouping per document was
    # intended; confirm against the callers' values_list() fields.
    ordered_rows = docalias_qs.order_by("document")
    for _key, run in itertools.groupby(ordered_rows, lambda row: row[0]):
        rows = iter(run)
        # groupby never produces an empty run, so there is always a first row
        chosen = next(rows)
        for row in rows:
            name = row[0]
            if name.startswith("rfc") or (name.startswith("draft") and not chosen[0].startswith("rfc")):
                chosen = row

        yield chosen
|
||||
|
||||
# filter documents
|
||||
doc_qs = Document.objects.filter(type="draft")
|
||||
docalias_qs = DocAlias.objects.filter(document__type="draft")
|
||||
|
||||
if document_type == "rfc":
|
||||
doc_qs = doc_qs.filter(states__type="draft", states__slug="rfc")
|
||||
docalias_qs = docalias_qs.filter(document__states__type="draft", document__states__slug="rfc")
|
||||
elif document_type == "draft":
|
||||
doc_qs = doc_qs.exclude(states__type="draft", states__slug="rfc")
|
||||
docalias_qs = docalias_qs.exclude(document__states__type="draft", document__states__slug="rfc")
|
||||
|
||||
chart_data = []
|
||||
table_data = []
|
||||
|
@ -104,19 +132,20 @@ def document_stats(request, stats_type=None, document_type=None):
|
|||
doc_label = "draft"
|
||||
|
||||
stats_title = ""
|
||||
bin_size = 1
|
||||
|
||||
if stats_type == "authors":
|
||||
stats_title = "Number of authors for each {}".format(doc_label)
|
||||
|
||||
groups = defaultdict(list)
|
||||
bins = defaultdict(list)
|
||||
|
||||
for name, author_count in doc_qs.values_list("name").annotate(Count("authors")).iterator():
|
||||
groups[author_count].append(name)
|
||||
for name, author_count in generate_canonical_names(docalias_qs.values_list("name").annotate(Count("document__authors"))):
|
||||
bins[author_count].append(name)
|
||||
|
||||
total_docs = sum(len(names) for author_count, names in groups.iteritems())
|
||||
total_docs = sum(len(names) for author_count, names in bins.iteritems())
|
||||
|
||||
series_data = []
|
||||
for author_count, names in sorted(groups.iteritems(), key=lambda t: t[0]):
|
||||
for author_count, names in sorted(bins.iteritems(), key=lambda t: t[0]):
|
||||
percentage = len(names) * 100.0 / total_docs
|
||||
series_data.append((author_count, percentage))
|
||||
table_data.append((author_count, percentage, names))
|
||||
|
@ -129,15 +158,15 @@ def document_stats(request, stats_type=None, document_type=None):
|
|||
elif stats_type == "pages":
|
||||
stats_title = "Number of pages for each {}".format(doc_label)
|
||||
|
||||
groups = defaultdict(list)
|
||||
bins = defaultdict(list)
|
||||
|
||||
for name, pages in doc_qs.values_list("name", "pages"):
|
||||
groups[pages].append(name)
|
||||
for name, pages in generate_canonical_names(docalias_qs.values_list("name", "document__pages")):
|
||||
bins[pages].append(name)
|
||||
|
||||
total_docs = sum(len(names) for pages, names in groups.iteritems())
|
||||
total_docs = sum(len(names) for pages, names in bins.iteritems())
|
||||
|
||||
series_data = []
|
||||
for pages, names in sorted(groups.iteritems(), key=lambda t: t[0]):
|
||||
for pages, names in sorted(bins.iteritems(), key=lambda t: t[0]):
|
||||
percentage = len(names) * 100.0 / total_docs
|
||||
if pages is not None:
|
||||
series_data.append((pages, len(names)))
|
||||
|
@ -148,7 +177,86 @@ def document_stats(request, stats_type=None, document_type=None):
|
|||
"animation": False,
|
||||
})
|
||||
|
||||
elif stats_type == "words":
|
||||
stats_title = "Number of words for each {}".format(doc_label)
|
||||
|
||||
bin_size = 500
|
||||
|
||||
bins = defaultdict(list)
|
||||
|
||||
for name, words in generate_canonical_names(docalias_qs.values_list("name", "document__words")):
|
||||
bins[put_into_bin(words, bin_size)].append(name)
|
||||
|
||||
total_docs = sum(len(names) for words, names in bins.iteritems())
|
||||
|
||||
series_data = []
|
||||
for (value, words), names in sorted(bins.iteritems(), key=lambda t: t[0][0]):
|
||||
percentage = len(names) * 100.0 / total_docs
|
||||
if words is not None:
|
||||
series_data.append((value, len(names)))
|
||||
|
||||
table_data.append((words, percentage, names))
|
||||
|
||||
chart_data.append({
|
||||
"data": series_data,
|
||||
"animation": False,
|
||||
})
|
||||
|
||||
elif stats_type == "format":
|
||||
stats_title = "Formats for each {}".format(doc_label)
|
||||
|
||||
bins = defaultdict(list)
|
||||
|
||||
# on new documents, we should have a Submission row with the file types
|
||||
submission_types = {}
|
||||
|
||||
for doc_name, file_types in Submission.objects.values_list("draft", "file_types").order_by("submission_date", "id"):
|
||||
submission_types[doc_name] = file_types
|
||||
|
||||
doc_names_with_missing_types = {}
|
||||
for canonical_name, rev, doc_name in generate_canonical_names(docalias_qs.values_list("name", "document__rev", "document__name")):
|
||||
types = submission_types.get(doc_name)
|
||||
if types:
|
||||
for dot_ext in types.split(","):
|
||||
bins[dot_ext.lstrip(".").upper()].append(canonical_name)
|
||||
|
||||
else:
|
||||
|
||||
if canonical_name.startswith("rfc"):
|
||||
filename = canonical_name
|
||||
else:
|
||||
filename = canonical_name + "-" + rev
|
||||
|
||||
doc_names_with_missing_types[filename] = canonical_name
|
||||
|
||||
# look up the remaining documents on disk
|
||||
for filename in itertools.chain(os.listdir(settings.INTERNET_ALL_DRAFTS_ARCHIVE_DIR), os.listdir(settings.RFC_PATH)):
|
||||
t = filename.split(".", 1)
|
||||
if len(t) != 2:
|
||||
continue
|
||||
|
||||
basename, ext = t
|
||||
if any(ext.lower().endswith(blacklisted_ext.lower()) for blacklisted_ext in settings.DOCUMENT_FORMAT_BLACKLIST):
|
||||
continue
|
||||
|
||||
canonical_name = doc_names_with_missing_types.get(basename)
|
||||
|
||||
if canonical_name:
|
||||
bins[ext.upper()].append(canonical_name)
|
||||
|
||||
total_docs = sum(len(names) for fmt, names in bins.iteritems())
|
||||
|
||||
series_data = []
|
||||
for fmt, names in sorted(bins.iteritems(), key=lambda t: t[0]):
|
||||
percentage = len(names) * 100.0 / total_docs
|
||||
series_data.append((fmt, len(names)))
|
||||
|
||||
table_data.append((fmt, percentage, names))
|
||||
|
||||
chart_data.append({
|
||||
"data": series_data,
|
||||
"animation": False,
|
||||
})
|
||||
|
||||
return render(request, "stats/document_stats.html", {
|
||||
"chart_data": mark_safe(json.dumps(chart_data)),
|
||||
|
@ -159,6 +267,8 @@ def document_stats(request, stats_type=None, document_type=None):
|
|||
"possible_document_types": possible_document_types,
|
||||
"document_type": document_type,
|
||||
"doc_label": doc_label,
|
||||
"bin_size": bin_size,
|
||||
"content_template": "stats/document_stats_{}.html".format(stats_type),
|
||||
})
|
||||
|
||||
@login_required
|
||||
|
|
|
@ -35,11 +35,7 @@
|
|||
</div>
|
||||
</div>
|
||||
|
||||
{% if stats_type == "authors" %}
|
||||
{% include "stats/document_stats_authors.html" %}
|
||||
{% elif stats_type == "pages" %}
|
||||
{% include "stats/document_stats_pages.html" %}
|
||||
{% endif %}
|
||||
{% include content_template %}
|
||||
{% endblock %}
|
||||
|
||||
{% block js %}
|
||||
|
|
60
ietf/templates/stats/document_stats_format.html
Normal file
60
ietf/templates/stats/document_stats_format.html
Normal file
|
@ -0,0 +1,60 @@
|
|||
<h3>{{ stats_title }}</h3>
|
||||
|
||||
<div id="chart"></div>
|
||||
|
||||
<script>
 // Chart configuration for the "format" statistics: a column chart with one
 // category per document format.  The series data is the JSON emitted by the
 // view in the chart_data context variable.
 var chartConf = {
     chart: {
         type: 'column'
     },
     title: {
         text: '{{ stats_title|escapejs }}'
     },
     xAxis: {
         type: "category",
         title: {
             text: 'Format'
         }
     },
     yAxis: {
         title: {
             text: 'Number of {{ doc_label }}s'
         }
     },
     tooltip: {
         // Bold category name followed by one "<y axis title>: <count>" line
         // per point.  (A leftover console.log debugging call was removed.)
         formatter: function () {
             var s = '<b>' + this.points[0].key + '</b>';

             $.each(this.points, function () {
                 s += '<br/>' + chartConf.yAxis.title.text + ': ' + this.y;
             });

             return s;
         },
         shared: true
     },
     series: {{ chart_data }}
 };
</script>
|
||||
|
||||
<h3>Data</h3>
|
||||
|
||||
<table class="table table-condensed stats-data">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Format</th>
|
||||
<th>Percentage of {{ doc_label }}s</th>
|
||||
<th>{{ doc_label|capfirst }}s</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for pages, percentage, names in table_data %}
|
||||
<tr>
|
||||
<td>{{ pages }}</td>
|
||||
<td>{{ percentage|floatformat:2 }}%</td>
|
||||
<td>{% include "stats/includes/docnames_cell.html" %}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
|
@ -41,9 +41,9 @@
|
|||
<table class="table table-condensed stats-data">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Authors</th>
|
||||
<th>Pages</th>
|
||||
<th>Percentage of {{ doc_label }}s</th>
|
||||
<th>{{ doc_label }}s</th>
|
||||
<th>{{ doc_label|capfirst }}s</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
|
|
58
ietf/templates/stats/document_stats_words.html
Normal file
58
ietf/templates/stats/document_stats_words.html
Normal file
|
@ -0,0 +1,58 @@
|
|||
<h3>{{ stats_title }}</h3>
|
||||
|
||||
<div id="chart"></div>
|
||||
|
||||
<script>
 // Chart configuration for the "words" statistics: a line chart of the number
 // of documents per word-count bin.  The series data is the JSON emitted by
 // the view in the chart_data context variable.
 var chartConf = {
     chart: {
         type: 'line'
     },
     title: {
         text: '{{ stats_title|escapejs }}'
     },
     xAxis: {
         title: {
             text: 'Number of words'
         }
     },
     yAxis: {
         title: {
             text: 'Number of {{ doc_label }}s'
         }
     },
     tooltip: {
         // Bold "<start> - <end> words" heading for the bin, followed by one
         // "<y axis title>: <count>" line per point.
         formatter: function () {
             var binStart = this.x;
             var binEnd = binStart + {{ bin_size }} - 1;
             var unit = (binStart == 1) ? "word" : 'words';
             var lines = ['<b>' + binStart + ' - ' + binEnd + ' ' + unit + '</b>'];

             $.each(this.points, function () {
                 lines.push(chartConf.yAxis.title.text + ': ' + this.y);
             });

             return lines.join('<br/>');
         },
         shared: true
     },
     series: {{ chart_data }}
 };
</script>
|
||||
|
||||
<h3>Data</h3>
|
||||
|
||||
<table class="table table-condensed stats-data">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Words</th>
|
||||
<th>Percentage of {{ doc_label }}s</th>
|
||||
<th>{{ doc_label|capfirst }}s</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for pages, percentage, names in table_data %}
|
||||
<tr>
|
||||
<td>{{ pages }}</td>
|
||||
<td>{{ percentage|floatformat:2 }}%</td>
|
||||
<td>{% include "stats/includes/docnames_cell.html" %}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
Loading…
Reference in a new issue