Add author affiliation chart.

Also add a model for registering an alias for an affiliation so that
we can group affiliations that are considered the same for statistical
purposes, and a model for registering unimportant endings like Inc.
and GmbH.

Affiliation grouping is done through three means: stripping
uninteresting endings, merging entries that differ only in case, and
applying aliases that map a case-insensitive alias to a name.

Stripping endings and merging based on case seem to greatly reduce the
number of manually maintained aliases that are needed.
 - Legacy-Id: 12785
This commit is contained in:
Ole Laursen 2017-02-03 18:49:43 +00:00
parent 3954dc047d
commit ef251c6bc7
12 changed files with 284 additions and 11 deletions

View file

@ -174,7 +174,7 @@ class BallotPositionDocEventAdmin(DocEventAdmin):
admin.site.register(BallotPositionDocEvent, BallotPositionDocEventAdmin)
class DocumentAuthorAdmin(admin.ModelAdmin):
list_display = ['id', 'document', 'person', 'email', 'order']
search_fields = [ 'document__name', 'person__name', 'email__address', ]
list_display = ['id', 'document', 'person', 'email', 'affiliation', 'order']
search_fields = [ 'document__name', 'person__name', 'email__address', 'affiliation']
admin.site.register(DocumentAuthor, DocumentAuthorAdmin)

View file

@ -1,7 +1,7 @@
from django.contrib import admin
from ietf.person.models import Email, Alias, Person
from ietf.person.models import Email, Alias, Person, AffiliationAlias, AffiliationIgnoredEnding
from ietf.person.name import name_parts
class EmailAdmin(admin.ModelAdmin):
@ -33,3 +33,13 @@ class PersonAdmin(admin.ModelAdmin):
# actions = None
admin.site.register(Person, PersonAdmin)
class AffiliationAliasAdmin(admin.ModelAdmin):
    """Admin options for AffiliationAlias.

    The change list shows each alias next to the name it maps to, can be
    filtered by name and is searchable on both columns.
    """

    search_fields = ["alias", "name"]
    list_display = ["alias", "name"]
    list_filter = ["name"]


admin.site.register(AffiliationAlias, AffiliationAliasAdmin)
class AffiliationIgnoredEndingAdmin(admin.ModelAdmin):
    """Admin options for AffiliationIgnoredEnding.

    The single ``ending`` column is both listed and searchable.
    """

    search_fields = ["ending"]
    list_display = ["ending"]


admin.site.register(AffiliationIgnoredEnding, AffiliationIgnoredEndingAdmin)

View file

@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from django.db import migrations, models
class Migration(migrations.Migration):
    """Create the AffiliationAlias and AffiliationIgnoredEnding models."""

    dependencies = [
        ('person', '0014_auto_20160613_0751'),
    ]

    operations = [
        # Maps a (case-insensitively matched) alias to the name it stands for.
        migrations.CreateModel(
            name='AffiliationAlias',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('alias', models.CharField(max_length=255, help_text=b'Note that aliases are matched without regarding case.')),
                ('name', models.CharField(max_length=255)),
            ],
        ),
        # Holds one ending per row.
        migrations.CreateModel(
            name='AffiliationIgnoredEnding',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('ending', models.CharField(max_length=255)),
            ],
        ),
    ]

View file

@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from django.db import migrations
def add_affiliation_info(apps, schema_editor):
    """Seed the initial affiliation aliases and ignored endings.

    Every row is written with ``get_or_create``, so rows that already
    exist are left untouched.

    Args:
        apps: object whose ``get_model(app_label, model_name)`` returns the
            model classes to populate (passed in by ``RunPython``).
        schema_editor: unused.
    """
    AffiliationAlias = apps.get_model("person", "AffiliationAlias")
    cisco_aliases = (
        "cisco",
        "cisco system",
        "cisco systems (india) private limited",
        "cisco systems india pvt",
    )
    for alias in cisco_aliases:
        AffiliationAlias.objects.get_or_create(alias=alias, name="Cisco Systems")

    AffiliationIgnoredEnding = apps.get_model("person", "AffiliationIgnoredEnding")
    # These are regexp fragments, so they are written as raw strings: "\."
    # in a plain literal is an invalid escape sequence. The stored values
    # are unchanged.
    ignored_endings = (
        r"LLC\.?",
        r"Ltd\.?",
        r"Inc\.?",
        r"GmbH\.?",
    )
    for ending in ignored_endings:
        AffiliationIgnoredEnding.objects.get_or_create(ending=ending)
class Migration(migrations.Migration):
    """Data migration that runs add_affiliation_info; reversing is a no-op."""

    dependencies = [
        ('person', '0015_affiliationalias_affiliationignoredending'),
    ]

    operations = [
        migrations.RunPython(
            code=add_affiliation_info,
            reverse_code=migrations.RunPython.noop,
        ),
    ]

View file

@ -241,3 +241,26 @@ class Email(models.Model):
return
return self.address
class AffiliationAlias(models.Model):
    """Maps an alias to the affiliation name it should be counted as in
    statistics."""

    alias = models.CharField(
        help_text="Note that aliases are matched without regarding case.",
        max_length=255,
    )
    name = models.CharField(max_length=255)

    def __unicode__(self):
        """Return the mapping rendered as "alias -> name"."""
        template = u"{} -> {}"
        return template.format(self.alias, self.name)

    def save(self, *args, **kwargs):
        """Lower-case the alias, then save as usual."""
        lowered = self.alias.lower()
        self.alias = lowered
        super(AffiliationAlias, self).save(*args, **kwargs)
class AffiliationIgnoredEnding(models.Model):
    """An ending (stored as a regexp fragment) to strip from affiliations
    for statistical purposes."""

    ending = models.CharField(
        help_text="Regexp with ending, e.g. 'Inc\\.?' - remember to escape .!",
        max_length=255,
    )

    def __unicode__(self):
        """Return the raw ending regexp."""
        text = self.ending
        return text

View file

@ -1,8 +1,10 @@
import pprint
import pprint
import re
from collections import defaultdict
from django.contrib import admin
from django.contrib.auth.models import User
from ietf.person.models import Person
from ietf.person.models import Person, AffiliationAlias, AffiliationIgnoredEnding
def merge_persons(source,target,stream):
@ -86,3 +88,88 @@ def merge_persons(source,target,stream):
else:
print >>stream, "Deleting Person: {}({})".format(source.ascii,source.pk)
source.delete()
def compile_affiliation_ending_stripping_regexp(endings=None):
    """Build the regexp that strips ignored endings from an affiliation.

    The result matches an optional comma, optional spaces, any one of the
    endings and optional trailing spaces, anchored at the end of the
    string and ignoring case.

    Args:
        endings: iterable of regexp fragments. When None (the default),
            the fragments are read from AffiliationIgnoredEnding.

    Returns:
        A compiled regular expression object.
    """
    template = ",? *({}) *$"

    if endings is None:
        endings = AffiliationIgnoredEnding.objects.values_list("ending", flat=True)

    parts = []
    for ending_re in endings:
        try:
            # Validate the fragment in the context it will be used in, so
            # that a single malformed entry cannot break the combined
            # expression compiled below.
            re.compile(template.format(ending_re))
        except re.error:
            # Malformed fragment: leave it out rather than fail for all
            # affiliations.
            continue

        parts.append(ending_re)

    re_str = template.format("|".join(parts))

    return re.compile(re_str, re.IGNORECASE)
def get_aliased_affiliations(affiliations):
    """Given non-unique sequence of affiliations, returns dictionary with
    aliases needed.

    We employ the following strategies, interleaved:

    - Stripping company endings like Inc., GmbH etc. from database

    - Looking up aliases stored directly in the database, like
      "Examplar International" -> "Examplar"

    - Case-folding so Examplar and EXAMPLAR is merged with the
      winner being the one with most occurrences (so input should not
      be made unique) or most upper case letters in case of ties.
      Case folding can be overridden by the aliases in the database.

    The returned dictionary maps each input spelling that needs changing
    to the spelling it should be counted under; spellings that are
    already the chosen form are not included, so callers should use
    ``res.get(affiliation, affiliation)``.
    """
    res = {}

    ending_re = compile_affiliation_ending_stripping_regexp()

    known_aliases = {
        alias.lower(): name
        for alias, name in AffiliationAlias.objects.values_list("alias", "name")
    }

    def normalize(affiliation):
        # alias lookup, then ending stripping, then alias lookup again so
        # that an alias can apply to the stripped form as well
        name = known_aliases.get(affiliation.lower())
        if name is not None:
            affiliation = name

        affiliation = ending_re.sub("", affiliation)

        name = known_aliases.get(affiliation.lower())
        if name is not None:
            affiliation = name

        return affiliation

    normalized_by_original = {}
    spellings_by_folded = defaultdict(set)
    spelling_count = defaultdict(int)
    for original_affiliation in affiliations:
        normalized = normalized_by_original.get(original_affiliation)
        if normalized is None:
            normalized = normalize(original_affiliation)
            normalized_by_original[original_affiliation] = normalized

        # Count and group the *normalized* spellings: the winner of the
        # case merge must itself be a normalized form, otherwise a raw
        # spelling such as "EXAMPLAR INC" could win and undo the ending
        # stripping / aliasing done above.
        spellings_by_folded[normalized.lower()].add(normalized)
        spelling_count[normalized] += 1

    def spelling_sort_key(spelling):
        uppercase_letters = sum(1 for c in spelling if c.isupper())
        # the spelling itself is the last component so that remaining
        # ties are resolved deterministically
        return (spelling_count[spelling], uppercase_letters, spelling)

    # pick the most popular uppercase/lowercase spelling for each
    # affiliation
    most_popular = {
        folded: max(spellings, key=spelling_sort_key)
        for folded, spellings in spellings_by_folded.items()
    }

    for original_affiliation, normalized in normalized_by_original.items():
        winner = most_popular[normalized.lower()]
        if winner != original_affiliation:
            res[original_affiliation] = winner

    return res

View file

@ -568,7 +568,7 @@ table.simple-table td:last-child {
width: 7em;
}
.popover .docname {
.document-stats .popover .element {
padding-left: 1em;
text-indent: -1em;
}

View file

@ -30,10 +30,10 @@ $(document).ready(function () {
if (stdNameRegExp.test(element))
displayName = element.slice(0, 3).toUpperCase() + " " + element.slice(3);
html.push('<div class="docname"><a href="/doc/' + element + '/">' + displayName + '</a></div>');
html.push('<div class="element"><a href="/doc/' + element + '/">' + displayName + '</a></div>');
}
else {
html.push('<div>' + element + '</div>');
html.push('<div class="element">' + element + '</div>');
}
});
@ -44,6 +44,7 @@ $(document).ready(function () {
trigger: "focus",
template: '<div class="popover" role="tooltip"><div class="arrow"></div><h3 class="popover-title"></h3><div class="popover-content"></div></div>',
content: html.join(""),
placement: "top",
html: true
}).on("click", function (e) {
e.preventDefault();

View file

@ -25,7 +25,7 @@ class StatisticsTests(TestCase):
self.assertTrue(authors_url in r["Location"])
# check various stats types
for stats_type in ["authors", "pages", "words", "format", "formlang", "author/documents"]:
for stats_type in ["authors", "pages", "words", "format", "formlang", "author/documents", "author/affiliation"]:
for document_type in ["", "rfc", "draft"]:
for time_choice in ["", "5y"]:
url = urlreverse(ietf.stats.views.document_stats, kwargs={ "stats_type": stats_type })

View file

@ -25,6 +25,7 @@ from ietf.group.models import Role, Group
from ietf.person.models import Person
from ietf.name.models import ReviewRequestStateName, ReviewResultName
from ietf.doc.models import DocAlias, Document
from ietf.person.utils import get_aliased_affiliations
from ietf.ietfauth.utils import has_role
def stats_index(request):
@ -351,7 +352,7 @@ def document_stats(request, stats_type=None):
total_persons = person_qs.count()
if stats_type == "author/documents":
stats_title = "Number of {}s for each author".format(doc_label)
stats_title = "Number of {}s per author".format(doc_label)
bins = defaultdict(list)
@ -369,6 +370,38 @@ def document_stats(request, stats_type=None):
"animation": False,
})
elif stats_type == "author/affiliation":
stats_title = "Number of {} authors per affiliation".format(doc_label)
bins = defaultdict(list)
# Since people don't write the affiliation names in the
# same way, and we don't want to go back and edit them
# either, we transform them here.
name_affiliation_set = set((name, affiliation)
for name, affiliation in person_qs.values_list("name", "documentauthor__affiliation"))
aliases = get_aliased_affiliations(affiliation for _, affiliation in name_affiliation_set)
for name, affiliation in name_affiliation_set:
bins[aliases.get(affiliation, affiliation)].append(name)
series_data = []
for affiliation, names in sorted(bins.iteritems(), key=lambda t: t[0].lower()):
percentage = len(names) * 100.0 / total_persons
if affiliation:
series_data.append((affiliation, len(names)))
table_data.append((affiliation, percentage, names))
series_data.sort(key=lambda t: t[1], reverse=True)
series_data = series_data[:30]
chart_data.append({
"data": series_data,
"animation": False,
})
return render(request, "stats/document_stats.html", {
"chart_data": mark_safe(json.dumps(chart_data)),

View file

@ -55,7 +55,9 @@
</div>
</div>
{% include content_template %}
<div class="document-stats">
{% include content_template %}
</div>
{% endblock %}
{% block js %}

View file

@ -0,0 +1,59 @@
{# Affiliation statistics: heading, a column chart configuration (chartConf) whose series come from chart_data, then a table of table_data rows. #}
<h3>{{ stats_title }}</h3>
<div id="chart"></div>
<script>
var chartConf = {
chart: {
type: 'column'
},
title: {
text: '{{ stats_title|escapejs }}'
},
xAxis: {
type: "category",
title: {
text: 'Affiliation'
}
},
yAxis: {
title: {
text: 'Number of authors'
}
},
tooltip: {
formatter: function () {
var s = '<b>' + this.points[0].key + '</b>';
$.each(this.points, function () {
s += '<br/>' + chartConf.yAxis.title.text + ': ' + this.y;
});
return s;
},
shared: true
},
series: {{ chart_data }}
};
</script>
<h3>Data</h3>
{# Each table_data row unpacks to (affiliation, percentage, names); an empty affiliation is displayed as "(unknown)". #}
<table class="table table-condensed stats-data">
<thead>
<tr>
<th>Affiliation</th>
<th>Percentage of authors</th>
<th>Authors</th>
</tr>
</thead>
<tbody>
{% for affiliation, percentage, names in table_data %}
<tr>
<td>{{ affiliation|default:"(unknown)" }}</td>
<td>{{ percentage|floatformat:2 }}%</td>
<td>{% include "stats/includes/number_with_details_cell.html" %}</td>
</tr>
{% endfor %}
</tbody>
</table>