Add author affiliation chart.
Also add a model for registering an alias for an affiliation, so that we can group affiliations that are considered the same for statistical purposes, and a model for registering unimportant endings like Inc. and GmbH. Affiliation grouping is done through three means: stripping uninteresting endings, merging entries that differ only in case, and applying aliases that map a case-insensitive alias to a name. Stripping endings and merging based on case seem to greatly reduce the number of aliases that need to be maintained manually. - Legacy-Id: 12785
This commit is contained in:
parent
3954dc047d
commit
ef251c6bc7
|
@ -174,7 +174,7 @@ class BallotPositionDocEventAdmin(DocEventAdmin):
|
|||
admin.site.register(BallotPositionDocEvent, BallotPositionDocEventAdmin)
|
||||
|
||||
class DocumentAuthorAdmin(admin.ModelAdmin):
    """Admin options for document authors.

    The affiliation is shown as a column in the change list and is also
    included in the fields that the admin search box looks at.
    """
    # Each option is assigned exactly once so that the effective value is
    # obvious to the reader.
    list_display = ['id', 'document', 'person', 'email', 'affiliation', 'order']
    search_fields = ['document__name', 'person__name', 'email__address', 'affiliation']
|
||||
admin.site.register(DocumentAuthor, DocumentAuthorAdmin)
|
||||
|
||||
|
|
|
@ -1,7 +1,7 @@
|
|||
from django.contrib import admin
|
||||
|
||||
|
||||
from ietf.person.models import Email, Alias, Person
|
||||
from ietf.person.models import Email, Alias, Person, AffiliationAlias, AffiliationIgnoredEnding
|
||||
from ietf.person.name import name_parts
|
||||
|
||||
class EmailAdmin(admin.ModelAdmin):
|
||||
|
@ -33,3 +33,13 @@ class PersonAdmin(admin.ModelAdmin):
|
|||
# actions = None
|
||||
admin.site.register(Person, PersonAdmin)
|
||||
|
||||
class AffiliationAliasAdmin(admin.ModelAdmin):
    """Admin options for affiliation aliases.

    Both the alias and the name it maps to are listed and searchable; the
    change list can additionally be filtered on the name.
    """
    search_fields = ["alias", "name"]
    list_display = ["alias", "name"]
    list_filter = ["name"]
|
||||
admin.site.register(AffiliationAlias, AffiliationAliasAdmin)
|
||||
|
||||
class AffiliationIgnoredEndingAdmin(admin.ModelAdmin):
    """Admin options for ignored affiliation endings: the ending is the
    only column shown and the only field searched."""
    search_fields = ["ending"]
    list_display = ["ending"]
|
||||
admin.site.register(AffiliationIgnoredEnding, AffiliationIgnoredEndingAdmin)
|
||||
|
|
|
@ -0,0 +1,29 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Schema migration that creates the AffiliationAlias and
    AffiliationIgnoredEnding models."""

    dependencies = [('person', '0014_auto_20160613_0751')]

    operations = [
        # alias -> name mapping used when grouping affiliations
        migrations.CreateModel(
            name='AffiliationAlias',
            fields=[
                (
                    'id',
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name='ID',
                    ),
                ),
                (
                    'alias',
                    models.CharField(
                        max_length=255,
                        help_text=b'Note that aliases are matched without regarding case.',
                    ),
                ),
                ('name', models.CharField(max_length=255)),
            ],
        ),
        # endings that are ignored when grouping affiliations
        migrations.CreateModel(
            name='AffiliationIgnoredEnding',
            fields=[
                (
                    'id',
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name='ID',
                    ),
                ),
                ('ending', models.CharField(max_length=255)),
            ],
        ),
    ]
|
29
ietf/person/migrations/0016_auto_20170203_1030.py
Normal file
29
ietf/person/migrations/0016_auto_20170203_1030.py
Normal file
|
@ -0,0 +1,29 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
def add_affiliation_info(apps, schema_editor):
    """Seed the affiliation alias and ignored-ending tables.

    ``apps`` is the app registry handed to the function by ``RunPython``; the
    models are fetched through it rather than imported.  ``schema_editor`` is
    not used.  Every row is written with ``get_or_create``, so rows that
    already exist are left alone.
    """
    AffiliationAlias = apps.get_model("person", "AffiliationAlias")

    # Aliases are given in lower case, matching the case-insensitive way
    # aliases are meant to be compared.
    cisco_aliases = (
        "cisco",
        "cisco system",
        "cisco systems (india) private limited",
        "cisco systems india pvt",
    )
    for alias in cisco_aliases:
        AffiliationAlias.objects.get_or_create(alias=alias, name="Cisco Systems")

    AffiliationIgnoredEnding = apps.get_model("person", "AffiliationIgnoredEnding")

    # The endings are regular expression fragments, so they are written as raw
    # strings: "\." in a plain literal is an invalid escape sequence.  The
    # stored values are unchanged (backslash, dot, question mark).
    ignored_endings = (r"LLC\.?", r"Ltd\.?", r"Inc\.?", r"GmbH\.?")
    for ending in ignored_endings:
        AffiliationIgnoredEnding.objects.get_or_create(ending=ending)
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Data migration that runs add_affiliation_info when applied; the
    reverse direction is a no-op."""

    dependencies = [('person', '0015_affiliationalias_affiliationignoredending')]

    operations = [
        migrations.RunPython(
            code=add_affiliation_info,
            reverse_code=migrations.RunPython.noop,
        ),
    ]
|
|
@ -241,3 +241,26 @@ class Email(models.Model):
|
|||
return
|
||||
return self.address
|
||||
|
||||
|
||||
class AffiliationAlias(models.Model):
    """An alternative spelling (alias) of an affiliation together with the
    name that should be used in its place for statistical purposes."""

    alias = models.CharField(
        max_length=255,
        help_text="Note that aliases are matched without regarding case.",
    )
    name = models.CharField(max_length=255)

    def save(self, *args, **kwargs):
        # The alias is always stored in lower case, in line with the
        # case-insensitive matching promised by the help text.
        lowered = self.alias.lower()
        self.alias = lowered
        super(AffiliationAlias, self).save(*args, **kwargs)

    def __unicode__(self):
        alias, name = self.alias, self.name
        return u"{} -> {}".format(alias, name)
|
||||
|
||||
class AffiliationIgnoredEnding(models.Model):
    """An ending that is to be stripped from affiliations for statistical
    purposes.  The ending is stored as a regular expression fragment."""

    ending = models.CharField(
        max_length=255,
        help_text="Regexp with ending, e.g. 'Inc\\.?' - remember to escape .!",
    )

    def __unicode__(self):
        # The stored fragment is its own display text.
        text = self.ending
        return text
|
||||
|
||||
|
|
|
@ -1,8 +1,10 @@
|
|||
import pprint
|
||||
import pprint
|
||||
import re
|
||||
from collections import defaultdict
|
||||
|
||||
from django.contrib import admin
|
||||
from django.contrib.auth.models import User
|
||||
from ietf.person.models import Person
|
||||
from ietf.person.models import Person, AffiliationAlias, AffiliationIgnoredEnding
|
||||
|
||||
def merge_persons(source,target,stream):
|
||||
|
||||
|
@ -86,3 +88,88 @@ def merge_persons(source,target,stream):
|
|||
else:
|
||||
print >>stream, "Deleting Person: {}({})".format(source.ascii,source.pk)
|
||||
source.delete()
|
||||
|
||||
|
||||
def compile_affiliation_ending_stripping_regexp():
    """Build the regexp used to strip ignorable endings from affiliations.

    Every ``AffiliationIgnoredEnding.ending`` value is treated as a regular
    expression fragment.  A fragment that does not compile on its own is
    skipped instead of being added to the combined pattern.

    Returns a compiled, case-insensitive pattern that matches, at the end of
    the string, an optional comma, optional spaces, one of the endings and
    optional trailing spaces.
    """
    parts = []
    for ending_re in AffiliationIgnoredEnding.objects.values_list("ending", flat=True):
        try:
            re.compile(ending_re)
        except re.error:
            # Invalid fragment: leave it out, otherwise the final compile
            # below would fail because of this one row.
            continue

        parts.append(ending_re)

    # NOTE: when there are no usable endings the alternation group is empty,
    # so the pattern still matches a trailing comma and/or trailing spaces.
    re_str = ",? *({}) *$".format("|".join(parts))

    return re.compile(re_str, re.IGNORECASE)
|
||||
|
||||
|
||||
def get_aliased_affiliations(affiliations):
    """Given non-unique sequence of affiliations, returns dictionary with
    aliases needed.

    The result maps an affiliation as it appears in the input to the
    affiliation it should be replaced with; affiliations that need no
    replacement are not present as keys.

    We employ the following strategies, interleaved:

    - Stripping company endings like Inc., GmbH etc. from database

    - Looking up aliases stored directly in the database, like
      "Exemplar International" -> "Exemplar"

    - Case-folding so Exemplar and EXEMPLAR are merged with the
      winner being the one with most occurrences (so input should not
      be made unique) or most upper case letters in case of ties.
      Case folding can be overridden by the aliases in the database."""

    res = {}

    ending_re = compile_affiliation_ending_stripping_regexp()

    # alias keys are lower-cased here so lookups can be done on lower-cased input
    known_aliases = { alias.lower(): name for alias, name in AffiliationAlias.objects.values_list("alias", "name") }

    # lower-cased normalized affiliation -> set of original spellings that ended up there
    affiliations_with_case_spellings = defaultdict(set)
    # NOTE(review): counts are keyed by the *normalized* affiliation, while
    # the candidates compared in affiliation_sort_key are *original*
    # spellings, so a spelling that was rewritten below is looked up with a
    # count of 0 unless it also occurs as a normalized value - confirm that
    # this is intended.
    case_spelling_count = defaultdict(int)
    for affiliation in affiliations:
        original_affiliation = affiliation

        # check aliases from DB
        alias = known_aliases.get(affiliation.lower())
        if alias is not None:
            affiliation = alias
            res[original_affiliation] = affiliation

        # strip ending
        alias = ending_re.sub("", affiliation)
        if alias != affiliation:
            affiliation = alias
            res[original_affiliation] = affiliation

        # check aliases from DB again, stripping the ending may have
        # produced something that has an alias
        alias = known_aliases.get(affiliation.lower())
        if alias is not None:
            affiliation = alias
            res[original_affiliation] = affiliation

        affiliations_with_case_spellings[affiliation.lower()].add(original_affiliation)
        case_spelling_count[affiliation] += 1

    def affiliation_sort_key(affiliation):
        # most occurrences first, number of upper case letters breaks ties
        count = case_spelling_count[affiliation]
        uppercase_letters = sum(1 for c in affiliation if c.isupper())
        return (count, uppercase_letters)

    # now we just need to pick the most popular uppercase/lowercase
    # spelling for each affiliation with more than one
    for similar_affiliations in affiliations_with_case_spellings.itervalues():
        if len(similar_affiliations) > 1:
            # max() returns the first of several equally ranked spellings,
            # which is the same element a stable descending sort puts first
            most_popular = max(similar_affiliations, key=affiliation_sort_key)
            for affiliation in similar_affiliations:
                if affiliation != most_popular:
                    res[affiliation] = most_popular

    return res
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -568,7 +568,7 @@ table.simple-table td:last-child {
|
|||
width: 7em;
|
||||
}
|
||||
|
||||
.popover .docname {
|
||||
.document-stats .popover .element {
|
||||
padding-left: 1em;
|
||||
text-indent: -1em;
|
||||
}
|
||||
|
|
|
@ -30,10 +30,10 @@ $(document).ready(function () {
|
|||
if (stdNameRegExp.test(element))
|
||||
displayName = element.slice(0, 3).toUpperCase() + " " + element.slice(3);
|
||||
|
||||
html.push('<div class="docname"><a href="/doc/' + element + '/">' + displayName + '</a></div>');
|
||||
html.push('<div class="element"><a href="/doc/' + element + '/">' + displayName + '</a></div>');
|
||||
}
|
||||
else {
|
||||
html.push('<div>' + element + '</div>');
|
||||
html.push('<div class="element">' + element + '</div>');
|
||||
}
|
||||
});
|
||||
|
||||
|
@ -44,6 +44,7 @@ $(document).ready(function () {
|
|||
trigger: "focus",
|
||||
template: '<div class="popover" role="tooltip"><div class="arrow"></div><h3 class="popover-title"></h3><div class="popover-content"></div></div>',
|
||||
content: html.join(""),
|
||||
placement: "top",
|
||||
html: true
|
||||
}).on("click", function (e) {
|
||||
e.preventDefault();
|
||||
|
|
|
@ -25,7 +25,7 @@ class StatisticsTests(TestCase):
|
|||
self.assertTrue(authors_url in r["Location"])
|
||||
|
||||
# check various stats types
|
||||
for stats_type in ["authors", "pages", "words", "format", "formlang", "author/documents"]:
|
||||
for stats_type in ["authors", "pages", "words", "format", "formlang", "author/documents", "author/affiliation"]:
|
||||
for document_type in ["", "rfc", "draft"]:
|
||||
for time_choice in ["", "5y"]:
|
||||
url = urlreverse(ietf.stats.views.document_stats, kwargs={ "stats_type": stats_type })
|
||||
|
|
|
@ -25,6 +25,7 @@ from ietf.group.models import Role, Group
|
|||
from ietf.person.models import Person
|
||||
from ietf.name.models import ReviewRequestStateName, ReviewResultName
|
||||
from ietf.doc.models import DocAlias, Document
|
||||
from ietf.person.utils import get_aliased_affiliations
|
||||
from ietf.ietfauth.utils import has_role
|
||||
|
||||
def stats_index(request):
|
||||
|
@ -351,7 +352,7 @@ def document_stats(request, stats_type=None):
|
|||
total_persons = person_qs.count()
|
||||
|
||||
if stats_type == "author/documents":
|
||||
stats_title = "Number of {}s for each author".format(doc_label)
|
||||
stats_title = "Number of {}s per author".format(doc_label)
|
||||
|
||||
bins = defaultdict(list)
|
||||
|
||||
|
@ -369,6 +370,38 @@ def document_stats(request, stats_type=None):
|
|||
"animation": False,
|
||||
})
|
||||
|
||||
elif stats_type == "author/affiliation":
|
||||
stats_title = "Number of {} authors per affiliation".format(doc_label)
|
||||
|
||||
bins = defaultdict(list)
|
||||
|
||||
# Since people don't write the affiliation names in the
|
||||
# same way, and we don't want to go back and edit them
|
||||
# either, we transform them here.
|
||||
|
||||
name_affiliation_set = set((name, affiliation)
|
||||
for name, affiliation in person_qs.values_list("name", "documentauthor__affiliation"))
|
||||
|
||||
aliases = get_aliased_affiliations(affiliation for _, affiliation in name_affiliation_set)
|
||||
|
||||
for name, affiliation in name_affiliation_set:
|
||||
bins[aliases.get(affiliation, affiliation)].append(name)
|
||||
|
||||
series_data = []
|
||||
for affiliation, names in sorted(bins.iteritems(), key=lambda t: t[0].lower()):
|
||||
percentage = len(names) * 100.0 / total_persons
|
||||
if affiliation:
|
||||
series_data.append((affiliation, len(names)))
|
||||
table_data.append((affiliation, percentage, names))
|
||||
|
||||
series_data.sort(key=lambda t: t[1], reverse=True)
|
||||
series_data = series_data[:30]
|
||||
|
||||
chart_data.append({
|
||||
"data": series_data,
|
||||
"animation": False,
|
||||
})
|
||||
|
||||
|
||||
return render(request, "stats/document_stats.html", {
|
||||
"chart_data": mark_safe(json.dumps(chart_data)),
|
||||
|
|
|
@ -55,7 +55,9 @@
|
|||
</div>
|
||||
</div>
|
||||
|
||||
{% include content_template %}
|
||||
<div class="document-stats">
|
||||
{% include content_template %}
|
||||
</div>
|
||||
{% endblock %}
|
||||
|
||||
{% block js %}
|
||||
|
|
59
ietf/templates/stats/document_stats_author_affiliation.html
Normal file
59
ietf/templates/stats/document_stats_author_affiliation.html
Normal file
|
@ -0,0 +1,59 @@
|
|||
{# Column chart and data table showing the number of authors per affiliation. #}
{# Reads stats_title, chart_data and table_data from the template context. #}
<h3>{{ stats_title }}</h3>

<div id="chart"></div>

<script>
 // Chart configuration object. Nothing in this template consumes it;
 // presumably the including page hands it to the charting library and
 // renders into the #chart element -- TODO confirm.
 var chartConf = {
     chart: {
         type: 'column'
     },
     title: {
         text: '{{ stats_title|escapejs }}'
     },
     xAxis: {
         type: "category",
         title: {
             text: 'Affiliation'
         }
     },
     yAxis: {
         title: {
             text: 'Number of authors'
         }
     },
     tooltip: {
         // Tooltip text: the key of the first point in bold, followed by
         // one "<y axis title>: <value>" line for every point.
         formatter: function () {
             var s = '<b>' + this.points[0].key + '</b>';

             $.each(this.points, function () {
                 s += '<br/>' + chartConf.yAxis.title.text + ': ' + this.y;
             });

             return s;
         },
         shared: true
     },
     {# NOTE(review): chart_data is written into the script as-is; presumably the view supplies JSON that is already marked safe - confirm that it cannot contain a closing script tag. #}
     series: {{ chart_data }}
 };
</script>

<h3>Data</h3>

<table class="table table-condensed stats-data">
  <thead>
    <tr>
      <th>Affiliation</th>
      <th>Percentage of authors</th>
      <th>Authors</th>
    </tr>
  </thead>
  <tbody>
    {# One row per (affiliation, percentage, names) entry in table_data. #}
    {% for affiliation, percentage, names in table_data %}
      <tr>
        {# An empty affiliation is displayed as "(unknown)". #}
        <td>{{ affiliation|default:"(unknown)" }}</td>
        <td>{{ percentage|floatformat:2 }}%</td>
        {# Presumably the included cell template renders the names of this row - verify against the include. #}
        <td>{% include "stats/includes/number_with_details_cell.html" %}</td>
      </tr>
    {% endfor %}
  </tbody>
</table>
|
Loading…
Reference in a new issue