From ef251c6bc7aa1ca19a2d9eb3120be3af0aa1845b Mon Sep 17 00:00:00 2001 From: Ole Laursen Date: Fri, 3 Feb 2017 18:49:43 +0000 Subject: [PATCH] Add author affiliation chart. Also add a model for registering an alias for an affiliation so that we can group affiliations that are considered the same for statistical purposes, and a model for registering unimportant endings like Inc. and GmbH. Affiliation grouping is done through three means: stripping uninteresting endings, merging entries that only differ in case and aliases that map from case-insensitive alias to name. Stripping endings and merging based on case seem to reduce the number of needed manually maintained aliases greatly. - Legacy-Id: 12785 --- ietf/doc/admin.py | 4 +- ietf/person/admin.py | 12 ++- ...filiationalias_affiliationignoredending.py | 29 ++++++ .../migrations/0016_auto_20170203_1030.py | 29 ++++++ ietf/person/models.py | 23 +++++ ietf/person/utils.py | 91 ++++++++++++++++++- ietf/static/ietf/css/ietf.css | 2 +- ietf/static/ietf/js/document-stats.js | 5 +- ietf/stats/tests.py | 2 +- ietf/stats/views.py | 35 ++++++- ietf/templates/stats/document_stats.html | 4 +- .../document_stats_author_affiliation.html | 59 ++++++++++++ 12 files changed, 284 insertions(+), 11 deletions(-) create mode 100644 ietf/person/migrations/0015_affiliationalias_affiliationignoredending.py create mode 100644 ietf/person/migrations/0016_auto_20170203_1030.py create mode 100644 ietf/templates/stats/document_stats_author_affiliation.html diff --git a/ietf/doc/admin.py b/ietf/doc/admin.py index c5db20e37..288c4a635 100644 --- a/ietf/doc/admin.py +++ b/ietf/doc/admin.py @@ -174,7 +174,7 @@ class BallotPositionDocEventAdmin(DocEventAdmin): admin.site.register(BallotPositionDocEvent, BallotPositionDocEventAdmin) class DocumentAuthorAdmin(admin.ModelAdmin): - list_display = ['id', 'document', 'person', 'email', 'order'] - search_fields = [ 'document__name', 'person__name', 'email__address', ] + list_display = ['id', 'document', 
'person', 'email', 'affiliation', 'order'] + search_fields = [ 'document__name', 'person__name', 'email__address', 'affiliation'] admin.site.register(DocumentAuthor, DocumentAuthorAdmin) diff --git a/ietf/person/admin.py b/ietf/person/admin.py index 8c5ce62c0..563d212e3 100644 --- a/ietf/person/admin.py +++ b/ietf/person/admin.py @@ -1,7 +1,7 @@ from django.contrib import admin -from ietf.person.models import Email, Alias, Person +from ietf.person.models import Email, Alias, Person, AffiliationAlias, AffiliationIgnoredEnding from ietf.person.name import name_parts class EmailAdmin(admin.ModelAdmin): @@ -33,3 +33,13 @@ class PersonAdmin(admin.ModelAdmin): # actions = None admin.site.register(Person, PersonAdmin) +class AffiliationAliasAdmin(admin.ModelAdmin): + list_filter = ["name"] + list_display = ["alias", "name"] + search_fields = ["alias", "name"] +admin.site.register(AffiliationAlias, AffiliationAliasAdmin) + +class AffiliationIgnoredEndingAdmin(admin.ModelAdmin): + list_display = ["ending"] + search_fields = ["ending"] +admin.site.register(AffiliationIgnoredEnding, AffiliationIgnoredEndingAdmin) diff --git a/ietf/person/migrations/0015_affiliationalias_affiliationignoredending.py b/ietf/person/migrations/0015_affiliationalias_affiliationignoredending.py new file mode 100644 index 000000000..1747fd224 --- /dev/null +++ b/ietf/person/migrations/0015_affiliationalias_affiliationignoredending.py @@ -0,0 +1,29 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('person', '0014_auto_20160613_0751'), + ] + + operations = [ + migrations.CreateModel( + name='AffiliationAlias', + fields=[ + ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), + ('alias', models.CharField(help_text=b'Note that aliases are matched without regarding case.', max_length=255)), + ('name', 
models.CharField(max_length=255)), + ], + ), + migrations.CreateModel( + name='AffiliationIgnoredEnding', + fields=[ + ('id', models.AutoField(verbose_name='ID', serialize=False, auto_created=True, primary_key=True)), + ('ending', models.CharField(max_length=255)), + ], + ), + ] diff --git a/ietf/person/migrations/0016_auto_20170203_1030.py b/ietf/person/migrations/0016_auto_20170203_1030.py new file mode 100644 index 000000000..d5f4fd950 --- /dev/null +++ b/ietf/person/migrations/0016_auto_20170203_1030.py @@ -0,0 +1,29 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +from django.db import migrations + +def add_affiliation_info(apps, schema_editor): + AffiliationAlias = apps.get_model("person", "AffiliationAlias") + + AffiliationAlias.objects.get_or_create(alias="cisco", name="Cisco Systems") + AffiliationAlias.objects.get_or_create(alias="cisco system", name="Cisco Systems") + AffiliationAlias.objects.get_or_create(alias="cisco systems (india) private limited", name="Cisco Systems") + AffiliationAlias.objects.get_or_create(alias="cisco systems india pvt", name="Cisco Systems") + + AffiliationIgnoredEnding = apps.get_model("person", "AffiliationIgnoredEnding") + AffiliationIgnoredEnding.objects.get_or_create(ending="LLC\.?") + AffiliationIgnoredEnding.objects.get_or_create(ending="Ltd\.?") + AffiliationIgnoredEnding.objects.get_or_create(ending="Inc\.?") + AffiliationIgnoredEnding.objects.get_or_create(ending="GmbH\.?") + + +class Migration(migrations.Migration): + + dependencies = [ + ('person', '0015_affiliationalias_affiliationignoredending'), + ] + + operations = [ + migrations.RunPython(add_affiliation_info, migrations.RunPython.noop) + ] diff --git a/ietf/person/models.py b/ietf/person/models.py index 61fa6b2c0..9b2392d4c 100644 --- a/ietf/person/models.py +++ b/ietf/person/models.py @@ -241,3 +241,26 @@ class Email(models.Model): return return self.address + +class AffiliationAlias(models.Model): + """Records that alias should be 
def compile_affiliation_ending_stripping_regexp(endings=None):
    """Return a compiled regexp matching an ignorable affiliation ending.

    The pattern matches an optional comma, optional spaces, one of the
    endings and optional trailing spaces, anchored at the end of the
    string, and is compiled with re.IGNORECASE.  No word boundary is
    implied in front of the ending; an ending that needs one has to
    spell it out itself.

    endings -- iterable of regexp source strings.  When None (the
        default) the endings are read from AffiliationIgnoredEnding.

    Endings that are not valid regexps are skipped, so a single bad
    entry cannot break the combined pattern.  If no usable ending
    remains, the returned pattern never matches.
    """
    if endings is None:
        endings = AffiliationIgnoredEnding.objects.values_list("ending", flat=True)

    parts = []
    for ending in endings:
        try:
            re.compile(ending)
        except re.error:
            # Hand-entered pattern that does not compile: leave it out
            # rather than poisoning the combined expression below.
            continue
        parts.append(ending)

    if not parts:
        # An empty alternation would match (and strip) a bare trailing
        # comma or spaces; "(?!)" can never match anything.
        return re.compile("(?!)")

    return re.compile(",? *({}) *$".format("|".join(parts)), re.IGNORECASE)


def get_aliased_affiliations(affiliations, known_aliases=None, ending_re=None):
    """Given non-unique sequence of affiliations, return a dictionary
    mapping each spelling that should be replaced to its preferred name.

    Spellings that already are the preferred name are not included, so
    callers should use result.get(affiliation, affiliation).

    The following strategies are applied, interleaved:

    - Looking up aliases, like "Examplar International" -> "Examplar"
      (matched case-insensitively, before and after stripping endings)

    - Stripping company endings like Inc., GmbH etc.

    - Case-folding so that Examplar and EXAMPLAR are merged, the winner
      being the spelling with most occurrences (so input should not be
      made unique) or most upper case letters in case of ties.  Case
      folding can be overridden by the aliases.

    affiliations -- iterable of affiliation strings; None and empty
        entries are ignored.
    known_aliases -- optional dict of alias -> name; keys are
        lower-cased before use.  Defaults to the AffiliationAlias rows.
    ending_re -- optional compiled regexp whose matches are removed from
        each affiliation.  Defaults to
        compile_affiliation_ending_stripping_regexp().
    """
    if ending_re is None:
        ending_re = compile_affiliation_ending_stripping_regexp()

    if known_aliases is None:
        alias_pairs = AffiliationAlias.objects.values_list("alias", "name")
    else:
        alias_pairs = known_aliases.items()
    aliases_by_folded = {alias.lower(): name for alias, name in alias_pairs}

    normalized_by_original = {}
    spelling_count = defaultdict(int)
    spellings_by_folded = defaultdict(set)

    for original in affiliations:
        if not original:
            continue

        # alias -> strip ending -> alias again, since stripping may
        # uncover a spelling that has an alias of its own
        affiliation = aliases_by_folded.get(original.lower(), original)
        affiliation = ending_re.sub("", affiliation)
        affiliation = aliases_by_folded.get(affiliation.lower(), affiliation)

        normalized_by_original[original] = affiliation
        # Count and group the *normalized* spelling so that the
        # popularity lookup below uses the same keys as the tally.
        spelling_count[affiliation] += 1
        spellings_by_folded[affiliation.lower()].add(affiliation)

    def popularity(spelling):
        uppercase_letters = sum(1 for c in spelling if c.isupper())
        # The spelling itself is the last key so that exact ties are
        # resolved deterministically rather than by set ordering.
        return (spelling_count[spelling], uppercase_letters, spelling)

    # pick the most popular uppercase/lowercase spelling for each group
    preferred = {}
    for folded, spellings in spellings_by_folded.items():
        preferred[folded] = max(spellings, key=popularity)

    res = {}
    for original, affiliation in normalized_by_original.items():
        winner = preferred[affiliation.lower()]
        if winner != original:
            res[original] = winner

    return res
' + displayName + '
'); + html.push('
' + displayName + '
'); } else { - html.push('
' + element + '
'); + html.push('
' + element + '
'); } }); @@ -44,6 +44,7 @@ $(document).ready(function () { trigger: "focus", template: '', content: html.join(""), + placement: "top", html: true }).on("click", function (e) { e.preventDefault(); diff --git a/ietf/stats/tests.py b/ietf/stats/tests.py index fa973544c..026a49b96 100644 --- a/ietf/stats/tests.py +++ b/ietf/stats/tests.py @@ -25,7 +25,7 @@ class StatisticsTests(TestCase): self.assertTrue(authors_url in r["Location"]) # check various stats types - for stats_type in ["authors", "pages", "words", "format", "formlang", "author/documents"]: + for stats_type in ["authors", "pages", "words", "format", "formlang", "author/documents", "author/affiliation"]: for document_type in ["", "rfc", "draft"]: for time_choice in ["", "5y"]: url = urlreverse(ietf.stats.views.document_stats, kwargs={ "stats_type": stats_type }) diff --git a/ietf/stats/views.py b/ietf/stats/views.py index c151888b5..0dadfdaf7 100644 --- a/ietf/stats/views.py +++ b/ietf/stats/views.py @@ -25,6 +25,7 @@ from ietf.group.models import Role, Group from ietf.person.models import Person from ietf.name.models import ReviewRequestStateName, ReviewResultName from ietf.doc.models import DocAlias, Document +from ietf.person.utils import get_aliased_affiliations from ietf.ietfauth.utils import has_role def stats_index(request): @@ -351,7 +352,7 @@ def document_stats(request, stats_type=None): total_persons = person_qs.count() if stats_type == "author/documents": - stats_title = "Number of {}s for each author".format(doc_label) + stats_title = "Number of {}s per author".format(doc_label) bins = defaultdict(list) @@ -369,6 +370,38 @@ def document_stats(request, stats_type=None): "animation": False, }) + elif stats_type == "author/affiliation": + stats_title = "Number of {} authors per affiliation".format(doc_label) + + bins = defaultdict(list) + + # Since people don't write the affiliation names in the + # same way, and we don't want to go back and edit them + # either, we transform them here. 
+ + name_affiliation_set = set((name, affiliation) + for name, affiliation in person_qs.values_list("name", "documentauthor__affiliation")) + + aliases = get_aliased_affiliations(affiliation for _, affiliation in name_affiliation_set) + + for name, affiliation in name_affiliation_set: + bins[aliases.get(affiliation, affiliation)].append(name) + + series_data = [] + for affiliation, names in sorted(bins.iteritems(), key=lambda t: t[0].lower()): + percentage = len(names) * 100.0 / total_persons + if affiliation: + series_data.append((affiliation, len(names))) + table_data.append((affiliation, percentage, names)) + + series_data.sort(key=lambda t: t[1], reverse=True) + series_data = series_data[:30] + + chart_data.append({ + "data": series_data, + "animation": False, + }) + return render(request, "stats/document_stats.html", { "chart_data": mark_safe(json.dumps(chart_data)), diff --git a/ietf/templates/stats/document_stats.html b/ietf/templates/stats/document_stats.html index 2e4bda401..8ff53471d 100644 --- a/ietf/templates/stats/document_stats.html +++ b/ietf/templates/stats/document_stats.html @@ -55,7 +55,9 @@ - {% include content_template %} +
+ {% include content_template %} +
{% endblock %} {% block js %} diff --git a/ietf/templates/stats/document_stats_author_affiliation.html b/ietf/templates/stats/document_stats_author_affiliation.html new file mode 100644 index 000000000..acca4ff4d --- /dev/null +++ b/ietf/templates/stats/document_stats_author_affiliation.html @@ -0,0 +1,59 @@ +

{{ stats_title }}

+ +
+ + + +

Data

+ + + + + + + + + + + {% for affiliation, percentage, names in table_data %} + + + + + + {% endfor %} + +
AffiliationPercentage of authorsAuthors
{{ affiliation|default:"(unknown)" }}{{ percentage|floatformat:2 }}%{% include "stats/includes/number_with_details_cell.html" %}