Merged in [12461] from rjsparks@nostrum.com:
Added migration to fetch text from reviews in the mail archives and populate the review documents. Fixes #2064. Will patch into production.
- Legacy-Id: 12463
Note: SVN reference [12461] has been migrated to Git commit 63a9599bafdcacf49f1bb374b2156d8473da93d1
This commit is contained in:
parent 7ad38caec2
commit e4ce339235

ietf/doc/migrations/0017_fill_review_document_contents.py (new file, 67 lines)
@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import debug    # pyflakes:ignore
+
+import contextlib
+import os
+import urllib2
+
+from bs4 import BeautifulSoup
+from tqdm import tqdm
+
+from django.db import migrations
+from django.conf import settings
+
+def get_filename(doc):
+    path = settings.DOCUMENT_PATH_PATTERN.format(doc=doc)
+    # ! These files right now are created with no version number?
+    #name = '%s-%s.txt' % (doc.name,doc.rev)
+    name = '%s.txt' % (doc.name,)
+    return os.path.join(path,name)
+
+def forward(apps,schema_editor):
+    # for each qualifying document
+    Document = apps.get_model('doc','Document')
+
+    for doc in tqdm(Document.objects.filter(type='review',external_url__contains="www.ietf.org/mail-archive/web"),desc="Pointers into Mhonarc"):
+        filename = get_filename(doc)
+        if not os.path.isfile(filename):
+            with contextlib.closing(urllib2.urlopen(doc.external_url)) as infile:
+                fullcontents = infile.read().decode('utf-8', 'ignore')
+            start = fullcontents.find('<!--X-Body-of-Message-->')
+            end = fullcontents.find('<!--X-Body-of-Message-End-->')
+            bodyblock = fullcontents[start+len('<!--X-Body-of-Message-->'):end]
+            text = BeautifulSoup(bodyblock,"lxml").get_text('\n\n') \
+                   .replace('FAQ at <\n\nhttp://wiki.tools','FAQ at <http://wiki.tools') \
+                   .replace('wiki/GenArtfaq\n\n>','wiki/GenArtfaq>')
+            with contextlib.closing(open(filename,'w')) as outfile:
+                outfile.write(text.encode('utf8'))
+
+    for doc in tqdm(Document.objects.filter(type='review',external_url__contains="mailarchive.ietf.org"),desc="Pointers into Mailarchive"):
+        filename = get_filename(doc)
+        if not os.path.isfile(filename):
+            with contextlib.closing(urllib2.urlopen(doc.external_url)) as infile:
+                fullcontents = infile.read().decode('utf-8', 'ignore')
+            soup = BeautifulSoup(fullcontents,"lxml")
+            divpre = soup.find('div',{"id":"msg-payload"}).find('pre')
+            text = divpre.get_text('\n\n')
+            with contextlib.closing(open(filename,'w')) as outfile:
+                outfile.write(text.encode('utf8'))
+
+## After this migration, we should figure out what to do with these stragglers:
+## In [29]: Document.objects.filter(type='review').exclude(Q(external_url__contains="mailarchive")|Q(external_url__contains="mail-archive")).values_list('external_url',flat=True)
+## Out[29]: [u'https://art.tools.ietf.org/tools/art/genart/index.cgi/t=1909/review_edit?reviewid=2300', u'https://art.tools.ietf.org/tools/art/genart/index.cgi/t=8460/review_edit?reviewid=2735', u'https://www.ietf.org/ibin/c5i?mid=6&rid=49&gid=0&k1=933&k2=55337&tid=1296220835', u'https://www.ietf.org/mailman/private/tsv-dir/2012-February/002007.html', u'', u'']
+
+def reverse(apps,schema_editor):
+    pass
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('doc', '0016_auto_20160927_0713'),
+    ]
+
+    operations = [
+        migrations.RunPython(forward,reverse)
+    ]
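The extraction logic in the migration above can be spot-checked outside of a migration run. Below is a minimal sketch, assuming the same environment the migration relies on (Python 2 with urllib2, plus BeautifulSoup 4 and the lxml parser); the function name fetch_review_text and the example URL are illustrative and not part of the commit.

import contextlib
import urllib2

from bs4 import BeautifulSoup

def fetch_review_text(url):
    # Fetch one archive page and return the review body as plain text,
    # mirroring the two cases handled by the migration above.
    with contextlib.closing(urllib2.urlopen(url)) as infile:
        fullcontents = infile.read().decode('utf-8', 'ignore')
    if 'mail-archive/web' in url:
        # Old MHonArc pages bracket the message body with HTML comments.
        start = fullcontents.find('<!--X-Body-of-Message-->')
        end = fullcontents.find('<!--X-Body-of-Message-End-->')
        body = fullcontents[start + len('<!--X-Body-of-Message-->'):end]
        return BeautifulSoup(body, "lxml").get_text('\n\n')
    else:
        # Newer mailarchive pages carry the message in <div id="msg-payload"><pre>.
        soup = BeautifulSoup(fullcontents, "lxml")
        return soup.find('div', {"id": "msg-payload"}).find('pre').get_text('\n\n')

# e.g.: print fetch_review_text('https://mailarchive.ietf.org/arch/msg/gen-art/...')[:200]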
@@ -297,6 +297,20 @@ def add_events_message_info(events):
         e.in_reply_to = e.addedmessageevent.in_reply_to


+def get_unicode_document_content(key, filename, split=True, markup=True, codec='utf-8', errors='ignore'):
+    try:
+        with open(filename, 'rb') as f:
+            raw_content = f.read().decode(codec,errors)
+    except IOError:
+        error = "Error; cannot read ("+key+")"
+        return error
+
+    if markup:
+        return markup_txt.markup_unicode(raw_content, split)
+    else:
+        return raw_content
+
+
 def get_document_content(key, filename, split=True, markup=True):
     try:
         with open(filename, 'rb') as f:
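The new helper differs from get_document_content mainly in that it decodes the file with a configurable codec and error mode instead of returning raw bytes. A small illustration (not part of the commit) of why errors='ignore' is the default here; the byte string is made up, but review text pulled from the archives is not always clean UTF-8:

raw = 'Reviewer: J\xf6rg'            # latin-1 bytes, not valid UTF-8 (Python 2 str)

try:
    raw.decode('utf-8')              # a strict decode raises on the stray byte
except UnicodeDecodeError:
    pass

print raw.decode('utf-8', 'ignore')  # "Reviewer: Jrg" -- the undecodable byte is dropped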
@@ -51,7 +51,7 @@ from ietf.doc.utils import ( add_links_in_new_revision_events, augment_events_wi
     can_adopt_draft, get_chartering_type, get_document_content, get_tags_for_stream_id,
     needed_ballot_positions, nice_consensus, prettify_std_name, update_telechat, has_same_ballot,
     get_initial_notify, make_notify_changed_event, crawl_history, default_consensus,
-    add_events_message_info)
+    add_events_message_info, get_unicode_document_content)
 from ietf.community.utils import augment_docs_with_tracking_info
 from ietf.group.models import Role
 from ietf.group.utils import can_manage_group, can_manage_materials
@@ -582,7 +582,7 @@ def document_main(request, name, rev=None):
     if doc.type_id == "review":
         basename = "{}.txt".format(doc.name, doc.rev)
         pathname = os.path.join(doc.get_file_path(), basename)
-        content = get_document_content(basename, pathname, split=False)
+        content = get_unicode_document_content(basename, pathname, split=False)

         review_req = ReviewRequest.objects.filter(review=doc.name).first()

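One subtlety in the hunk above: doc.rev is passed to format() but the format string has only one placeholder, so the basename carries no revision number; this matches the unversioned filenames the migration writes (its "! These files right now are created with no version number?" comment flags the same point). A quick illustration with a made-up document name:

print "{}.txt".format("review-example-genart-lc-smith", "00")   # surplus argument is ignored -> review-example-genart-lc-smith.txt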
@@ -71,3 +71,23 @@ def markup(content, split=True):
         #return (content1, content2)
     else:
         return "<pre>" + content + "</pre>\n"
+
+def markup_unicode(content, split=True):
+    # normalize line endings to LF only
+    content = content.replace("\r\n", "\n")
+    content = content.replace("\r", "\n")
+
+    # remove leading white space
+    content = content.lstrip()
+    # remove runs of blank lines
+    content = re.sub("\n\n\n+", "\n\n", content)
+
+    # expand tabs + escape
+    content = escape(content.expandtabs())
+
+    if split:
+        n = content.find("\n", 5000)
+        content1 = "<pre>"+content[:n+1]+"</pre>\n"
+        return content1
+    else:
+        return "<pre>" + content + "</pre>\n"
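A rough sketch (not part of the commit) of what markup_unicode does to a review, using cgi.escape from the Python 2 standard library as a stand-in for the escape() helper that markup_txt already uses, and a made-up snippet of review text:

import re
from cgi import escape

content = u"Reviewer: A. Person\r\n\r\n\r\n\tSummary: <ready>\r\n"
content = content.replace("\r\n", "\n").replace("\r", "\n")   # normalize line endings
content = content.lstrip()                                    # drop leading whitespace
content = re.sub("\n\n\n+", "\n\n", content)                  # collapse runs of blank lines
content = escape(content.expandtabs())                        # expand tabs, then HTML-escape
print "<pre>" + content + "</pre>\n"                          # the split=False branch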
@@ -1,6 +1,7 @@
 # -*- conf-mode -*-
 setuptools>=18.5          # Require this first, to prevent later errors
 #
+beautifulsoup4>=4.5.1
 bibtexparser>=0.6.2
 coverage>=4.0.1,!=4.0.2
 #cssselect>=0.6.1         # for PyQuery
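After patching this into production, a hypothetical spot-check in a Django shell on the server could confirm how many review documents still lack a text file; the straggler URLs listed in the migration's trailing comment are expected to remain. This snippet is an illustration only, assuming the get_file_path() accessor used in the view change above:

import os
from ietf.doc.models import Document

missing = [d.name for d in Document.objects.filter(type='review')
           if not os.path.isfile(os.path.join(d.get_file_path(), '%s.txt' % d.name))]
print len(missing), "review documents still have no text file"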