Merged in [12461] from rjsparks@nostrum.com:

Added migration to fetch text from reviews in the mail archives and populate the review documents. Fixes #2064.  Will patch into production.
 - Legacy-Id: 12463
Note: SVN reference [12461] has been migrated to Git commit 63a9599bafdcacf49f1bb374b2156d8473da93d1
This commit is contained in:
Henrik Levkowetz 2016-12-05 21:03:49 +00:00
parent 7ad38caec2
commit e4ce339235
5 changed files with 104 additions and 2 deletions

View file

@@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import debug # pyflakes:ignore
import contextlib
import os
import urllib2
from bs4 import BeautifulSoup
from tqdm import tqdm
from django.db import migrations
from django.conf import settings
def get_filename(doc):
    """Return the on-disk path for the text of the review document *doc*.

    The directory comes from settings.DOCUMENT_PATH_PATTERN; the file
    name is the bare document name plus a ``.txt`` suffix.
    """
    directory = settings.DOCUMENT_PATH_PATTERN.format(doc=doc)
    # NOTE(review): these files are currently created without a version
    # number, so the revision is deliberately left out of the name.
    # name = '%s-%s.txt' % (doc.name, doc.rev)
    return os.path.join(directory, '%s.txt' % doc.name)
def forward(apps, schema_editor):
    """Populate review documents with text fetched from the mail archives.

    For every review Document whose external_url points into one of the
    two IETF mail archive web interfaces, download the message page,
    extract the message body, and write it to the document's file —
    unless that file already exists on disk.
    """
    Document = apps.get_model('doc', 'Document')

    # Old Mhonarc archive: the body sits between two HTML comment markers.
    mhonarc_docs = Document.objects.filter(
        type='review', external_url__contains="www.ietf.org/mail-archive/web")
    for doc in tqdm(mhonarc_docs, desc="Pointers into Mhonarc"):
        filename = get_filename(doc)
        if os.path.isfile(filename):
            continue
        with contextlib.closing(urllib2.urlopen(doc.external_url)) as infile:
            fullcontents = infile.read().decode('utf-8', 'ignore')
        marker = '<!--X-Body-of-Message-->'
        start = fullcontents.find(marker)
        end = fullcontents.find('<!--X-Body-of-Message-End-->')
        bodyblock = fullcontents[start + len(marker):end]
        text = BeautifulSoup(bodyblock, "lxml").get_text('\n\n')
        # Undo line breaks that the archive rendering inserted inside two
        # known URLs in the review boilerplate.
        text = text.replace('FAQ at <\n\nhttp://wiki.tools', 'FAQ at <http://wiki.tools')
        text = text.replace('wiki/GenArtfaq\n\n>', 'wiki/GenArtfaq>')
        with contextlib.closing(open(filename, 'w')) as outfile:
            outfile.write(text.encode('utf8'))

    # New mail archive: the body is in <div id="msg-payload"><pre>...</pre>.
    mailarchive_docs = Document.objects.filter(
        type='review', external_url__contains="mailarchive.ietf.org")
    for doc in tqdm(mailarchive_docs, desc="Pointers into Mailarchive"):
        filename = get_filename(doc)
        if os.path.isfile(filename):
            continue
        with contextlib.closing(urllib2.urlopen(doc.external_url)) as infile:
            fullcontents = infile.read().decode('utf-8', 'ignore')
        soup = BeautifulSoup(fullcontents, "lxml")
        divpre = soup.find('div', {"id": "msg-payload"}).find('pre')
        text = divpre.get_text('\n\n')
        with contextlib.closing(open(filename, 'w')) as outfile:
            outfile.write(text.encode('utf8'))

## After this migration, we should figure out what to do with these stragglers:
## In [29]: Document.objects.filter(type='review').exclude(Q(external_url__contains="mailarchive")|Q(external_url__contains="mail-archive")).values_list('external_url',flat=True)
## Out[29]: [u'https://art.tools.ietf.org/tools/art/genart/index.cgi/t=1909/review_edit?reviewid=2300', u'https://art.tools.ietf.org/tools/art/genart/index.cgi/t=8460/review_edit?reviewid=2735', u'https://www.ietf.org/ibin/c5i?mid=6&rid=49&gid=0&k1=933&k2=55337&tid=1296220835', u'https://www.ietf.org/mailman/private/tsv-dir/2012-February/002007.html', u'', u'']
def reverse(apps, schema_editor):
    """No-op on rollback: the fetched review text files are left in place."""
    return None
class Migration(migrations.Migration):
    """Data migration: fetch review text from the mail archives (issue #2064)."""

    dependencies = [
        ('doc', '0016_auto_20160927_0713'),
    ]

    operations = [
        migrations.RunPython(forward, reverse),
    ]

View file

@@ -297,6 +297,20 @@ def add_events_message_info(events):
e.in_reply_to = e.addedmessageevent.in_reply_to
def get_unicode_document_content(key, filename, split=True, markup=True, codec='utf-8', errors='ignore'):
    """Read *filename* and return its contents as unicode text.

    key      -- identifier included in the error message if the file
                cannot be read
    filename -- path of the file to read
    split    -- passed through to markup_txt.markup_unicode when markup
                is requested
    markup   -- when true, run the text through markup_txt.markup_unicode
    codec    -- codec used to decode the raw file bytes
    errors   -- decoding error handling mode

    On IOError an error string is returned instead of the file contents.
    """
    try:
        with open(filename, 'rb') as f:
            raw_content = f.read().decode(codec, errors)
    except IOError:
        return "Error; cannot read (" + key + ")"
    if not markup:
        return raw_content
    return markup_txt.markup_unicode(raw_content, split)
def get_document_content(key, filename, split=True, markup=True):
try:
with open(filename, 'rb') as f:

View file

@@ -51,7 +51,7 @@ from ietf.doc.utils import ( add_links_in_new_revision_events, augment_events_wi
can_adopt_draft, get_chartering_type, get_document_content, get_tags_for_stream_id,
needed_ballot_positions, nice_consensus, prettify_std_name, update_telechat, has_same_ballot,
get_initial_notify, make_notify_changed_event, crawl_history, default_consensus,
add_events_message_info)
add_events_message_info, get_unicode_document_content)
from ietf.community.utils import augment_docs_with_tracking_info
from ietf.group.models import Role
from ietf.group.utils import can_manage_group, can_manage_materials
@@ -582,7 +582,7 @@ def document_main(request, name, rev=None):
if doc.type_id == "review":
basename = "{}.txt".format(doc.name, doc.rev)
pathname = os.path.join(doc.get_file_path(), basename)
content = get_document_content(basename, pathname, split=False)
content = get_unicode_document_content(basename, pathname, split=False)
review_req = ReviewRequest.objects.filter(review=doc.name).first()

View file

@@ -71,3 +71,23 @@ def markup(content, split=True):
#return (content1, content2)
else:
return "<pre>" + content + "</pre>\n"
def markup_unicode(content, split=True):
    """Normalize unicode *content* and wrap it in <pre> tags for display.

    Line endings are normalized to LF, leading white space and runs of
    blank lines are trimmed, tabs are expanded, and the text is
    HTML-escaped.  When *split* is true, only the text up to the first
    newline after position 5000 is returned; otherwise the whole text
    is returned.
    """
    # Normalize line endings to LF only.
    content = content.replace("\r\n", "\n")
    content = content.replace("\r", "\n")
    # Remove leading white space.
    content = content.lstrip()
    # Collapse runs of blank lines.
    content = re.sub("\n\n\n+", "\n\n", content)
    # Expand tabs, then escape HTML special characters.
    content = escape(content.expandtabs())
    if split:
        n = content.find("\n", 5000)
        if n == -1:
            # Bug fix: find() returns -1 when there is no newline after
            # position 5000 (e.g. short content), and content[:n+1]
            # would then be empty.  Return the whole text instead.
            return "<pre>" + content + "</pre>\n"
        return "<pre>" + content[:n+1] + "</pre>\n"
    else:
        return "<pre>" + content + "</pre>\n"

View file

@@ -1,6 +1,7 @@
# -*- conf-mode -*-
setuptools>=18.5 # Require this first, to prevent later errors
#
beautifulsoup4>=4.5.1
bibtexparser>=0.6.2
coverage>=4.0.1,!=4.0.2
#cssselect>=0.6.1 # for PyQuery