Merged in [12461] from rjsparks@nostrum.com:
Added migration to fetch text from reviews in the mail archives and populate the review documents. Fixes #2064. Will patch into production.
- Legacy-Id: 12463
Note: SVN reference [12461] has been migrated to Git commit 63a9599bafdcacf49f1bb374b2156d8473da93d1
This commit is contained in:
parent 7ad38caec2
commit e4ce339235

ietf/doc/migrations/0017_fill_review_document_contents.py (new file, 67 lines)
@@ -0,0 +1,67 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import debug    # pyflakes:ignore
+
+import contextlib
+import os
+import urllib2
+
+from bs4 import BeautifulSoup
+from tqdm import tqdm
+
+from django.db import migrations
+from django.conf import settings
+
+def get_filename(doc):
+    path = settings.DOCUMENT_PATH_PATTERN.format(doc=doc)
+    # ! These files right now are created with no version number?
+    #name = '%s-%s.txt' % (doc.name,doc.rev)
+    name = '%s.txt' % (doc.name,)
+    return os.path.join(path,name)
+
+def forward(apps,schema_editor):
+    # for each qualifying document
+    Document = apps.get_model('doc','Document')
+
+    for doc in tqdm(Document.objects.filter(type='review',external_url__contains="www.ietf.org/mail-archive/web"),desc="Pointers into Mhonarc"):
+        filename = get_filename(doc)
+        if not os.path.isfile(filename):
+            with contextlib.closing(urllib2.urlopen(doc.external_url)) as infile:
+                fullcontents = infile.read().decode('utf-8', 'ignore')
+            start = fullcontents.find('<!--X-Body-of-Message-->')
+            end = fullcontents.find('<!--X-Body-of-Message-End-->')
+            bodyblock = fullcontents[start+len('<!--X-Body-of-Message-->'):end]
+            text = BeautifulSoup(bodyblock,"lxml").get_text('\n\n') \
+                   .replace('FAQ at <\n\nhttp://wiki.tools','FAQ at <http://wiki.tools') \
+                   .replace('wiki/GenArtfaq\n\n>','wiki/GenArtfaq>')
+            with contextlib.closing(open(filename,'w')) as outfile:
+                outfile.write(text.encode('utf8'))
+
+    for doc in tqdm(Document.objects.filter(type='review',external_url__contains="mailarchive.ietf.org"),desc="Pointers into Mailarchive"):
+        filename = get_filename(doc)
+        if not os.path.isfile(filename):
+            with contextlib.closing(urllib2.urlopen(doc.external_url)) as infile:
+                fullcontents = infile.read().decode('utf-8', 'ignore')
+            soup = BeautifulSoup(fullcontents,"lxml")
+            divpre = soup.find('div',{"id":"msg-payload"}).find('pre')
+            text = divpre.get_text('\n\n')
+            with contextlib.closing(open(filename,'w')) as outfile:
+                outfile.write(text.encode('utf8'))
+
+## After this migration, we should figure out what to do with these stragglers:
+## In [29]: Document.objects.filter(type='review').exclude(Q(external_url__contains="mailarchive")|Q(external_url__contains="mail-archive")).values_list('external_url',flat=True)
+## Out[29]: [u'https://art.tools.ietf.org/tools/art/genart/index.cgi/t=1909/review_edit?reviewid=2300', u'https://art.tools.ietf.org/tools/art/genart/index.cgi/t=8460/review_edit?reviewid=2735', u'https://www.ietf.org/ibin/c5i?mid=6&rid=49&gid=0&k1=933&k2=55337&tid=1296220835', u'https://www.ietf.org/mailman/private/tsv-dir/2012-February/002007.html', u'', u'']
+
+def reverse(apps,schema_editor):
+    pass
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('doc', '0016_auto_20160927_0713'),
+    ]
+
+    operations = [
+        migrations.RunPython(forward,reverse)
+    ]
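The extraction logic in the migration above can be spot-checked outside of a migration run. Below is a minimal sketch, assuming the same environment the migration relies on (Python 2 with urllib2, plus BeautifulSoup 4 and the lxml parser); the function name fetch_review_text and the example URL are illustrative and not part of the commit.

import contextlib
import urllib2

from bs4 import BeautifulSoup

def fetch_review_text(url):
    # Fetch one archive page and return the review body as plain text,
    # mirroring the two cases handled by the migration above.
    with contextlib.closing(urllib2.urlopen(url)) as infile:
        fullcontents = infile.read().decode('utf-8', 'ignore')
    if 'mail-archive/web' in url:
        # Old MHonArc pages bracket the message body with HTML comments.
        start = fullcontents.find('<!--X-Body-of-Message-->')
        end = fullcontents.find('<!--X-Body-of-Message-End-->')
        body = fullcontents[start + len('<!--X-Body-of-Message-->'):end]
        return BeautifulSoup(body, "lxml").get_text('\n\n')
    else:
        # Newer mailarchive pages carry the message in <div id="msg-payload"><pre>.
        soup = BeautifulSoup(fullcontents, "lxml")
        return soup.find('div', {"id": "msg-payload"}).find('pre').get_text('\n\n')

# e.g.: print fetch_review_text('https://mailarchive.ietf.org/arch/msg/gen-art/...')[:200]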
@@ -297,6 +297,20 @@ def add_events_message_info(events):
         e.in_reply_to = e.addedmessageevent.in_reply_to


+def get_unicode_document_content(key, filename, split=True, markup=True, codec='utf-8', errors='ignore'):
+    try:
+        with open(filename, 'rb') as f:
+            raw_content = f.read().decode(codec,errors)
+    except IOError:
+        error = "Error; cannot read ("+key+")"
+        return error
+
+    if markup:
+        return markup_txt.markup_unicode(raw_content, split)
+    else:
+        return raw_content
+
+
 def get_document_content(key, filename, split=True, markup=True):
     try:
         with open(filename, 'rb') as f:
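The new helper differs from get_document_content mainly in that it decodes the file with a configurable codec and error mode instead of returning raw bytes. A small illustration (not part of the commit) of why errors='ignore' is the default here; the byte string is made up, but review text pulled from the archives is not always clean UTF-8:

raw = 'Reviewer: J\xf6rg'            # latin-1 bytes, not valid UTF-8 (Python 2 str)

try:
    raw.decode('utf-8')              # a strict decode raises on the stray byte
except UnicodeDecodeError:
    pass

print raw.decode('utf-8', 'ignore')  # "Reviewer: Jrg" -- the undecodable byte is dropped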
@@ -51,7 +51,7 @@ from ietf.doc.utils import ( add_links_in_new_revision_events, augment_events_wi
     can_adopt_draft, get_chartering_type, get_document_content, get_tags_for_stream_id,
     needed_ballot_positions, nice_consensus, prettify_std_name, update_telechat, has_same_ballot,
     get_initial_notify, make_notify_changed_event, crawl_history, default_consensus,
-    add_events_message_info)
+    add_events_message_info, get_unicode_document_content)
 from ietf.community.utils import augment_docs_with_tracking_info
 from ietf.group.models import Role
 from ietf.group.utils import can_manage_group, can_manage_materials
@@ -582,7 +582,7 @@ def document_main(request, name, rev=None):
     if doc.type_id == "review":
         basename = "{}.txt".format(doc.name, doc.rev)
         pathname = os.path.join(doc.get_file_path(), basename)
-        content = get_document_content(basename, pathname, split=False)
+        content = get_unicode_document_content(basename, pathname, split=False)

         review_req = ReviewRequest.objects.filter(review=doc.name).first()

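One subtlety in the hunk above: doc.rev is passed to format() but the format string has only one placeholder, so the basename carries no revision number; this matches the unversioned filenames the migration writes (its "! These files right now are created with no version number?" comment flags the same point). A quick illustration with a made-up document name:

print "{}.txt".format("review-example-genart-lc-smith", "00")   # surplus argument is ignored -> review-example-genart-lc-smith.txt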
@@ -71,3 +71,23 @@ def markup(content, split=True):
         #return (content1, content2)
     else:
         return "<pre>" + content + "</pre>\n"
+
+def markup_unicode(content, split=True):
+    # normalize line endings to LF only
+    content = content.replace("\r\n", "\n")
+    content = content.replace("\r", "\n")
+
+    # remove leading white space
+    content = content.lstrip()
+    # remove runs of blank lines
+    content = re.sub("\n\n\n+", "\n\n", content)
+
+    # expand tabs + escape
+    content = escape(content.expandtabs())
+
+    if split:
+        n = content.find("\n", 5000)
+        content1 = "<pre>"+content[:n+1]+"</pre>\n"
+        return content1
+    else:
+        return "<pre>" + content + "</pre>\n"
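A rough sketch (not part of the commit) of what markup_unicode does to a review, using cgi.escape from the Python 2 standard library as a stand-in for the escape() helper that markup_txt already uses, and a made-up snippet of review text:

import re
from cgi import escape

content = u"Reviewer: A. Person\r\n\r\n\r\n\tSummary: <ready>\r\n"
content = content.replace("\r\n", "\n").replace("\r", "\n")   # normalize line endings
content = content.lstrip()                                    # drop leading whitespace
content = re.sub("\n\n\n+", "\n\n", content)                  # collapse runs of blank lines
content = escape(content.expandtabs())                        # expand tabs, then HTML-escape
print "<pre>" + content + "</pre>\n"                          # the split=False branch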
@@ -1,6 +1,7 @@
 # -*- conf-mode -*-
 setuptools>=18.5          # Require this first, to prevent later errors
 #
+beautifulsoup4>=4.5.1
 bibtexparser>=0.6.2
 coverage>=4.0.1,!=4.0.2
 #cssselect>=0.6.1         # for PyQuery
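After patching this into production, a hypothetical spot-check in a Django shell on the server could confirm how many review documents still lack a text file; the straggler URLs listed in the migration's trailing comment are expected to remain. This snippet is an illustration only, assuming the get_file_path() accessor used in the view change above:

import os
from ietf.doc.models import Document

missing = [d.name for d in Document.objects.filter(type='review')
           if not os.path.isfile(os.path.join(d.get_file_path(), '%s.txt' % d.name))]
print len(missing), "review documents still have no text file"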