diff --git a/ietf/doc/migrations/0017_fill_review_document_contents.py b/ietf/doc/migrations/0017_fill_review_document_contents.py
new file mode 100644
index 000000000..ca5590368
--- /dev/null
+++ b/ietf/doc/migrations/0017_fill_review_document_contents.py
@@ -0,0 +1,69 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+import debug # pyflakes:ignore
+
+import contextlib
+import os
+import urllib2
+
+from bs4 import BeautifulSoup
+from tqdm import tqdm
+
+from django.db import migrations
+from django.conf import settings
+
+def get_filename(doc):
+    """Return the path of the unversioned <name>.txt file that holds the text of doc."""
+    path = settings.DOCUMENT_PATH_PATTERN.format(doc=doc)
+    # ! These files right now are created with no version number?
+    #name = '%s-%s.txt' % (doc.name,doc.rev)
+    name = '%s.txt' % (doc.name,)
+    return os.path.join(path,name)
+
+def forward(apps,schema_editor):
+    """Download the text of each archived review that has no local file yet and write it to disk."""
+    # for each qualifying document
+    Document = apps.get_model('doc','Document')
+
+    for doc in tqdm(Document.objects.filter(type='review',external_url__contains="www.ietf.org/mail-archive/web"),desc="Pointers into Mhonarc"):
+        filename = get_filename(doc)
+        if not os.path.isfile(filename):
+            with contextlib.closing(urllib2.urlopen(doc.external_url)) as infile:
+                fullcontents = infile.read().decode('utf-8', 'ignore');
+                start = fullcontents.find('<!--X-Body-of-Message-->')
+                end = fullcontents.find('<!--X-Body-of-Message-End-->')
+                bodyblock=fullcontents[start+len('<!--X-Body-of-Message-->'):end]
+                text = BeautifulSoup(bodyblock,"lxml").get_text('\n\n') \
+                           .replace('FAQ at <\n\nhttp://wiki.tools','FAQ at <http://wiki.tools') \
+                           .replace('wiki/GenArtfaq\n\n>','wiki/GenArtfaq>')
+            with contextlib.closing(open(filename,'w')) as outfile:
+                outfile.write(text.encode('utf8'))
+
+    for doc in tqdm(Document.objects.filter(type='review',external_url__contains="mailarchive.ietf.org"),desc="Pointers into Mailarchive"):
+        filename = get_filename(doc)
+        if not os.path.isfile(filename):
+            with contextlib.closing(urllib2.urlopen(doc.external_url)) as infile:
+                fullcontents = infile.read().decode('utf-8', 'ignore');
+                soup = BeautifulSoup(fullcontents,"lxml")
+                divpre = soup.find('div',{"id":"msg-payload"}).find('pre')
+                text = divpre.get_text('\n\n')
+            with contextlib.closing(open(filename,'w')) as outfile:
+                outfile.write(text.encode('utf8'))
+
+    ## After this migration, we should figure out what to do with these stragglers:
+    ## In [29]: Document.objects.filter(type='review').exclude(Q(external_url__contains="mailarchive")|Q(external_url__contains="mail-archive")).values_list('external_url',flat=True)
+    ## Out[29]: [u'https://art.tools.ietf.org/tools/art/genart/index.cgi/t=1909/review_edit?reviewid=2300', u'https://art.tools.ietf.org/tools/art/genart/index.cgi/t=8460/review_edit?reviewid=2735', u'https://www.ietf.org/ibin/c5i?mid=6&rid=49&gid=0&k1=933&k2=55337&tid=1296220835', u'https://www.ietf.org/mailman/private/tsv-dir/2012-February/002007.html', u'', u'']
+
+def reverse(apps,schema_editor):
+    pass
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('doc', '0016_auto_20160927_0713'),
+    ]
+
+    operations = [
+        migrations.RunPython(forward,reverse)
+    ]
diff --git a/ietf/doc/utils.py b/ietf/doc/utils.py
index a7a3b7be0..75bdae6ff 100644
--- a/ietf/doc/utils.py
+++ b/ietf/doc/utils.py
@@ -297,6 +297,21 @@ def add_events_message_info(events):
         e.in_reply_to = e.addedmessageevent.in_reply_to
 
 
+def get_unicode_document_content(key, filename, split=True, markup=True, codec='utf-8', errors='ignore'):
+    """Return the content of filename decoded with codec (marked up if requested), or an error string if it cannot be read."""
+    try:
+        with open(filename, 'rb') as f:
+            raw_content = f.read().decode(codec,errors)
+    except IOError:
+        error = "Error; cannot read ("+key+")"
+        return error
+
+    if markup:
+        return markup_txt.markup_unicode(raw_content, split)
+    else:
+        return raw_content
+
+
 def get_document_content(key, filename, split=True, markup=True):
     try:
         with open(filename, 'rb') as f:
diff --git a/ietf/doc/views_doc.py b/ietf/doc/views_doc.py
index 5d365e656..de1ff526a 100644
--- a/ietf/doc/views_doc.py
+++ b/ietf/doc/views_doc.py
@@ -51,7 +51,7 @@ from ietf.doc.utils import ( add_links_in_new_revision_events, augment_events_wi
     can_adopt_draft, get_chartering_type, get_document_content, get_tags_for_stream_id,
     needed_ballot_positions, nice_consensus, prettify_std_name, update_telechat, has_same_ballot,
     get_initial_notify, make_notify_changed_event, crawl_history, default_consensus,
-    add_events_message_info)
+    add_events_message_info, get_unicode_document_content)
 from ietf.community.utils import augment_docs_with_tracking_info
 from ietf.group.models import Role
 from ietf.group.utils import can_manage_group, can_manage_materials
@@ -582,7 +582,7 @@ def document_main(request, name, rev=None):
     if doc.type_id == "review":
         basename = "{}.txt".format(doc.name, doc.rev)
         pathname = os.path.join(doc.get_file_path(), basename)
-        content = get_document_content(basename, pathname, split=False)
+        content = get_unicode_document_content(basename, pathname, split=False)
 
         review_req = ReviewRequest.objects.filter(review=doc.name).first()
 
diff --git a/ietf/utils/markup_txt.py b/ietf/utils/markup_txt.py
index 3094ba98b..f172f9b1e 100644
--- a/ietf/utils/markup_txt.py
+++ b/ietf/utils/markup_txt.py
@@ -71,3 +71,28 @@ def markup(content, split=True):
         #return (content1, content2)
     else:
         return "<pre>" + content + "</pre>\n"
+
+def markup_unicode(content, split=True):
+    """Return content normalized, escaped and wrapped in <pre>; with split, cut at the first line break at or after offset 5000."""
+    # normalize line endings to LF only
+    content = content.replace("\r\n", "\n")
+    content = content.replace("\r", "\n")
+
+    # remove leading white space
+    content = content.lstrip()
+    # remove runs of blank lines
+    content = re.sub("\n\n\n+", "\n\n", content)
+
+    # expand tabs + escape
+    content = escape(content.expandtabs())
+
+    if split:
+        n = content.find("\n", 5000)
+        if n < 0:
+            # No line break at or beyond the cut-off: keep the whole text
+            # instead of slicing to content[:0] and returning an empty block.
+            n = len(content) - 1
+        content1 = "<pre>"+content[:n+1]+"</pre>\n"
+        return content1
+    else:
+        return "<pre>" + content + "</pre>\n"
diff --git a/requirements.txt b/requirements.txt
index 3ac966cf7..6dae2b088 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 # -*- conf-mode -*-
 setuptools>=18.5 # Require this first, to prevent later errors
 #
+beautifulsoup4>=4.5.1
 bibtexparser>=0.6.2
 coverage>=4.0.1,!=4.0.2
 #cssselect>=0.6.1 # for PyQuery