From 967ece7e7d192ad7c36cc415fc735af8fbf19678 Mon Sep 17 00:00:00 2001 From: Henrik Levkowetz Date: Fri, 8 Dec 2017 21:51:11 +0000 Subject: [PATCH] Started refactoring of reading text from document files (drafts, charters, etc.) in order to normalise on one way of doing this, and making that return unicode rather than undecoded bytes. This is the first step of two, in order to gauge the possible issues and report on discrepancies. - Legacy-Id: 14406 --- ietf/doc/mails.py | 11 ++++- ietf/doc/models.py | 3 ++ ietf/doc/templatetags/ietf_filters.py | 10 +++- ietf/doc/utils.py | 12 +++-- ietf/doc/views_conflict_review.py | 7 ++- ietf/doc/views_doc.py | 52 ++++++++++++++++---- ietf/doc/views_status_change.py | 7 ++- ietf/meeting/forms.py | 10 +++- ietf/secr/telechat/views.py | 11 ++++- ietf/utils/markup_txt.py | 69 ++++++++++----------------- ietf/utils/text.py | 9 ++++ 11 files changed, 135 insertions(+), 66 deletions(-) diff --git a/ietf/doc/mails.py b/ietf/doc/mails.py index b3cd75f1d..fd7c2501c 100644 --- a/ietf/doc/mails.py +++ b/ietf/doc/mails.py @@ -8,6 +8,8 @@ from django.utils.html import strip_tags from django.conf import settings from django.urls import reverse as urlreverse +import debug # pyflakes:ignore + from ietf.utils.mail import send_mail, send_mail_text from ietf.ipr.utils import iprs_from_docs, related_docs from ietf.doc.models import WriteupDocEvent, LastCallDocEvent, DocAlias, ConsensusDocEvent @@ -15,6 +17,7 @@ from ietf.doc.utils import needed_ballot_positions, get_document_content from ietf.group.models import Role from ietf.doc.models import Document from ietf.mailtrigger.utils import gather_address_lists +from ietf.utils import log def email_state_changed(request, doc, text, mailtrigger_id=None): (to,cc) = gather_address_lists(mailtrigger_id or 'doc_state_edited',doc=doc) @@ -515,7 +518,13 @@ def email_charter_internal_review(request, charter): os.path.join(settings.CHARTER_PATH,filename), split=False, markup=False, - ) + ).decode('utf-8') + utext = charter.text_or_error() # pyflakes:ignore + if charter_text and charter_text != utext and not 'Error; cannot read' in charter_text: + debug.show('charter_text[:64]') + debug.show('utext[:64]') + log.assertion('charter_text == utext') + send_mail(request, addrs.to, settings.DEFAULT_FROM_EMAIL, 'Internal %s Review: %s (%s)'%(charter.group.type.name,charter.group.name,charter.group.acronym), 'doc/mail/charter_internal_review.txt', diff --git a/ietf/doc/models.py b/ietf/doc/models.py index 6e254b8f4..6e6f7f100 100644 --- a/ietf/doc/models.py +++ b/ietf/doc/models.py @@ -449,6 +449,9 @@ class DocumentInfo(models.Model): # return text + def text_or_error(self): + return self.text() or "Error; cannot read (%s)"%self.get_file_name() + def htmlized(self): name = self.get_base_name() text = self.text() diff --git a/ietf/doc/templatetags/ietf_filters.py b/ietf/doc/templatetags/ietf_filters.py index fad6bb321..b1db0b417 100644 --- a/ietf/doc/templatetags/ietf_filters.py +++ b/ietf/doc/templatetags/ietf_filters.py @@ -18,7 +18,7 @@ import debug # pyflakes:ignore from ietf.doc.models import ConsensusDocEvent from ietf.doc.utils import get_document_content from ietf.utils.text import wordwrap, fill, wrap_text_if_unwrapped - +from ietf.utils import log register = template.Library() @@ -509,7 +509,13 @@ def document_content(doc): if doc is None: return None path = os.path.join(doc.get_file_path(),doc.filename_with_rev()) - return get_document_content(doc.name,path,markup=False) + content = get_document_content(doc.name,path,markup=False) + utext = doc.text_or_error() # pyflakes:ignore + if content and content != utext and not 'Error; cannot read' in content: + debug.show('content[:64]') + debug.show('utext[:64]') + log.assertion('content == utext') + return content @register.filter def format_timedelta(timedelta): diff --git a/ietf/doc/utils.py b/ietf/doc/utils.py index e8ea4e9e0..64291853d 100644 --- a/ietf/doc/utils.py +++ b/ietf/doc/utils.py @@ -22,7 +22,7 @@ from ietf.doc.models import TelechatDocEvent from ietf.name.models import DocReminderTypeName, DocRelationshipName from ietf.group.models import Role from ietf.ietfauth.utils import has_role -from ietf.utils import draft, markup_txt +from ietf.utils import draft from ietf.utils.mail import send_mail from ietf.mailtrigger.utils import gather_address_lists @@ -299,6 +299,7 @@ def get_unicode_document_content(key, filename, codec='utf-8', errors='ignore'): return raw_content def get_document_content(key, filename, split=True, markup=True): + #log.unreachable("2017-12-05") try: with open(filename, 'rb') as f: raw_content = f.read() @@ -306,10 +307,11 @@ def get_document_content(key, filename, split=True, markup=True): error = "Error; cannot read ("+key+")" return error - if markup: - return markup_txt.markup(raw_content, split) - else: - return raw_content +# if markup: +# return markup_txt.markup(raw_content, split) +# else: +# return raw_content + return raw_content def tags_suffix(tags): return (u"::" + u"::".join(t.name for t in tags)) if tags else u"" diff --git a/ietf/doc/views_conflict_review.py b/ietf/doc/views_conflict_review.py index 833deac9a..e7375a20b 100644 --- a/ietf/doc/views_conflict_review.py +++ b/ietf/doc/views_conflict_review.py @@ -254,7 +254,12 @@ def edit_ad(request, name): def default_approval_text(review): filename = "%s-%s.txt" % (review.canonical_name(), review.rev) - current_text = get_document_content(filename, os.path.join(settings.CONFLICT_REVIEW_PATH, filename), split=False, markup=False) + current_text = get_document_content(filename, os.path.join(settings.CONFLICT_REVIEW_PATH, filename), split=False, markup=False).decode('utf-8') + utext = review.text_or_error() # pyflakes:ignore + if current_text and current_text != utext and not 'Error; cannot read' in current_text: + debug.show('current_text[:64]') + debug.show('utext[:64]') + log.assertion('current_text == utext') conflictdoc = review.relateddocument_set.get(relationship__slug='conflrev').target.document if conflictdoc.stream_id=='ise': diff --git a/ietf/doc/views_doc.py b/ietf/doc/views_doc.py index a7063ef2e..4097fd581 100644 --- a/ietf/doc/views_doc.py +++ b/ietf/doc/views_doc.py @@ -66,6 +66,8 @@ from ietf.meeting.utils import group_sessions, get_upcoming_manageable_sessions, from ietf.review.models import ReviewRequest from ietf.review.utils import can_request_review_of_doc, review_requests_to_list_for_docs from ietf.review.utils import no_review_from_teams_on_doc +from ietf.utils import markup_txt, log +from ietf.utils.text import maybe_split def render_document_top(request, doc, tab, name): @@ -186,7 +188,13 @@ def document_main(request, name, rev=None): filename = name + ".txt" content = get_document_content(filename, os.path.join(settings.RFC_PATH, filename), - split_content, markup=True) + split_content, markup=True).decode('utf-8') + utext = doc.text_or_error() # pyflakes:ignore + if content and content != utext and not 'Error; cannot read' in content: + debug.show('content[:64]') + debug.show('utext[:64]') + log.assertion('content == utext') + content = markup_txt.markup(maybe_split(content, split=split_content)) # file types base_path = os.path.join(settings.RFC_PATH, name + ".") @@ -216,7 +224,13 @@ def document_main(request, name, rev=None): filename = "%s-%s.txt" % (draft_name, doc.rev) content = get_document_content(filename, os.path.join(settings.INTERNET_ALL_DRAFTS_ARCHIVE_DIR, filename), - split_content, markup=True) + split_content, markup=True).decode('utf-8') + utext = doc.text_or_error() # pyflakes:ignore + if content and content != utext and not 'Error; cannot read' in content: + debug.show('content[:64]') + debug.show('utext[:64]') + log.assertion('content == utext') + content = markup_txt.markup(maybe_split(content, split=split_content)) # file types base_path = os.path.join(settings.INTERNET_DRAFT_PATH, doc.name + "-" + doc.rev + ".") @@ -439,7 +453,13 @@ def document_main(request, name, rev=None): if doc.type_id == "charter": filename = "%s-%s.txt" % (doc.canonical_name(), doc.rev) - content = get_document_content(filename, os.path.join(settings.CHARTER_PATH, filename), split=False, markup=True) + content = get_document_content(filename, os.path.join(settings.CHARTER_PATH, filename), split=False, markup=True).decode('utf-8') + utext = doc.text_or_error() # pyflakes:ignore + if content and content != utext and not 'Error; cannot read' in content: + debug.show('content[:64]') + debug.show('utext[:64]') + log.assertion('content == utext') + content = markup_txt.markup(content) ballot_summary = None if doc.get_state_slug() in ("intrev", "iesgrev"): @@ -480,9 +500,15 @@ def document_main(request, name, rev=None): if doc.rev == "00" and not os.path.isfile(pathname): # This could move to a template - content = "A conflict review response has not yet been proposed." + content = u"A conflict review response has not yet been proposed." else: - content = get_document_content(filename, pathname, split=False, markup=True) + content = get_document_content(filename, pathname, split=False, markup=True).decode('utf-8') + utext = doc.text_or_error() # pyflakes:ignore + if content and content != utext and not 'Error; cannot read' in content: + debug.show('content[:64]') + debug.show('utext[:64]') + log.assertion('content == utext') + content = markup_txt.markup(content) ballot_summary = None if doc.get_state_slug() in ("iesgeval") and doc.active_ballot(): @@ -507,9 +533,14 @@ def document_main(request, name, rev=None): if doc.rev == "00" and not os.path.isfile(pathname): # This could move to a template - content = "Status change text has not yet been proposed." + content = u"Status change text has not yet been proposed." else: - content = get_document_content(filename, pathname, split=False) + content = get_document_content(filename, pathname, split=False).decode('utf-8') + utext = doc.text_or_error() # pyflakes:ignore + if content and content != utext and not 'Error; cannot read' in content: + debug.show('content[:64]') + debug.show('utext[:64]') + log.assertion('content == utext') ballot_summary = None if doc.get_state_slug() in ("iesgeval"): @@ -562,7 +593,12 @@ def document_main(request, name, rev=None): url = urlbase + extension if extension == ".txt": - content = get_document_content(basename, pathname + extension, split=False) + content = get_document_content(basename, pathname + extension, split=False).decode('utf-8') + utext = doc.text_or_error() # pyflakes:ignore + if content != utext: + debug.show('content[:64]') + debug.show('utext[:64]') + log.assertion('content == utext') t = "plain text" other_types.append((t, url)) diff --git a/ietf/doc/views_status_change.py b/ietf/doc/views_status_change.py index ea47065cc..547906c99 100644 --- a/ietf/doc/views_status_change.py +++ b/ietf/doc/views_status_change.py @@ -282,7 +282,12 @@ def newstatus(relateddoc): def default_approval_text(status_change,relateddoc): filename = "%s-%s.txt" % (status_change.canonical_name(), status_change.rev) - current_text = get_document_content(filename, os.path.join(settings.STATUS_CHANGE_PATH, filename), split=False, markup=False) + current_text = get_document_content(filename, os.path.join(settings.STATUS_CHANGE_PATH, filename), split=False, markup=False).decode('utf-8') + utext = status_change.text_or_error() # pyflakes:ignore + if current_text and current_text != utext and not 'Error; cannot read' in current_text: + debug.show('current_text[:64]') + debug.show('utext[:64]') + log.assertion('current_text == utext') if relateddoc.target.document.std_level.slug in ('std','ps','ds','bcp',): action = "Protocol Action" diff --git a/ietf/meeting/forms.py b/ietf/meeting/forms.py index dfa830361..fc01f686d 100644 --- a/ietf/meeting/forms.py +++ b/ietf/meeting/forms.py @@ -18,6 +18,7 @@ from ietf.meeting.helpers import is_meeting_approved, get_next_agenda_name from ietf.message.models import Message from ietf.person.models import Person from ietf.utils.fields import DatepickerDateField, DurationField +from ietf.utils import log # need to insert empty option for use in ChoiceField # countries.insert(0, ('', '-'*9 )) @@ -220,7 +221,14 @@ class InterimSessionModelForm(forms.ModelForm): if self.instance.agenda(): doc = self.instance.agenda() path = os.path.join(doc.get_file_path(), doc.filename_with_rev()) - self.initial['agenda'] = get_document_content(os.path.basename(path), path, markup=False) + content = get_document_content(os.path.basename(path), path, markup=False).decode('utf-8') + utext = doc.text_or_error() # pyflakes:ignore + if content and content != utext and not 'Error; cannot read' in content: + debug.show('content[:64]') + debug.show('utext[:64]') + log.assertion('content == utext') + self.initial['agenda'] = content + def clean_date(self): '''Date field validator. We can't use required on the input because diff --git a/ietf/secr/telechat/views.py b/ietf/secr/telechat/views.py index 57d0b854f..61334e540 100644 --- a/ietf/secr/telechat/views.py +++ b/ietf/secr/telechat/views.py @@ -6,6 +6,8 @@ from django.forms.formsets import formset_factory from django.shortcuts import render, get_object_or_404, redirect from django.utils.functional import curry +import debug # pyflakes:ignore + from ietf.doc.models import DocEvent, Document, BallotDocEvent, BallotPositionDocEvent, BallotType, WriteupDocEvent from ietf.doc.utils import get_document_content, add_state_change_event from ietf.person.models import Person @@ -15,7 +17,7 @@ from ietf.iesg.models import TelechatDate, TelechatAgendaItem, Telechat from ietf.iesg.agenda import agenda_data, get_doc_section from ietf.ietfauth.utils import role_required from ietf.secr.telechat.forms import BallotForm, ChangeStateForm, DateSelectForm, TELECHAT_TAGS - +from ietf.utils import log ''' @@ -70,7 +72,12 @@ def get_doc_writeup(doc): writeup = latest.text elif doc.type_id == 'conflrev': path = os.path.join(doc.get_file_path(),doc.filename_with_rev()) - writeup = get_document_content(doc.name,path,split=False,markup=False) + writeup = get_document_content(doc.name,path,split=False,markup=False).decode('utf-8') + utext = doc.text_or_error() # pyflakes:ignore + if writeup and writeup != utext and not 'Error; cannot read' in writeup: + debug.show('writeup[:64]') + debug.show('utext[:64]') + log.assertion('writeup == utext') return writeup def get_last_telechat_date(): diff --git a/ietf/utils/markup_txt.py b/ietf/utils/markup_txt.py index 6efbf26a0..fd71a517f 100644 --- a/ietf/utils/markup_txt.py +++ b/ietf/utils/markup_txt.py @@ -30,26 +30,37 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from django.utils.html import escape -import string import re +import six +import string +from django.utils.html import escape + +from ietf.utils import log from ietf.utils.text import wordwrap -def markup(content, split=True, width=None): +def markup_ascii(content, width=None): + log.unreachable('2017-12-08') + if six.PY2: + assert isinstance(content, basestring) + # at this point, "content" is normal string + # fix most common non-ASCII characters + t1 = string.maketrans("\x91\x92\x93\x94\x95\x96\x97\xc6\xe8\xe9", "\'\'\"\"o--\'ee") + # map everything except printable ASCII, TAB, LF, FF to "?" + t2 = string.maketrans('','') + t3 = "?"*9 + "\t\n?\f" + "?"*19 + t2[32:127] + "?"*129 + t4 = t1.translate(t3) + content = content.translate(t4) + else: + log.assertion('six.PY2') + return markup(content.decode('ascii'), width) + +def markup(content, width=None): + log.assertion('isinstance(content, six.text_type)') # normalize line endings to LF only content = content.replace("\r\n", "\n") content = content.replace("\r", "\n") - # at this point, "content" is normal string - # fix most common non-ASCII characters - t1 = string.maketrans("\x91\x92\x93\x94\x95\x96\x97\xc6\xe8\xe9", "\'\'\"\"o--\'ee") - # map everything except printable ASCII, TAB, LF, FF to "?" - t2 = string.maketrans('','') - t3 = "?"*9 + "\t\n?\f" + "?"*19 + t2[32:127] + "?"*129 - t4 = t1.translate(t3) - content = content.translate(t4) - # remove leading white space content = content.lstrip() # remove runs of blank lines @@ -69,36 +80,4 @@ def markup(content, split=True, width=None): content = re.sub("\n\n([0-9]+\\.|[A-Z]\\.[0-9]|Appendix|Status of|Abstract|Table of|Full Copyright|Copyright|Intellectual Property|Acknowled|Author|Index)(.*)(?=\n\n)", """\n\n\g<1>\g<2>""", content) - if split: - n = content.find("\n", 5000) - content1 = "
"+content[:n+1]+"
\n" - return content1 - #content2 = "
"+content[n+1:]+"
\n" - #return (content1, content2) - else: - return "
" + content + "
\n" - -def markup_unicode(content, split=True, width=None, container_classes=None): - # normalize line endings to LF only - content = content.replace("\r\n", "\n") - content = content.replace("\r", "\n") - - # remove leading white space - content = content.lstrip() - # remove runs of blank lines - content = re.sub("\n\n\n+", "\n\n", content) - - # maybe wordwrap. This must be done before the escaping below. - if width: - content = wordwrap(content, width) - - # expand tabs + escape - content_to_show = escape(content.expandtabs()) - - if split: - n = content.find("\n", 5000) - content_to_show = content_to_show[:n+1] - - pre = '
' % container_classes if container_classes else '
'
-
-    return pre+content_to_show+'
\n' + return "
" + content + "
\n" diff --git a/ietf/utils/text.py b/ietf/utils/text.py index 0d074f158..a06d9dd50 100644 --- a/ietf/utils/text.py +++ b/ietf/utils/text.py @@ -124,3 +124,12 @@ def isascii(text): return True except UnicodeEncodeError: return False + +def maybe_split(text, split=True, pos=5000): + if split: + n = text.find("\n", pos) + text = text[:n+1] + return text + + +