Started refactoring how text is read from document files (drafts, charters, etc.), in order to normalise on one way of doing this and to make it return unicode rather than undecoded bytes. This is the first of two steps; its purpose is to gauge the possible issues and to report on discrepancies.

- Legacy-Id: 14406
This commit is contained in:
Henrik Levkowetz 2017-12-08 21:51:11 +00:00
parent f2f21c4ef3
commit 967ece7e7d
11 changed files with 135 additions and 66 deletions

View file

@ -8,6 +8,8 @@ from django.utils.html import strip_tags
from django.conf import settings
from django.urls import reverse as urlreverse
import debug # pyflakes:ignore
from ietf.utils.mail import send_mail, send_mail_text
from ietf.ipr.utils import iprs_from_docs, related_docs
from ietf.doc.models import WriteupDocEvent, LastCallDocEvent, DocAlias, ConsensusDocEvent
@ -15,6 +17,7 @@ from ietf.doc.utils import needed_ballot_positions, get_document_content
from ietf.group.models import Role
from ietf.doc.models import Document
from ietf.mailtrigger.utils import gather_address_lists
from ietf.utils import log
def email_state_changed(request, doc, text, mailtrigger_id=None):
(to,cc) = gather_address_lists(mailtrigger_id or 'doc_state_edited',doc=doc)
@ -515,7 +518,13 @@ def email_charter_internal_review(request, charter):
os.path.join(settings.CHARTER_PATH,filename),
split=False,
markup=False,
)
).decode('utf-8')
utext = charter.text_or_error() # pyflakes:ignore
if charter_text and charter_text != utext and not 'Error; cannot read' in charter_text:
debug.show('charter_text[:64]')
debug.show('utext[:64]')
log.assertion('charter_text == utext')
send_mail(request, addrs.to, settings.DEFAULT_FROM_EMAIL,
'Internal %s Review: %s (%s)'%(charter.group.type.name,charter.group.name,charter.group.acronym),
'doc/mail/charter_internal_review.txt',

View file

@ -449,6 +449,9 @@ class DocumentInfo(models.Model):
#
return text
def text_or_error(self):
    """Return the document text, or an error marker string if there is none.

    When ``self.text()`` yields a falsy value (``None`` or an empty string),
    the returned marker embeds the result of ``self.get_file_name()``.
    """
    content = self.text()
    if content:
        return content
    return "Error; cannot read (%s)" % self.get_file_name()
def htmlized(self):
name = self.get_base_name()
text = self.text()

View file

@ -18,7 +18,7 @@ import debug # pyflakes:ignore
from ietf.doc.models import ConsensusDocEvent
from ietf.doc.utils import get_document_content
from ietf.utils.text import wordwrap, fill, wrap_text_if_unwrapped
from ietf.utils import log
register = template.Library()
@ -509,7 +509,13 @@ def document_content(doc):
if doc is None:
return None
path = os.path.join(doc.get_file_path(),doc.filename_with_rev())
return get_document_content(doc.name,path,markup=False)
content = get_document_content(doc.name,path,markup=False)
utext = doc.text_or_error() # pyflakes:ignore
if content and content != utext and not 'Error; cannot read' in content:
debug.show('content[:64]')
debug.show('utext[:64]')
log.assertion('content == utext')
return content
@register.filter
def format_timedelta(timedelta):

View file

@ -22,7 +22,7 @@ from ietf.doc.models import TelechatDocEvent
from ietf.name.models import DocReminderTypeName, DocRelationshipName
from ietf.group.models import Role
from ietf.ietfauth.utils import has_role
from ietf.utils import draft, markup_txt
from ietf.utils import draft
from ietf.utils.mail import send_mail
from ietf.mailtrigger.utils import gather_address_lists
@ -299,6 +299,7 @@ def get_unicode_document_content(key, filename, codec='utf-8', errors='ignore'):
return raw_content
def get_document_content(key, filename, split=True, markup=True):
#log.unreachable("2017-12-05")
try:
with open(filename, 'rb') as f:
raw_content = f.read()
@ -306,10 +307,11 @@ def get_document_content(key, filename, split=True, markup=True):
error = "Error; cannot read ("+key+")"
return error
if markup:
return markup_txt.markup(raw_content, split)
else:
return raw_content
# if markup:
# return markup_txt.markup(raw_content, split)
# else:
# return raw_content
return raw_content
def tags_suffix(tags):
    """Build a ``::``-prefixed, ``::``-separated suffix from tag names.

    Returns an empty unicode string when *tags* is empty or otherwise falsy;
    each element of *tags* must expose a ``name`` attribute.
    """
    if not tags:
        return u""
    names = [tag.name for tag in tags]
    return u"::" + u"::".join(names)

View file

@ -254,7 +254,12 @@ def edit_ad(request, name):
def default_approval_text(review):
filename = "%s-%s.txt" % (review.canonical_name(), review.rev)
current_text = get_document_content(filename, os.path.join(settings.CONFLICT_REVIEW_PATH, filename), split=False, markup=False)
current_text = get_document_content(filename, os.path.join(settings.CONFLICT_REVIEW_PATH, filename), split=False, markup=False).decode('utf-8')
utext = review.text_or_error() # pyflakes:ignore
if current_text and current_text != utext and not 'Error; cannot read' in current_text:
debug.show('current_text[:64]')
debug.show('utext[:64]')
log.assertion('current_text == utext')
conflictdoc = review.relateddocument_set.get(relationship__slug='conflrev').target.document
if conflictdoc.stream_id=='ise':

View file

@ -66,6 +66,8 @@ from ietf.meeting.utils import group_sessions, get_upcoming_manageable_sessions,
from ietf.review.models import ReviewRequest
from ietf.review.utils import can_request_review_of_doc, review_requests_to_list_for_docs
from ietf.review.utils import no_review_from_teams_on_doc
from ietf.utils import markup_txt, log
from ietf.utils.text import maybe_split
def render_document_top(request, doc, tab, name):
@ -186,7 +188,13 @@ def document_main(request, name, rev=None):
filename = name + ".txt"
content = get_document_content(filename, os.path.join(settings.RFC_PATH, filename),
split_content, markup=True)
split_content, markup=True).decode('utf-8')
utext = doc.text_or_error() # pyflakes:ignore
if content and content != utext and not 'Error; cannot read' in content:
debug.show('content[:64]')
debug.show('utext[:64]')
log.assertion('content == utext')
content = markup_txt.markup(maybe_split(content, split=split_content))
# file types
base_path = os.path.join(settings.RFC_PATH, name + ".")
@ -216,7 +224,13 @@ def document_main(request, name, rev=None):
filename = "%s-%s.txt" % (draft_name, doc.rev)
content = get_document_content(filename, os.path.join(settings.INTERNET_ALL_DRAFTS_ARCHIVE_DIR, filename),
split_content, markup=True)
split_content, markup=True).decode('utf-8')
utext = doc.text_or_error() # pyflakes:ignore
if content and content != utext and not 'Error; cannot read' in content:
debug.show('content[:64]')
debug.show('utext[:64]')
log.assertion('content == utext')
content = markup_txt.markup(maybe_split(content, split=split_content))
# file types
base_path = os.path.join(settings.INTERNET_DRAFT_PATH, doc.name + "-" + doc.rev + ".")
@ -439,7 +453,13 @@ def document_main(request, name, rev=None):
if doc.type_id == "charter":
filename = "%s-%s.txt" % (doc.canonical_name(), doc.rev)
content = get_document_content(filename, os.path.join(settings.CHARTER_PATH, filename), split=False, markup=True)
content = get_document_content(filename, os.path.join(settings.CHARTER_PATH, filename), split=False, markup=True).decode('utf-8')
utext = doc.text_or_error() # pyflakes:ignore
if content and content != utext and not 'Error; cannot read' in content:
debug.show('content[:64]')
debug.show('utext[:64]')
log.assertion('content == utext')
content = markup_txt.markup(content)
ballot_summary = None
if doc.get_state_slug() in ("intrev", "iesgrev"):
@ -480,9 +500,15 @@ def document_main(request, name, rev=None):
if doc.rev == "00" and not os.path.isfile(pathname):
# This could move to a template
content = "A conflict review response has not yet been proposed."
content = u"A conflict review response has not yet been proposed."
else:
content = get_document_content(filename, pathname, split=False, markup=True)
content = get_document_content(filename, pathname, split=False, markup=True).decode('utf-8')
utext = doc.text_or_error() # pyflakes:ignore
if content and content != utext and not 'Error; cannot read' in content:
debug.show('content[:64]')
debug.show('utext[:64]')
log.assertion('content == utext')
content = markup_txt.markup(content)
ballot_summary = None
if doc.get_state_slug() in ("iesgeval") and doc.active_ballot():
@ -507,9 +533,14 @@ def document_main(request, name, rev=None):
if doc.rev == "00" and not os.path.isfile(pathname):
# This could move to a template
content = "Status change text has not yet been proposed."
content = u"Status change text has not yet been proposed."
else:
content = get_document_content(filename, pathname, split=False)
content = get_document_content(filename, pathname, split=False).decode('utf-8')
utext = doc.text_or_error() # pyflakes:ignore
if content and content != utext and not 'Error; cannot read' in content:
debug.show('content[:64]')
debug.show('utext[:64]')
log.assertion('content == utext')
ballot_summary = None
if doc.get_state_slug() in ("iesgeval"):
@ -562,7 +593,12 @@ def document_main(request, name, rev=None):
url = urlbase + extension
if extension == ".txt":
content = get_document_content(basename, pathname + extension, split=False)
content = get_document_content(basename, pathname + extension, split=False).decode('utf-8')
utext = doc.text_or_error() # pyflakes:ignore
if content != utext:
debug.show('content[:64]')
debug.show('utext[:64]')
log.assertion('content == utext')
t = "plain text"
other_types.append((t, url))

View file

@ -282,7 +282,12 @@ def newstatus(relateddoc):
def default_approval_text(status_change,relateddoc):
filename = "%s-%s.txt" % (status_change.canonical_name(), status_change.rev)
current_text = get_document_content(filename, os.path.join(settings.STATUS_CHANGE_PATH, filename), split=False, markup=False)
current_text = get_document_content(filename, os.path.join(settings.STATUS_CHANGE_PATH, filename), split=False, markup=False).decode('utf-8')
utext = status_change.text_or_error() # pyflakes:ignore
if current_text and current_text != utext and not 'Error; cannot read' in current_text:
debug.show('current_text[:64]')
debug.show('utext[:64]')
log.assertion('current_text == utext')
if relateddoc.target.document.std_level.slug in ('std','ps','ds','bcp',):
action = "Protocol Action"

View file

@ -18,6 +18,7 @@ from ietf.meeting.helpers import is_meeting_approved, get_next_agenda_name
from ietf.message.models import Message
from ietf.person.models import Person
from ietf.utils.fields import DatepickerDateField, DurationField
from ietf.utils import log
# need to insert empty option for use in ChoiceField
# countries.insert(0, ('', '-'*9 ))
@ -220,7 +221,14 @@ class InterimSessionModelForm(forms.ModelForm):
if self.instance.agenda():
doc = self.instance.agenda()
path = os.path.join(doc.get_file_path(), doc.filename_with_rev())
self.initial['agenda'] = get_document_content(os.path.basename(path), path, markup=False)
content = get_document_content(os.path.basename(path), path, markup=False).decode('utf-8')
utext = doc.text_or_error() # pyflakes:ignore
if content and content != utext and not 'Error; cannot read' in content:
debug.show('content[:64]')
debug.show('utext[:64]')
log.assertion('content == utext')
self.initial['agenda'] = content
def clean_date(self):
'''Date field validator. We can't use required on the input because

View file

@ -6,6 +6,8 @@ from django.forms.formsets import formset_factory
from django.shortcuts import render, get_object_or_404, redirect
from django.utils.functional import curry
import debug # pyflakes:ignore
from ietf.doc.models import DocEvent, Document, BallotDocEvent, BallotPositionDocEvent, BallotType, WriteupDocEvent
from ietf.doc.utils import get_document_content, add_state_change_event
from ietf.person.models import Person
@ -15,7 +17,7 @@ from ietf.iesg.models import TelechatDate, TelechatAgendaItem, Telechat
from ietf.iesg.agenda import agenda_data, get_doc_section
from ietf.ietfauth.utils import role_required
from ietf.secr.telechat.forms import BallotForm, ChangeStateForm, DateSelectForm, TELECHAT_TAGS
from ietf.utils import log
'''
@ -70,7 +72,12 @@ def get_doc_writeup(doc):
writeup = latest.text
elif doc.type_id == 'conflrev':
path = os.path.join(doc.get_file_path(),doc.filename_with_rev())
writeup = get_document_content(doc.name,path,split=False,markup=False)
writeup = get_document_content(doc.name,path,split=False,markup=False).decode('utf-8')
utext = doc.text_or_error() # pyflakes:ignore
if writeup and writeup != utext and not 'Error; cannot read' in writeup:
debug.show('writeup[:64]')
debug.show('utext[:64]')
log.assertion('writeup == utext')
return writeup
def get_last_telechat_date():

View file

@ -30,26 +30,37 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from django.utils.html import escape
import string
import re
import six
import string
from django.utils.html import escape
from ietf.utils import log
from ietf.utils.text import wordwrap
def markup(content, split=True, width=None):
def markup_ascii(content, width=None):
log.unreachable('2017-12-08')
if six.PY2:
assert isinstance(content, basestring)
# at this point, "content" is normal string
# fix most common non-ASCII characters
t1 = string.maketrans("\x91\x92\x93\x94\x95\x96\x97\xc6\xe8\xe9", "\'\'\"\"o--\'ee")
# map everything except printable ASCII, TAB, LF, FF to "?"
t2 = string.maketrans('','')
t3 = "?"*9 + "\t\n?\f" + "?"*19 + t2[32:127] + "?"*129
t4 = t1.translate(t3)
content = content.translate(t4)
else:
log.assertion('six.PY2')
return markup(content.decode('ascii'), width)
def markup(content, width=None):
log.assertion('isinstance(content, six.text_type)')
# normalize line endings to LF only
content = content.replace("\r\n", "\n")
content = content.replace("\r", "\n")
# at this point, "content" is normal string
# fix most common non-ASCII characters
t1 = string.maketrans("\x91\x92\x93\x94\x95\x96\x97\xc6\xe8\xe9", "\'\'\"\"o--\'ee")
# map everything except printable ASCII, TAB, LF, FF to "?"
t2 = string.maketrans('','')
t3 = "?"*9 + "\t\n?\f" + "?"*19 + t2[32:127] + "?"*129
t4 = t1.translate(t3)
content = content.translate(t4)
# remove leading white space
content = content.lstrip()
# remove runs of blank lines
@ -69,36 +80,4 @@ def markup(content, split=True, width=None):
content = re.sub("\n\n([0-9]+\\.|[A-Z]\\.[0-9]|Appendix|Status of|Abstract|Table of|Full Copyright|Copyright|Intellectual Property|Acknowled|Author|Index)(.*)(?=\n\n)", """\n\n<span class="m_h">\g<1>\g<2></span>""", content)
if split:
n = content.find("\n", 5000)
content1 = "<pre>"+content[:n+1]+"</pre>\n"
return content1
#content2 = "<pre>"+content[n+1:]+"</pre>\n"
#return (content1, content2)
else:
return "<pre>" + content + "</pre>\n"
def markup_unicode(content, split=True, width=None, container_classes=None):
# normalize line endings to LF only
content = content.replace("\r\n", "\n")
content = content.replace("\r", "\n")
# remove leading white space
content = content.lstrip()
# remove runs of blank lines
content = re.sub("\n\n\n+", "\n\n", content)
# maybe wordwrap. This must be done before the escaping below.
if width:
content = wordwrap(content, width)
# expand tabs + escape
content_to_show = escape(content.expandtabs())
if split:
n = content.find("\n", 5000)
content_to_show = content_to_show[:n+1]
pre = '<pre class="%s" >' % container_classes if container_classes else '<pre>'
return pre+content_to_show+'</pre>\n'
return "<pre>" + content + "</pre>\n"

View file

@ -124,3 +124,12 @@ def isascii(text):
return True
except UnicodeEncodeError:
return False
def maybe_split(text, split=True, pos=5000):
    """Optionally truncate *text* at the first line break at or after *pos*.

    Parameters:
        text:  the string to (possibly) shorten.
        split: when false, *text* is returned untouched.
        pos:   index from which to start looking for a newline.

    Returns *text* cut just after the first newline found at or after *pos*
    (the newline itself is kept).  If there is no such newline -- for
    instance because the text is shorter than *pos* -- the text is returned
    unchanged.
    """
    if split:
        n = text.find("\n", pos)
        # find() returns -1 when no newline exists at or after pos; slicing
        # with n+1 == 0 would then throw away the entire text, so only
        # truncate when a newline was actually found.
        if n >= 0:
            text = text[:n+1]
    return text