diff --git a/ietf/doc/models.py b/ietf/doc/models.py index 639e6ca85..077502db1 100644 --- a/ietf/doc/models.py +++ b/ietf/doc/models.py @@ -530,7 +530,7 @@ class DocumentInfo(models.Model): def replaced_by(self): return set([ r.document for r in self.related_that("replaces") ]) - def text(self): + def text(self, size = -1): path = self.get_file_name() root, ext = os.path.splitext(path) txtpath = root+'.txt' @@ -538,14 +538,21 @@ class DocumentInfo(models.Model): path = txtpath try: with io.open(path, 'rb') as file: - raw = file.read() + raw = file.read(size) except IOError: return None + text = None try: text = raw.decode('utf-8') except UnicodeDecodeError: - text = raw.decode('latin-1') - # + for back in range(1,4): + try: + text = raw[:-back].decode('utf-8') + break + except UnicodeDecodeError: + pass + if text is None: + text = raw.decode('latin-1') return text def text_or_error(self): diff --git a/ietf/doc/views_doc.py b/ietf/doc/views_doc.py index 915dcebde..50c60aefc 100644 --- a/ietf/doc/views_doc.py +++ b/ietf/doc/views_doc.py @@ -84,7 +84,7 @@ from ietf.review.models import ReviewAssignment from ietf.review.utils import can_request_review_of_doc, review_assignments_to_list_for_docs, review_requests_to_list_for_docs from ietf.review.utils import no_review_from_teams_on_doc from ietf.utils import markup_txt, log, markdown -from ietf.utils.draft import PlaintextDraft +from ietf.utils.draft import get_status_from_draft_text from ietf.utils.meetecho import MeetechoAPIError, SlidesManager from ietf.utils.response import permission_denied from ietf.utils.text import maybe_split @@ -2261,12 +2261,11 @@ def idnits2_state(request, name, rev=None): elif doc.intended_std_level: doc.deststatus = doc.intended_std_level.name else: - text = doc.text() + # 10000 is a conservative prefix on number of utf-8 encoded bytes to + # cover at least the first 10 lines of characters + text = doc.text(size=10000) if text: - parsed_draft = PlaintextDraft( - text=doc.text(), 
# Matches an "Intended status:" header line and captures its value. The value
# ends at the first run of two or more whitespace characters (the gap before
# the right-hand header column) or at the end of the line, so multi-word
# values such as "Standards Track" are kept whole and a value that is the last
# thing on its line is still found.
_INTENDED_STATUS_RE = re.compile(r"\s*Intended [Ss]tatus:\s*(.*?)(?:\s{2,}|\s*$)")


def get_status_from_draft_text(text):
    """Return the "Intended status" value from the top of a plaintext draft.

    Only the first 10 lines of the first 5000 characters (after stripping
    surrounding whitespace) are examined.

    Args:
        text: The draft text, or a prefix of it, as a ``str``. ``None`` and
            the empty string are accepted.

    Returns:
        The captured status string from the first matching line (for example
        ``"Standards Track"``), or ``None`` when no "Intended status:" line
        is found in the examined lines.
    """
    if not text:
        return None

    # Take a prefix to shortcut work over very large drafts. 5000 is
    # conservatively much more than a full page of characters, and only the
    # first 10 lines are wanted.
    text = text.strip()[:5000]
    text = re.sub(".\x08", "", text)  # Get rid of inkribbon backspace-emphasis
    text = text.replace("\r\n", "\n")  # Convert DOS to unix
    text = text.replace("\r", "\n")  # Convert MAC to unix

    for line in text.split("\n")[:10]:
        status_match = _INTENDED_STATUS_RE.match(line)
        if status_match:
            return status_match.group(1)
    return None