From b1585124d6ea914ea79a8d6d4b40d7550e2ab3d1 Mon Sep 17 00:00:00 2001 From: Robert Sparks Date: Thu, 6 Jan 2022 20:17:55 +0000 Subject: [PATCH] Improve robustness of pdfization. Tune the test crawler. Commit ready for merge. - Legacy-Id: 19813 --- bin/test-crawl | 2 ++ ietf/doc/models.py | 6 +++++- ietf/doc/utils.py | 5 +++-- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/bin/test-crawl b/bin/test-crawl index 416f60f34..3bf5f838d 100755 --- a/bin/test-crawl +++ b/bin/test-crawl @@ -232,6 +232,8 @@ def skip_url(url): # Skip most html conversions, not worth the time "^/doc/html/draft-[0-9ac-z]", "^/doc/html/draft-b[0-9b-z]", + "^/doc/pdf/draft-[0-9ac-z]", + "^/doc/pdf/draft-b[0-9b-z]", "^/doc/html/charter-.*", "^/doc/html/status-.*", "^/doc/html/rfc.*", diff --git a/ietf/doc/models.py b/ietf/doc/models.py index 07887c503..8ddaa3174 100644 --- a/ietf/doc/models.py +++ b/ietf/doc/models.py @@ -577,7 +577,11 @@ class DocumentInfo(models.Model): pdf = None if not pdf: html = rfc2html.markup(text, path=settings.PDFIZER_URL_PREFIX) - pdf = wpHTML(string=html).write_pdf(stylesheets=[io.BytesIO(b'html { font-size: 94%;}')]) + try: + pdf = wpHTML(string=html.replace('\xad','')).write_pdf(stylesheets=[io.BytesIO(b'html { font-size: 94%;}')]) + except AssertionError: + log.log(f'weasyprint failed with an assert on {self.name}') + pdf = None if pdf: cache.set(cache_key, pdf, settings.PDFIZER_CACHE_TIME) return pdf diff --git a/ietf/doc/utils.py b/ietf/doc/utils.py index b3d9eb949..61cf00813 100644 --- a/ietf/doc/utils.py +++ b/ietf/doc/utils.py @@ -1051,8 +1051,9 @@ def build_file_urls(doc): label = "plain text" if t == "txt" else t file_urls.append((label, base + doc.name + "-" + doc.rev + "." + t)) - file_urls.append(("htmlized", urlreverse('ietf.doc.views_doc.document_html', kwargs=dict(name=doc.name, rev=doc.rev)))) - file_urls.append(("pdfized", urlreverse('ietf.doc.views_doc.document_pdfized', kwargs=dict(name=doc.name, rev=doc.rev)))) + if doc.text(): + file_urls.append(("htmlized", urlreverse('ietf.doc.views_doc.document_html', kwargs=dict(name=doc.name, rev=doc.rev)))) + file_urls.append(("pdfized", urlreverse('ietf.doc.views_doc.document_pdfized', kwargs=dict(name=doc.name, rev=doc.rev)))) file_urls.append(("bibtex", urlreverse('ietf.doc.views_doc.document_bibtex',kwargs=dict(name=doc.name,rev=doc.rev)))) return file_urls, found_types