From adbf8acb819f99577edb2c7ce946dd2e3f93743d Mon Sep 17 00:00:00 2001 From: Robert Sparks Date: Tue, 7 Dec 2021 23:49:58 +0000 Subject: [PATCH] Provide pdfs of htmlized (pdfized) documents to replace tools.ietf.org/pdf/ at /doc/pdf. Commit ready for merge. - Legacy-Id: 19753 --- ietf/doc/factories.py | 17 +++++++++++++++++ ietf/doc/models.py | 17 +++++++++++++++++ ietf/doc/tests.py | 34 ++++++++++++++++++++++++++++++++++ ietf/doc/urls.py | 1 + ietf/doc/views_doc.py | 37 +++++++++++++++++++++++++++++++++++-- ietf/settings.py | 17 +++++++++++++++++ requirements.txt | 1 + 7 files changed, 122 insertions(+), 2 deletions(-) diff --git a/ietf/doc/factories.py b/ietf/doc/factories.py index 35c3824f4..4f6a5cf1b 100644 --- a/ietf/doc/factories.py +++ b/ietf/doc/factories.py @@ -147,6 +147,12 @@ class IndividualRfcFactory(IndividualDraftFactory): else: obj.set_state(State.objects.get(type_id='draft',slug='rfc')) + @factory.post_generation + def reset_canonical_name(obj, create, extracted, **kwargs): + if hasattr(obj, '_canonical_name'): + del obj._canonical_name + return None + class WgDraftFactory(BaseDocumentFactory): type_id = 'draft' @@ -186,6 +192,11 @@ class WgRfcFactory(WgDraftFactory): obj.set_state(State.objects.get(type_id='draft',slug='rfc')) obj.set_state(State.objects.get(type_id='draft-iesg', slug='pub')) + @factory.post_generation + def reset_canonical_name(obj, create, extracted, **kwargs): + if hasattr(obj, '_canonical_name'): + del obj._canonical_name + return None class RgDraftFactory(BaseDocumentFactory): @@ -230,6 +241,12 @@ class RgRfcFactory(RgDraftFactory): obj.set_state(State.objects.get(type_id='draft-stream-irtf', slug='pub')) obj.set_state(State.objects.get(type_id='draft-iesg',slug='idexists')) + @factory.post_generation + def reset_canonical_name(obj, create, extracted, **kwargs): + if hasattr(obj, '_canonical_name'): + del obj._canonical_name + return None + class CharterFactory(BaseDocumentFactory): diff --git a/ietf/doc/models.py 
b/ietf/doc/models.py index cd7327428..07887c503 100644 --- a/ietf/doc/models.py +++ b/ietf/doc/models.py @@ -10,6 +10,7 @@ import rfc2html import time from typing import Optional, TYPE_CHECKING +from weasyprint import HTML as wpHTML from django.db import models from django.core import checks @@ -565,6 +566,22 @@ class DocumentInfo(models.Model): cache.set(cache_key, html, settings.HTMLIZER_CACHE_TIME) return html + def pdfized(self): + name = self.get_base_name() + text = self.text() + cache = caches['pdfized'] + cache_key = name.split('.')[0] + try: + pdf = cache.get(cache_key) + except EOFError: + pdf = None + if not pdf: + html = rfc2html.markup(text, path=settings.PDFIZER_URL_PREFIX) + pdf = wpHTML(string=html).write_pdf(stylesheets=[io.BytesIO(b'html { font-size: 94%;}')]) + if pdf: + cache.set(cache_key, pdf, settings.PDFIZER_CACHE_TIME) + return pdf + def references(self): return self.relations_that_doc(('refnorm','refinfo','refunk','refold')) diff --git a/ietf/doc/tests.py b/ietf/doc/tests.py index d630a90c3..ec99f1035 100644 --- a/ietf/doc/tests.py +++ b/ietf/doc/tests.py @@ -2733,4 +2733,38 @@ class RawIdTests(TestCase): charter = CharterFactory() self.should_404(dict(name=charter.name)) +class PdfizedTests(TestCase): + def __init__(self, *args, **kwargs): + self.view = "ietf.doc.views_doc.document_pdfized" + super(self.__class__, self).__init__(*args, **kwargs) + + def should_succeed(self, argdict): + url = urlreverse(self.view, kwargs=argdict) + r = self.client.get(url) + self.assertEqual(r.status_code,200) + self.assertEqual(r.get('Content-Type'),'application/pdf;charset=utf-8') + + def should_404(self, argdict): + url = urlreverse(self.view, kwargs=argdict) + r = self.client.get(url) + self.assertEqual(r.status_code, 404) + + def test_pdfized(self): + rfc = WgRfcFactory(create_revisions=range(0,2)) + + dir = settings.RFC_PATH + with (Path(dir) / f'{rfc.canonical_name()}.txt').open('w') as f: + f.write('text content') + dir = 
settings.INTERNET_ALL_DRAFTS_ARCHIVE_DIR + for r in range(0,2): + with (Path(dir) / f'{rfc.name}-{r:02d}.txt').open('w') as f: + f.write('text content') + + self.should_succeed(dict(name=rfc.canonical_name())) + self.should_succeed(dict(name=rfc.name)) + for r in range(0,2): + self.should_succeed(dict(name=rfc.name,rev=f'{r:02d}')) + for ext in ('pdf','txt','html','anythingatall'): + self.should_succeed(dict(name=rfc.name,rev=f'{r:02d}',ext=ext)) + self.should_404(dict(name=rfc.name,rev='02')) diff --git a/ietf/doc/urls.py b/ietf/doc/urls.py index 47a8f5a03..e5614503d 100644 --- a/ietf/doc/urls.py +++ b/ietf/doc/urls.py @@ -72,6 +72,7 @@ urlpatterns = [ url(r'^html/%(name)s(?:-%(rev)s)?(\.txt|\.html)?/?$' % settings.URL_REGEXPS, views_doc.document_html), url(r'^id/%(name)s(?:-%(rev)s)?(?:\.(?P<ext>(txt|html|xml)))?/?$' % settings.URL_REGEXPS, views_doc.document_raw_id), + url(r'^pdf/%(name)s(?:-%(rev)s)?(?:\.(?P<ext>[a-z]+))?/?$' % settings.URL_REGEXPS, views_doc.document_pdfized), # End of block that should be an idealized docs.ietf.org service instead diff --git a/ietf/doc/views_doc.py b/ietf/doc/views_doc.py index f73b3de12..0c81f8c24 100644 --- a/ietf/doc/views_doc.py +++ b/ietf/doc/views_doc.py @@ -769,8 +769,7 @@ def document_html(request, name, rev=None): return redirect('ietf.doc.views_doc.document_html', name=found.matched_name) doc = found.documents.get() - if not os.path.exists(doc.get_file_name()): - raise Http404("File not found: %s" % doc.get_file_name()) + if found.matched_rev or found.matched_name.startswith('rfc'): rev = found.matched_rev @@ -778,6 +777,10 @@ def document_html(request, name, rev=None): rev = doc.rev if rev: doc = doc.history_set.filter(rev=rev).first() or doc.fake_history_obj(rev) + + if not os.path.exists(doc.get_file_name()): + raise Http404("File not found: %s" % doc.get_file_name()) + if doc.type_id in ['draft',]: doc.supermeta = build_doc_supermeta_block(doc) doc.meta = build_doc_meta_block(doc, settings.HTMLIZER_URL_PREFIX) @@ -803,6 
+806,36 @@ def document_html(request, name, rev=None): return render(request, "doc/document_html.html", {"doc":doc, "doccolor":doccolor }) +def document_pdfized(request, name, rev=None, ext=None): + + found = fuzzy_find_documents(name, rev) + num_found = found.documents.count() + if num_found == 0: + raise Http404("Document not found: %s" % name) + if num_found > 1: + raise Http404("Multiple documents matched: %s" % name) + + if found.matched_name.startswith('rfc') and name != found.matched_name: + return redirect('ietf.doc.views_doc.document_pdfized', name=found.matched_name) + + doc = found.documents.get() + + if found.matched_rev or found.matched_name.startswith('rfc'): + rev = found.matched_rev + else: + rev = doc.rev + if rev: + doc = doc.history_set.filter(rev=rev).first() or doc.fake_history_obj(rev) + + if not os.path.exists(doc.get_file_name()): + raise Http404("File not found: %s" % doc.get_file_name()) + + pdf = doc.pdfized() + if pdf: + return HttpResponse(pdf,content_type='application/pdf;charset=utf-8') + else: + raise Http404 + def check_doc_email_aliases(): pattern = re.compile(r'^expand-(.*?)(\..*?)?@.*? 
+(.*)$') good_count = 0 diff --git a/ietf/settings.py b/ietf/settings.py index f836ceb59..bb0915932 100644 --- a/ietf/settings.py +++ b/ietf/settings.py @@ -743,6 +743,13 @@ CACHES = { 'MAX_ENTRIES': 100000, # 100,000 }, }, + 'pdfized': { + 'BACKEND': 'django.core.cache.backends.filebased.FileBasedCache', + 'LOCATION': '/a/cache/datatracker/pdfized', + 'OPTIONS': { + 'MAX_ENTRIES': 100000, # 100,000 + }, + }, 'slowpages': { 'BACKEND': 'django.core.cache.backends.filebased.FileBasedCache', 'LOCATION': '/a/cache/datatracker/slowpages', @@ -755,6 +762,8 @@ CACHES = { HTMLIZER_VERSION = 1 HTMLIZER_URL_PREFIX = "/doc/html" HTMLIZER_CACHE_TIME = 60*60*24*14 # 14 days +PDFIZER_CACHE_TIME = HTMLIZER_CACHE_TIME +PDFIZER_URL_PREFIX = IDTRACKER_BASE_URL+"/doc/pdf" # Email settings IPR_EMAIL_FROM = 'ietf-ipr@ietf.org' @@ -1267,6 +1276,14 @@ if SERVER_MODE != 'production': 'MAX_ENTRIES': 1000, }, }, + 'pdfized': { + 'BACKEND': 'django.core.cache.backends.dummy.DummyCache', + #'BACKEND': 'django.core.cache.backends.filebased.FileBasedCache', + 'LOCATION': '/var/cache/datatracker/pdfized', + 'OPTIONS': { + 'MAX_ENTRIES': 1000, + }, + }, 'slowpages': { 'BACKEND': 'django.core.cache.backends.dummy.DummyCache', #'BACKEND': 'django.core.cache.backends.filebased.FileBasedCache', diff --git a/requirements.txt b/requirements.txt index 0e66ec607..557f96d0f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -70,6 +70,7 @@ tqdm>=3.7.0 #Trac>=1.0.10,<1.2 Unidecode>=0.4.18,<1.2.0 #wsgiref>=0.1.2 +weasyprint>=53.4 xml2rfc>=2.35.0 xym>=0.4.4,!=0.4.7,<1.0 #zxcvbn-python>=4.4.14 # Not needed until we do back-end password entropy validation