Merged in [19753] from rjsparks@nostrum.com:

Provide PDFs of htmlized (pdfized) documents at /doc/pdf, replacing tools.ietf.org/pdf/.
 - Legacy-Id: 19758
Note: SVN reference [19753] has been migrated to Git commit adbf8acb81
This commit is contained in:
Robert Sparks 2021-12-08 00:11:24 +00:00
commit 5883e10cca
7 changed files with 122 additions and 2 deletions

View file

@ -147,6 +147,12 @@ class IndividualRfcFactory(IndividualDraftFactory):
else:
obj.set_state(State.objects.get(type_id='draft',slug='rfc'))
@factory.post_generation
def reset_canonical_name(obj, create, extracted, **kwargs):
    """Post-generation hook that removes any `_canonical_name` attribute from obj.

    NOTE(review): `_canonical_name` is presumably a cached canonical name that
    goes stale once the generated document is given its final state -- confirm
    against the model that sets it.

    Always returns None; `create`, `extracted` and `kwargs` are not used.
    """
    if not hasattr(obj, '_canonical_name'):
        return None
    delattr(obj, '_canonical_name')
    return None
class WgDraftFactory(BaseDocumentFactory):
type_id = 'draft'
@ -186,6 +192,11 @@ class WgRfcFactory(WgDraftFactory):
obj.set_state(State.objects.get(type_id='draft',slug='rfc'))
obj.set_state(State.objects.get(type_id='draft-iesg', slug='pub'))
@factory.post_generation
def reset_canonical_name(obj, create, extracted, **kwargs):
    """Post-generation hook that removes any `_canonical_name` attribute from obj.

    NOTE(review): `_canonical_name` is presumably a cached canonical name that
    goes stale once the generated document is given its final state -- confirm
    against the model that sets it.

    Always returns None; `create`, `extracted` and `kwargs` are not used.
    """
    if not hasattr(obj, '_canonical_name'):
        return None
    delattr(obj, '_canonical_name')
    return None
class RgDraftFactory(BaseDocumentFactory):
@ -230,6 +241,12 @@ class RgRfcFactory(RgDraftFactory):
obj.set_state(State.objects.get(type_id='draft-stream-irtf', slug='pub'))
obj.set_state(State.objects.get(type_id='draft-iesg',slug='idexists'))
@factory.post_generation
def reset_canonical_name(obj, create, extracted, **kwargs):
    """Post-generation hook that removes any `_canonical_name` attribute from obj.

    NOTE(review): `_canonical_name` is presumably a cached canonical name that
    goes stale once the generated document is given its final state -- confirm
    against the model that sets it.

    Always returns None; `create`, `extracted` and `kwargs` are not used.
    """
    if not hasattr(obj, '_canonical_name'):
        return None
    delattr(obj, '_canonical_name')
    return None
class CharterFactory(BaseDocumentFactory):

View file

@ -10,6 +10,7 @@ import rfc2html
import time
from typing import Optional, TYPE_CHECKING
from weasyprint import HTML as wpHTML
from django.db import models
from django.core import checks
@ -565,6 +566,22 @@ class DocumentInfo(models.Model):
cache.set(cache_key, html, settings.HTMLIZER_CACHE_TIME)
return html
def pdfized(self):
    """Return this document rendered as PDF bytes, going through the 'pdfized' cache.

    The cache key is the document's base name up to its first '.'. On a cache
    miss the document text is converted to HTML with rfc2html (links are built
    with settings.PDFIZER_URL_PREFIX as the path) and that HTML is rendered to
    PDF with WeasyPrint using a small font-size stylesheet. A non-empty result
    is stored in the cache for settings.PDFIZER_CACHE_TIME.

    Returns None when nothing is cached and self.text() yields no content, so
    callers can treat "no PDF" uniformly instead of hitting an error inside
    the HTML conversion.
    """
    name = self.get_base_name()
    cache = caches['pdfized']
    cache_key = name.split('.')[0]
    try:
        pdf = cache.get(cache_key)
    except EOFError:
        # An unreadable cache entry is treated as a miss and regenerated below.
        pdf = None
    if not pdf:
        # Only read the document text when it is actually needed; a cache hit
        # should not cost a file read.
        text = self.text()
        if not text:
            return None
        html = rfc2html.markup(text, path=settings.PDFIZER_URL_PREFIX)
        pdf = wpHTML(string=html).write_pdf(stylesheets=[io.BytesIO(b'html { font-size: 94%;}')])
        if pdf:
            cache.set(cache_key, pdf, settings.PDFIZER_CACHE_TIME)
    return pdf
def references(self):
    """Return self.relations_that_doc() for the four reference relationship slugs."""
    reference_slugs = ('refnorm', 'refinfo', 'refunk', 'refold')
    return self.relations_that_doc(reference_slugs)

View file

@ -2733,4 +2733,38 @@ class RawIdTests(TestCase):
charter = CharterFactory()
self.should_404(dict(name=charter.name))
class PdfizedTests(TestCase):
    """Tests for the document_pdfized view: RFC and draft names, revisions, and extensions."""

    def __init__(self, *args, **kwargs):
        self.view = "ietf.doc.views_doc.document_pdfized"
        # Zero-argument super(): super(self.__class__, self) would recurse
        # without end as soon as this test class is subclassed.
        super().__init__(*args, **kwargs)

    def should_succeed(self, argdict):
        """GET the view with argdict as URL kwargs and expect a 200 PDF response."""
        url = urlreverse(self.view, kwargs=argdict)
        r = self.client.get(url)
        self.assertEqual(r.status_code,200)
        self.assertEqual(r.get('Content-Type'),'application/pdf;charset=utf-8')

    def should_404(self, argdict):
        """GET the view with argdict as URL kwargs and expect a 404."""
        url = urlreverse(self.view, kwargs=argdict)
        r = self.client.get(url)
        self.assertEqual(r.status_code, 404)

    def test_pdfized(self):
        """An RFC with draft revisions 00 and 01 is served by every name/rev/ext form; rev 02 is a 404."""
        rfc = WgRfcFactory(create_revisions=range(0,2))

        # Write the text files the view looks for: one for the RFC itself and
        # one per draft revision.
        rfc_dir = settings.RFC_PATH
        with (Path(rfc_dir) / f'{rfc.canonical_name()}.txt').open('w') as f:
            f.write('text content')
        drafts_dir = settings.INTERNET_ALL_DRAFTS_ARCHIVE_DIR
        for r in range(0,2):
            with (Path(drafts_dir) / f'{rfc.name}-{r:02d}.txt').open('w') as f:
                f.write('text content')

        self.should_succeed(dict(name=rfc.canonical_name()))
        self.should_succeed(dict(name=rfc.name))
        for r in range(0,2):
            self.should_succeed(dict(name=rfc.name,rev=f'{r:02d}'))
            # The extension is matched by the URL pattern but does not change the result.
            for ext in ('pdf','txt','html','anythingatall'):
                self.should_succeed(dict(name=rfc.name,rev=f'{r:02d}',ext=ext))
        # No file was written for revision 02.
        self.should_404(dict(name=rfc.name,rev='02'))

View file

@ -72,6 +72,7 @@ urlpatterns = [
url(r'^html/%(name)s(?:-%(rev)s)?(\.txt|\.html)?/?$' % settings.URL_REGEXPS, views_doc.document_html),
url(r'^id/%(name)s(?:-%(rev)s)?(?:\.(?P<ext>(txt|html|xml)))?/?$' % settings.URL_REGEXPS, views_doc.document_raw_id),
url(r'^pdf/%(name)s(?:-%(rev)s)?(?:\.(?P<ext>[a-z]+))?/?$' % settings.URL_REGEXPS, views_doc.document_pdfized),
# End of block that should be an idealized docs.ietf.org service instead

View file

@ -769,8 +769,7 @@ def document_html(request, name, rev=None):
return redirect('ietf.doc.views_doc.document_html', name=found.matched_name)
doc = found.documents.get()
if not os.path.exists(doc.get_file_name()):
raise Http404("File not found: %s" % doc.get_file_name())
if found.matched_rev or found.matched_name.startswith('rfc'):
rev = found.matched_rev
@ -778,6 +777,10 @@ def document_html(request, name, rev=None):
rev = doc.rev
if rev:
doc = doc.history_set.filter(rev=rev).first() or doc.fake_history_obj(rev)
if not os.path.exists(doc.get_file_name()):
raise Http404("File not found: %s" % doc.get_file_name())
if doc.type_id in ['draft',]:
doc.supermeta = build_doc_supermeta_block(doc)
doc.meta = build_doc_meta_block(doc, settings.HTMLIZER_URL_PREFIX)
@ -803,6 +806,36 @@ def document_html(request, name, rev=None):
return render(request, "doc/document_html.html", {"doc":doc, "doccolor":doccolor })
def document_pdfized(request, name, rev=None, ext=None):
    """Serve the PDF rendering of the single document matching name (and rev).

    The lookup goes through fuzzy_find_documents(). Zero or multiple matches
    raise Http404. A request for an RFC under a name other than the matched
    one is redirected to this same view with the matched name. Otherwise the
    revision is resolved (the matched revision if there is one or the match
    is an RFC, else the document's current revision), the corresponding
    history object is used when a revision is set, and the document's file
    must exist on disk.

    `ext` is accepted so the URL pattern can carry an extension; it is not
    used here.

    Returns an HttpResponse with content type 'application/pdf;charset=utf-8';
    raises Http404 when no PDF could be produced.
    """
    matches = fuzzy_find_documents(name, rev)
    match_count = matches.documents.count()
    if match_count == 0:
        raise Http404("Document not found: %s" % name)
    if match_count > 1:
        raise Http404("Multiple documents matched: %s" % name)

    is_rfc = matches.matched_name.startswith('rfc')
    if is_rfc and name != matches.matched_name:
        return redirect('ietf.doc.views_doc.document_pdfized', name=matches.matched_name)

    doc = matches.documents.get()
    rev = matches.matched_rev if (matches.matched_rev or is_rfc) else doc.rev
    if rev:
        # Prefer a stored history entry for this revision; otherwise build a stand-in.
        doc = doc.history_set.filter(rev=rev).first() or doc.fake_history_obj(rev)

    if not os.path.exists(doc.get_file_name()):
        raise Http404("File not found: %s" % doc.get_file_name())

    pdf = doc.pdfized()
    if not pdf:
        raise Http404
    return HttpResponse(pdf,content_type='application/pdf;charset=utf-8')
def check_doc_email_aliases():
pattern = re.compile(r'^expand-(.*?)(\..*?)?@.*? +(.*)$')
good_count = 0

View file

@ -743,6 +743,13 @@ CACHES = {
'MAX_ENTRIES': 100000, # 100,000
},
},
'pdfized': {
'BACKEND': 'django.core.cache.backends.filebased.FileBasedCache',
'LOCATION': '/a/cache/datatracker/pdfized',
'OPTIONS': {
'MAX_ENTRIES': 100000, # 100,000
},
},
'slowpages': {
'BACKEND': 'django.core.cache.backends.filebased.FileBasedCache',
'LOCATION': '/a/cache/datatracker/slowpages',
@ -755,6 +762,8 @@ CACHES = {
# Settings for the htmlized document rendering.
HTMLIZER_VERSION = 1
HTMLIZER_URL_PREFIX = "/doc/html"
HTMLIZER_CACHE_TIME = 60*60*24*14 # 14 days
# Settings for the pdfized document rendering. Cached PDFs are kept for the
# same period as cached HTML.
PDFIZER_CACHE_TIME = HTMLIZER_CACHE_TIME
# Passed as the link path when converting document text to HTML for PDF
# rendering. Unlike HTMLIZER_URL_PREFIX it is prefixed with IDTRACKER_BASE_URL
# -- presumably so that links inside a downloaded PDF are absolute; confirm.
PDFIZER_URL_PREFIX = IDTRACKER_BASE_URL+"/doc/pdf"
# Email settings
IPR_EMAIL_FROM = 'ietf-ipr@ietf.org'
@ -1273,6 +1282,14 @@ if SERVER_MODE != 'production':
'MAX_ENTRIES': 1000,
},
},
'pdfized': {
'BACKEND': 'django.core.cache.backends.dummy.DummyCache',
#'BACKEND': 'django.core.cache.backends.filebased.FileBasedCache',
'LOCATION': '/var/cache/datatracker/pdfized',
'OPTIONS': {
'MAX_ENTRIES': 1000,
},
},
'slowpages': {
'BACKEND': 'django.core.cache.backends.dummy.DummyCache',
#'BACKEND': 'django.core.cache.backends.filebased.FileBasedCache',

View file

@ -70,6 +70,7 @@ tqdm>=3.7.0
#Trac>=1.0.10,<1.2
Unidecode>=0.4.18,<1.2.0
#wsgiref>=0.1.2
weasyprint>=53.4
xml2rfc>=2.35.0
xym>=0.4.4,!=0.4.7,<1.0
#zxcvbn-python>=4.4.14 # Not needed until we do back-end password entropy validation