refactor: generate I-D bibxml files via celery (#7426)

* refactor: task to generate_draft_bibxml_files

* test: test task/utility methods

* chore: add periodic task

* chore: remove generate_draft_bibxml_files.py

* chore: further prune /bin/hourly
This commit is contained in:
Jennifer Richards 2024-05-16 13:37:29 -03:00 committed by GitHub
parent de8b3b5ce3
commit ffb9eb12ff
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 195 additions and 108 deletions

View file

@ -5,33 +5,15 @@
# This script is expected to be triggered by cron from
# /etc/cron.d/datatracker
# Force a UTF-8 locale, and UTF-8 stdio encoding for the Python processes
# started below.
export LANG=en_US.UTF-8
export PYTHONIOENCODING=utf-8
# Make sure we stop if something goes wrong:
# ($program is the basename of this script; the ERR trap prints the failing
# line number and exit status, then exits with status 1.)
program=${0##*/}
trap 'echo "$program($LINENO): Command failed with error code $? ([$$] $0 $*)"; exit 1' ERR
# Root of the datatracker checkout; everything below runs relative to it.
DTDIR=/a/www/ietf-datatracker/web
cd $DTDIR/
# Set up the virtual environment
source $DTDIR/env/bin/activate
# Record the start of this run via logger(1), tagged "cron".
logger -p user.info -t cron "Running $DTDIR/bin/hourly"
# Generate some static files
# NOTE(review): ID, DERIVED and DOWNLOAD are not referenced in the visible part
# of this script -- confirm whether anything still needs them.
ID=/a/ietfdata/doc/draft/repository
DERIVED=/a/ietfdata/derived
DOWNLOAD=/a/www/www6s/download
CHARTER=/a/www/ietf-ftp/charter
# Fetch the two WG charter listings from the datatracker into $CHARTER.
wget -q https://datatracker.ietf.org/wg/1wg-charters-by-acronym.txt -O $CHARTER/1wg-charters-by-acronym.txt
wget -q https://datatracker.ietf.org/wg/1wg-charters.txt -O $CHARTER/1wg-charters.txt
# Regenerate the last week of bibxml-ids
# (runs the generate_draft_bibxml_files management command with its defaults)
$DTDIR/ietf/manage.py generate_draft_bibxml_files
# Create and update group wikis
#$DTDIR/ietf/manage.py create_group_wikis
# exit 0

View file

@ -1,84 +0,0 @@
# Copyright The IETF Trust 2012-2020, All Rights Reserved
# -*- coding: utf-8 -*-
import datetime
import io
import os
import re
import sys
from django.conf import settings
from django.core.management.base import BaseCommand
from django.utils import timezone
import debug # pyflakes:ignore
from ietf.doc.models import NewRevisionDocEvent
from ietf.doc.utils import bibxml_for_draft
DEFAULT_DAYS = 7
class Command(BaseCommand):
    """Generate bibxml reference files for Internet-Draft revisions.

    One file named ``reference.I-D.<name>-<rev>.xml`` is written per
    "new_revision" event on a draft, under ``<BIBXML_BASE_PATH>/bibxml-ids``.
    By default only events from the last DEFAULT_DAYS days are processed;
    ``--days`` changes the window and ``--all`` removes it.
    """

    help = ('Generate draft bibxml files for xml2rfc references, placing them in the '
            'directory configured in settings.BIBXML_BASE_PATH: %s. '
            'By default, generate files as needed for new Internet-Draft revisions from the '
            'last %s days.' % (settings.BIBXML_BASE_PATH, DEFAULT_DAYS))

    def add_arguments(self, parser):
        """Register the --all and --days command line options."""
        parser.add_argument('--all', action='store_true', default=False, help="Process all documents, not only recent submissions")
        parser.add_argument('--days', type=int, default=DEFAULT_DAYS, help="Look at submissions from the last DAYS days, instead of %s" % DEFAULT_DAYS)

    def _emit(self, threshold, msg):
        """Write msg plus a newline to stdout when verbosity exceeds threshold."""
        if self.verbosity > threshold:
            sys.stdout.write(msg)
            sys.stdout.write('\n')

    def say(self, msg):
        """Print msg when verbosity is greater than 0."""
        self._emit(0, msg)

    def note(self, msg):
        """Print msg when verbosity is greater than 1."""
        self._emit(1, msg)

    def mutter(self, msg):
        """Print msg when verbosity is greater than 2."""
        self._emit(2, msg)

    def write(self, fn, new):
        """Write text `new` to file `fn` unless the file already holds it.

        Line endings in `new` are normalized to "\\n" first. Old and new
        content are compared with leading/trailing whitespace stripped, so a
        whitespace-only difference does not cause a rewrite. A file that
        cannot be read is treated as empty.
        """
        # normalize new
        new = re.sub(r'\r\n?', r'\n', new)
        try:
            with io.open(fn, encoding='utf-8') as f:
                old = f.read()
        except IOError:
            old = ""
        if old.strip() != new.strip():
            self.note('Writing %s' % os.path.basename(fn))
            with io.open(fn, "w", encoding='utf-8') as f:
                f.write(new)

    def handle(self, *args, **options):
        """Generate the reference files for the selected new-revision events.

        Errors for an individual revision are reported on stderr and do not
        stop processing of the remaining events.
        """
        self.verbosity = options.get("verbosity", 1)
        process_all = options.get("all")
        days = options.get("days")
        #
        bibxmldir = os.path.join(settings.BIBXML_BASE_PATH, 'bibxml-ids')
        # exist_ok avoids the check-then-create race of a separate exists() test
        os.makedirs(bibxmldir, exist_ok=True)
        #
        if process_all:
            doc_events = NewRevisionDocEvent.objects.filter(type='new_revision', doc__type_id='draft')
        else:
            start = timezone.now() - datetime.timedelta(days=days)
            doc_events = NewRevisionDocEvent.objects.filter(type='new_revision', doc__type_id='draft', time__gte=start)
        doc_events = doc_events.order_by('time')

        for e in doc_events:
            doc = e.doc
            self.mutter('%s %s' % (e.time, doc.name))
            try:
                bibxml = bibxml_for_draft(doc, e.rev)
                ref_rev_file_name = os.path.join(bibxmldir, 'reference.I-D.%s-%s.xml' % (doc.name, e.rev))
                self.write(ref_rev_file_name, bibxml)
            except Exception as ee:
                # Report the revision being processed (e.rev); doc.rev is the
                # document's current revision and may name a different file.
                sys.stderr.write('\n%s-%s: %s\n' % (doc.name, e.rev, ee))

View file

@ -9,6 +9,7 @@ from celery import shared_task
from pathlib import Path
from django.conf import settings
from django.utils import timezone
from ietf.utils import log
from ietf.utils.timezone import datetime_today
@ -24,8 +25,13 @@ from .expire import (
send_expire_warning_for_draft,
)
from .lastcall import get_expired_last_calls, expire_last_call
from .models import Document
from .utils import generate_idnits2_rfc_status, generate_idnits2_rfcs_obsoleted
from .models import Document, NewRevisionDocEvent
from .utils import (
generate_idnits2_rfc_status,
generate_idnits2_rfcs_obsoleted,
update_or_create_draft_bibxml_file,
ensure_draft_bibxml_path_exists,
)
@shared_task
@ -90,3 +96,24 @@ def generate_idnits2_rfcs_obsoleted_task():
outpath.write_text(blob, encoding="utf8")
except Exception as e:
log.log(f"failed to write idnits2-rfcs-obsoleted: {e}")
@shared_task
def generate_draft_bibxml_files_task(days=7, process_all=False):
    """Write bibxml reference files for Internet-Draft revisions.

    Walks the "new_revision" events of draft-type documents, oldest first, and
    calls update_or_create_draft_bibxml_file() for each one.

    days: size of the look-back window, in days; ignored when process_all is true.
    process_all: when true, every matching event is processed regardless of age.

    An exception raised for one event is logged and the loop carries on with
    the next event.
    """
    ensure_draft_bibxml_path_exists()
    criteria = {"type": "new_revision", "doc__type_id": "draft"}
    if not process_all:
        # restrict to events inside the look-back window
        criteria["time__gte"] = timezone.now() - datetime.timedelta(days=days)
    for rev_event in NewRevisionDocEvent.objects.filter(**criteria).order_by("time"):
        try:
            update_or_create_draft_bibxml_file(rev_event.doc, rev_event.rev)
        except Exception as err:
            log.log(f"Error generating bibxml for {rev_event.doc.name}-{rev_event.rev}: {err}")

View file

@ -1,18 +1,21 @@
# Copyright The IETF Trust 2024, All Rights Reserved
import datetime
import mock
from pathlib import Path
from django.conf import settings
from django.utils import timezone
from ietf.utils.test_utils import TestCase
from ietf.utils.timezone import datetime_today
from .factories import DocumentFactory
from .models import Document
from .factories import DocumentFactory, NewRevisionDocEventFactory
from .models import Document, NewRevisionDocEvent
from .tasks import (
expire_ids_task,
expire_last_calls_task,
generate_draft_bibxml_files_task,
generate_idnits2_rfcs_obsoleted_task,
generate_idnits2_rfc_status_task,
notify_expirations_task,
@ -114,3 +117,86 @@ class TaskTests(TestCase):
"dåtå".encode("utf8"),
(Path(settings.DERIVED_DIR) / "idnits2-rfcs-obsoleted").read_bytes(),
)
@mock.patch("ietf.doc.tasks.ensure_draft_bibxml_path_exists")
@mock.patch("ietf.doc.tasks.update_or_create_draft_bibxml_file")
def test_generate_draft_bibxml_files_task(self, mock_create, mock_ensure_path):
    """The task selects events by type, document type and age, and survives errors.

    Both helpers the task calls are patched out, so this only checks which
    (doc, rev) pairs the task hands to update_or_create_draft_bibxml_file and
    that the output directory check is always made.
    """
    now = timezone.now()
    very_old_event = NewRevisionDocEventFactory(
        time=now - datetime.timedelta(days=1000), rev="17"
    )
    old_event = NewRevisionDocEventFactory(
        time=now - datetime.timedelta(days=8), rev="03"
    )
    young_event = NewRevisionDocEventFactory(
        time=now - datetime.timedelta(days=6), rev="06"
    )
    # a couple that should always be ignored
    NewRevisionDocEventFactory(
        time=now - datetime.timedelta(days=6), rev="09", doc__type_id="rfc"  # not a draft
    )
    # This one must belong to a draft: if it were an RFC as well, the draft
    # filter alone would exclude it and the event-type filter would go untested.
    NewRevisionDocEventFactory(
        type="changed_document",  # not a "new_revision" type
        time=now - datetime.timedelta(days=6),
        rev="09",
        doc__type_id="draft",
    )

    # Get rid of the "00" events created by the factories -- they're just noise for this test
    NewRevisionDocEvent.objects.filter(rev="00").delete()

    # default args - look back 7 days
    generate_draft_bibxml_files_task()
    self.assertTrue(mock_ensure_path.called)
    self.assertCountEqual(
        mock_create.call_args_list, [mock.call(young_event.doc, young_event.rev)]
    )
    mock_create.reset_mock()
    mock_ensure_path.reset_mock()

    # shorter lookback
    generate_draft_bibxml_files_task(days=5)
    self.assertTrue(mock_ensure_path.called)
    self.assertCountEqual(mock_create.call_args_list, [])
    mock_create.reset_mock()
    mock_ensure_path.reset_mock()

    # longer lookback
    generate_draft_bibxml_files_task(days=9)
    self.assertTrue(mock_ensure_path.called)
    self.assertCountEqual(
        mock_create.call_args_list,
        [
            mock.call(young_event.doc, young_event.rev),
            mock.call(old_event.doc, old_event.rev),
        ],
    )
    mock_create.reset_mock()
    mock_ensure_path.reset_mock()

    # everything
    generate_draft_bibxml_files_task(process_all=True)
    self.assertTrue(mock_ensure_path.called)
    self.assertCountEqual(
        mock_create.call_args_list,
        [
            mock.call(young_event.doc, young_event.rev),
            mock.call(old_event.doc, old_event.rev),
            mock.call(very_old_event.doc, very_old_event.rev),
        ],
    )
    mock_create.reset_mock()
    mock_ensure_path.reset_mock()

    # everything should still be tried, even if there's an exception
    mock_create.side_effect = RuntimeError
    generate_draft_bibxml_files_task(process_all=True)
    self.assertTrue(mock_ensure_path.called)
    self.assertCountEqual(
        mock_create.call_args_list,
        [
            mock.call(young_event.doc, young_event.rev),
            mock.call(old_event.doc, old_event.rev),
            mock.call(very_old_event.doc, very_old_event.rev),
        ],
    )

View file

@ -2,8 +2,10 @@
import datetime
import debug # pyflakes:ignore
from unittest.mock import patch
from pathlib import Path
from unittest.mock import call, patch
from django.conf import settings
from django.db import IntegrityError
from django.test.utils import override_settings
from django.utils import timezone
@ -16,7 +18,8 @@ from ietf.person.models import Person
from ietf.doc.factories import DocumentFactory, WgRfcFactory, WgDraftFactory
from ietf.doc.models import State, DocumentActionHolder, DocumentAuthor
from ietf.doc.utils import (update_action_holders, add_state_change_event, update_documentauthors,
fuzzy_find_documents, rebuild_reference_relations, build_file_urls)
fuzzy_find_documents, rebuild_reference_relations, build_file_urls,
ensure_draft_bibxml_path_exists, update_or_create_draft_bibxml_file)
from ietf.utils.draft import Draft, PlaintextDraft
from ietf.utils.xmldraft import XMLDraft
@ -484,3 +487,49 @@ class RebuildReferenceRelationsTests(TestCase):
(self.updated.name, 'updates'),
]
)
class DraftBibxmlTests(TestCase):
    """Tests for the draft bibxml file helpers imported from ietf.doc.utils."""

    # NOTE(review): presumably this makes the base TestCase point
    # BIBXML_BASE_PATH at a fresh temporary directory for each test -- confirm
    # against ietf.utils.test_utils.TestCase.
    settings_temp_path_overrides = TestCase.settings_temp_path_overrides + ["BIBXML_BASE_PATH"]

    @staticmethod
    def _bibxml_ids_dir():
        """Return the directory the helpers are expected to write into."""
        return Path(settings.BIBXML_BASE_PATH) / "bibxml-ids"

    def _fresh_reference_path(self, doc, rev):
        """Create the bibxml-ids directory and return the reference file path for doc/rev."""
        ids_dir = self._bibxml_ids_dir()
        ids_dir.mkdir(exist_ok=False)  # expect to start with a clean slate
        return ids_dir / f"reference.I-D.{doc.name}-{rev}.xml"

    def _assert_generated_once(self, mock, doc, rev):
        """Check that bibxml_for_draft was called exactly once, with (doc, rev)."""
        self.assertEqual(mock.call_count, 1)
        self.assertEqual(mock.call_args, call(doc, rev))

    def test_ensure_draft_bibxml_path_exists(self):
        target = self._bibxml_ids_dir()
        self.assertFalse(target.exists())
        ensure_draft_bibxml_path_exists()
        self.assertTrue(target.is_dir())  # false if does not exist or is not dir

    @patch("ietf.doc.utils.bibxml_for_draft", return_value="This\ris\nmy\r\nbibxml")
    def test_create_draft_bibxml_file(self, mock):
        doc = DocumentFactory()
        ref_path = self._fresh_reference_path(doc, "26")  # we're pretending it's rev 26
        update_or_create_draft_bibxml_file(doc, "26")
        self._assert_generated_once(mock, doc, "26")
        self.assertEqual(ref_path.read_text(), "This\nis\nmy\nbibxml")

    @patch("ietf.doc.utils.bibxml_for_draft", return_value="This\ris\nmy\r\nbibxml")
    def test_update_draft_bibxml_file(self, mock):
        doc = DocumentFactory()
        ref_path = self._fresh_reference_path(doc, "26")  # we're pretending it's rev 26
        ref_path.write_text("Old data")

        # should replace it
        update_or_create_draft_bibxml_file(doc, "26")
        self._assert_generated_once(mock, doc, "26")
        self.assertEqual(ref_path.read_text(), "This\nis\nmy\nbibxml")

        # should leave it alone if it differs only by leading/trailing whitespace
        mock.reset_mock()
        mock.return_value = "  \n   This\nis\nmy\nbibxml   "
        update_or_create_draft_bibxml_file(doc, "26")
        self._assert_generated_once(mock, doc, "26")
        self.assertEqual(ref_path.read_text(), "This\nis\nmy\nbibxml")

View file

@ -1413,3 +1413,20 @@ def investigate_fragment(name_fragment):
unverifiable_collections=unverifiable_collections,
unexpected=unexpected,
)
def update_or_create_draft_bibxml_file(doc, rev):
    """Bring the bibxml reference file for revision `rev` of `doc` up to date.

    The bibxml produced by bibxml_for_draft() has its line endings normalized
    to "\\n" and is written to
    <BIBXML_BASE_PATH>/bibxml-ids/reference.I-D.<doc.name>-<rev>.xml.
    The file is left untouched when it already holds the same content, ignoring
    leading and trailing whitespace; an unreadable or missing file counts as empty.
    """
    # The condition is passed as a string and names the `doc` parameter;
    # presumably log.assertion evaluates it against this frame -- TODO confirm.
    log.assertion("doc.type_id == 'draft'")
    fresh_bibxml = re.sub(r"\r\n?", r"\n", bibxml_for_draft(doc, rev))
    target = Path(
        settings.BIBXML_BASE_PATH,
        "bibxml-ids",
        f"reference.I-D.{doc.name}-{rev}.xml",
    )
    try:
        on_disk = target.read_text(encoding="utf8")
    except IOError:
        on_disk = ""
    if fresh_bibxml.strip() == on_disk.strip():
        return  # nothing but (at most) surrounding whitespace differs
    log.log(f"Writing {target}")
    target.write_text(fresh_bibxml, encoding="utf8")
def ensure_draft_bibxml_path_exists():
    """Create the <BIBXML_BASE_PATH>/bibxml-ids directory if it is missing.

    Missing parent directories are created as well, so a BIBXML_BASE_PATH that
    does not exist yet no longer makes this raise FileNotFoundError. An
    already-existing directory is not an error.
    """
    (Path(settings.BIBXML_BASE_PATH) / "bibxml-ids").mkdir(parents=True, exist_ok=True)

View file

@ -221,6 +221,16 @@ class Command(BaseCommand):
),
)
PeriodicTask.objects.get_or_create(
name="Generate I-D bibxml files",
task="ietf.doc.tasks.generate_draft_bibxml_files_task",
defaults=dict(
enabled=False,
crontab=self.crontabs["hourly"],
description="Generate draft bibxml files for the last week's drafts",
),
)
def show_tasks(self):
for label, crontab in self.crontabs.items():
tasks = PeriodicTask.objects.filter(crontab=crontab).order_by(