refactor: generate I-D bibxml files via celery (#7426)

* refactor: task to generate_draft_bibxml_files

* test: test task/utility methods

* chore: add periodic task

* chore: remove generate_draft_bibxml_files.py

* chore: further prune /bin/hourly
This commit is contained in:
Jennifer Richards 2024-05-16 13:37:29 -03:00 committed by GitHub
parent de8b3b5ce3
commit ffb9eb12ff
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 195 additions and 108 deletions

View file

@ -5,33 +5,15 @@
# This script is expected to be triggered by cron from # This script is expected to be triggered by cron from
# /etc/cron.d/datatracker # /etc/cron.d/datatracker
export LANG=en_US.UTF-8 export LANG=en_US.UTF-8
export PYTHONIOENCODING=utf-8
# Make sure we stop if something goes wrong: # Make sure we stop if something goes wrong:
program=${0##*/} program=${0##*/}
trap 'echo "$program($LINENO): Command failed with error code $? ([$$] $0 $*)"; exit 1' ERR trap 'echo "$program($LINENO): Command failed with error code $? ([$$] $0 $*)"; exit 1' ERR
DTDIR=/a/www/ietf-datatracker/web
cd $DTDIR/
# Set up the virtual environment
source $DTDIR/env/bin/activate
logger -p user.info -t cron "Running $DTDIR/bin/hourly" logger -p user.info -t cron "Running $DTDIR/bin/hourly"
# Generate some static files
ID=/a/ietfdata/doc/draft/repository
DERIVED=/a/ietfdata/derived
DOWNLOAD=/a/www/www6s/download
CHARTER=/a/www/ietf-ftp/charter CHARTER=/a/www/ietf-ftp/charter
wget -q https://datatracker.ietf.org/wg/1wg-charters-by-acronym.txt -O $CHARTER/1wg-charters-by-acronym.txt wget -q https://datatracker.ietf.org/wg/1wg-charters-by-acronym.txt -O $CHARTER/1wg-charters-by-acronym.txt
wget -q https://datatracker.ietf.org/wg/1wg-charters.txt -O $CHARTER/1wg-charters.txt wget -q https://datatracker.ietf.org/wg/1wg-charters.txt -O $CHARTER/1wg-charters.txt
# Regenerate the last week of bibxml-ids
$DTDIR/ietf/manage.py generate_draft_bibxml_files
# Create and update group wikis
#$DTDIR/ietf/manage.py create_group_wikis
# exit 0 # exit 0

View file

@ -1,84 +0,0 @@
# Copyright The IETF Trust 2012-2020, All Rights Reserved
# -*- coding: utf-8 -*-
import datetime
import io
import os
import re
import sys
from django.conf import settings
from django.core.management.base import BaseCommand
from django.utils import timezone
import debug # pyflakes:ignore
from ietf.doc.models import NewRevisionDocEvent
from ietf.doc.utils import bibxml_for_draft
# Default look-back window, in days: used as the default for the --days option
# and quoted in the command's help text.
DEFAULT_DAYS = 7
class Command(BaseCommand):
    """Write bibxml reference files for Internet-Draft revisions.

    One file named ``reference.I-D.<name>-<rev>.xml`` is produced per
    "new_revision" event on a draft, inside the ``bibxml-ids`` directory under
    settings.BIBXML_BASE_PATH. A file is only rewritten when its content
    differs from what is already on disk (ignoring leading/trailing whitespace).
    """

    help = ('Generate draft bibxml files for xml2rfc references, placing them in the '
            'directory configured in settings.BIBXML_BASE_PATH: %s. '
            'By default, generate files as needed for new Internet-Draft revisions from the '
            'last %s days.' % (settings.BIBXML_BASE_PATH, DEFAULT_DAYS))

    def add_arguments(self, parser):
        """Register the --all and --days command line options."""
        parser.add_argument('--all', action='store_true', default=False, help="Process all documents, not only recent submissions")
        parser.add_argument('--days', type=int, default=DEFAULT_DAYS, help="Look submissions from the last DAYS days, instead of %s" % DEFAULT_DAYS)

    def _emit(self, threshold, msg):
        """Print msg and a newline to stdout when self.verbosity exceeds threshold."""
        if self.verbosity > threshold:
            sys.stdout.write(msg)
            sys.stdout.write('\n')

    def say(self, msg):
        """Print msg when verbosity is greater than 0."""
        self._emit(0, msg)

    def note(self, msg):
        """Print msg when verbosity is greater than 1."""
        self._emit(1, msg)

    def mutter(self, msg):
        """Print msg when verbosity is greater than 2."""
        self._emit(2, msg)

    def write(self, fn, new):
        """Write the text ``new`` to the file ``fn`` unless it already holds that content.

        Line endings in ``new`` are normalized to "\\n" first. The comparison
        with the existing file ignores leading and trailing whitespace; an
        unreadable or missing file is treated as empty.
        """
        # normalize new
        new = re.sub(r'\r\n?', r'\n', new)
        try:
            with io.open(fn, encoding='utf-8') as f:
                old = f.read()
        except IOError:
            old = ""
        if old.strip() != new.strip():
            self.note('Writing %s' % os.path.basename(fn))
            with io.open(fn, "w", encoding='utf-8') as f:
                f.write(new)

    def handle(self, *args, **options):
        """Generate bibxml files for the selected "new_revision" events, oldest first.

        With --all every draft revision is processed; otherwise only those
        from the last --days days. An error on one revision is reported on
        stderr and processing continues with the next one.
        """
        self.verbosity = options.get("verbosity", 1)
        process_all = options.get("all")
        days = options.get("days")

        bibxmldir = os.path.join(settings.BIBXML_BASE_PATH, 'bibxml-ids')
        if not os.path.exists(bibxmldir):
            os.makedirs(bibxmldir)

        if process_all:
            doc_events = NewRevisionDocEvent.objects.filter(type='new_revision', doc__type_id='draft')
        else:
            start = timezone.now() - datetime.timedelta(days=days)
            doc_events = NewRevisionDocEvent.objects.filter(type='new_revision', doc__type_id='draft', time__gte=start)
        doc_events = doc_events.order_by('time')

        for e in doc_events:
            # Bind doc before the try block so the error report below can never
            # see an unbound or stale (previous iteration's) document.
            doc = e.doc
            self.mutter('%s %s' % (e.time, doc.name))
            try:
                bibxml = bibxml_for_draft(doc, e.rev)
                ref_rev_file_name = os.path.join(bibxmldir, 'reference.I-D.%s-%s.xml' % (doc.name, e.rev))
                self.write(ref_rev_file_name, bibxml)
            except Exception as ee:
                # Report the revision being generated (e.rev), not the
                # document's current revision (doc.rev), which may be newer.
                sys.stderr.write('\n%s-%s: %s\n' % (doc.name, e.rev, ee))

View file

@ -9,6 +9,7 @@ from celery import shared_task
from pathlib import Path from pathlib import Path
from django.conf import settings from django.conf import settings
from django.utils import timezone
from ietf.utils import log from ietf.utils import log
from ietf.utils.timezone import datetime_today from ietf.utils.timezone import datetime_today
@ -24,8 +25,13 @@ from .expire import (
send_expire_warning_for_draft, send_expire_warning_for_draft,
) )
from .lastcall import get_expired_last_calls, expire_last_call from .lastcall import get_expired_last_calls, expire_last_call
from .models import Document from .models import Document, NewRevisionDocEvent
from .utils import generate_idnits2_rfc_status, generate_idnits2_rfcs_obsoleted from .utils import (
generate_idnits2_rfc_status,
generate_idnits2_rfcs_obsoleted,
update_or_create_draft_bibxml_file,
ensure_draft_bibxml_path_exists,
)
@shared_task @shared_task
@ -90,3 +96,24 @@ def generate_idnits2_rfcs_obsoleted_task():
outpath.write_text(blob, encoding="utf8") outpath.write_text(blob, encoding="utf8")
except Exception as e: except Exception as e:
log.log(f"failed to write idnits2-rfcs-obsoleted: {e}") log.log(f"failed to write idnits2-rfcs-obsoleted: {e}")
@shared_task
def generate_draft_bibxml_files_task(days=7, process_all=False):
    """Write bibxml reference files for draft revisions.

    Considers "new_revision" events on draft documents, oldest first. Unless
    process_all is true, only events from the last `days` days are included.
    A failure on one revision is logged and the remaining ones are still tried.
    """
    ensure_draft_bibxml_path_exists()
    criteria = {"type": "new_revision", "doc__type_id": "draft"}
    if not process_all:
        criteria["time__gte"] = timezone.now() - datetime.timedelta(days=days)
    revision_events = NewRevisionDocEvent.objects.filter(**criteria).order_by("time")
    for revision_event in revision_events:
        try:
            update_or_create_draft_bibxml_file(revision_event.doc, revision_event.rev)
        except Exception as err:
            log.log(f"Error generating bibxml for {revision_event.doc.name}-{revision_event.rev}: {err}")

View file

@ -1,18 +1,21 @@
# Copyright The IETF Trust 2024, All Rights Reserved # Copyright The IETF Trust 2024, All Rights Reserved
import datetime
import mock import mock
from pathlib import Path from pathlib import Path
from django.conf import settings from django.conf import settings
from django.utils import timezone
from ietf.utils.test_utils import TestCase from ietf.utils.test_utils import TestCase
from ietf.utils.timezone import datetime_today from ietf.utils.timezone import datetime_today
from .factories import DocumentFactory from .factories import DocumentFactory, NewRevisionDocEventFactory
from .models import Document from .models import Document, NewRevisionDocEvent
from .tasks import ( from .tasks import (
expire_ids_task, expire_ids_task,
expire_last_calls_task, expire_last_calls_task,
generate_draft_bibxml_files_task,
generate_idnits2_rfcs_obsoleted_task, generate_idnits2_rfcs_obsoleted_task,
generate_idnits2_rfc_status_task, generate_idnits2_rfc_status_task,
notify_expirations_task, notify_expirations_task,
@ -114,3 +117,86 @@ class TaskTests(TestCase):
"dåtå".encode("utf8"), "dåtå".encode("utf8"),
(Path(settings.DERIVED_DIR) / "idnits2-rfcs-obsoleted").read_bytes(), (Path(settings.DERIVED_DIR) / "idnits2-rfcs-obsoleted").read_bytes(),
) )
@mock.patch("ietf.doc.tasks.ensure_draft_bibxml_path_exists")
@mock.patch("ietf.doc.tasks.update_or_create_draft_bibxml_file")
def test_generate_draft_bibxml_files_task(self, mock_create, mock_ensure_path):
    # Exercises the look-back window, the process_all switch, the two queryset
    # filters (event type and document type) and the continue-on-error behavior.
    now = timezone.now()
    very_old_event = NewRevisionDocEventFactory(
        time=now - datetime.timedelta(days=1000), rev="17"
    )
    old_event = NewRevisionDocEventFactory(
        time=now - datetime.timedelta(days=8), rev="03"
    )
    young_event = NewRevisionDocEventFactory(
        time=now - datetime.timedelta(days=6), rev="06"
    )
    # a couple that should always be ignored
    NewRevisionDocEventFactory(
        time=now - datetime.timedelta(days=6), rev="09", doc__type_id="rfc"  # not a draft
    )
    # This one is attached to a draft so that only its event type can exclude
    # it; otherwise the doc-type filter would mask a broken event-type filter.
    NewRevisionDocEventFactory(
        type="changed_document",  # not a "new_revision" type
        time=now - datetime.timedelta(days=6),
        rev="09",
        doc__type_id="draft",
    )
    # Get rid of the "00" events created by the factories -- they're just noise for this test
    NewRevisionDocEvent.objects.filter(rev="00").delete()

    # default args - look back 7 days
    generate_draft_bibxml_files_task()
    self.assertTrue(mock_ensure_path.called)
    self.assertCountEqual(
        mock_create.call_args_list, [mock.call(young_event.doc, young_event.rev)]
    )
    mock_create.reset_mock()
    mock_ensure_path.reset_mock()

    # shorter lookback
    generate_draft_bibxml_files_task(days=5)
    self.assertTrue(mock_ensure_path.called)
    self.assertCountEqual(mock_create.call_args_list, [])
    mock_create.reset_mock()
    mock_ensure_path.reset_mock()

    # longer lookback
    generate_draft_bibxml_files_task(days=9)
    self.assertTrue(mock_ensure_path.called)
    self.assertCountEqual(
        mock_create.call_args_list,
        [
            mock.call(young_event.doc, young_event.rev),
            mock.call(old_event.doc, old_event.rev),
        ],
    )
    mock_create.reset_mock()
    mock_ensure_path.reset_mock()

    # everything
    generate_draft_bibxml_files_task(process_all=True)
    self.assertTrue(mock_ensure_path.called)
    self.assertCountEqual(
        mock_create.call_args_list,
        [
            mock.call(young_event.doc, young_event.rev),
            mock.call(old_event.doc, old_event.rev),
            mock.call(very_old_event.doc, very_old_event.rev),
        ],
    )
    mock_create.reset_mock()
    mock_ensure_path.reset_mock()

    # everything should still be tried, even if there's an exception
    mock_create.side_effect = RuntimeError
    generate_draft_bibxml_files_task(process_all=True)
    self.assertTrue(mock_ensure_path.called)
    self.assertCountEqual(
        mock_create.call_args_list,
        [
            mock.call(young_event.doc, young_event.rev),
            mock.call(old_event.doc, old_event.rev),
            mock.call(very_old_event.doc, very_old_event.rev),
        ],
    )

View file

@ -2,8 +2,10 @@
import datetime import datetime
import debug # pyflakes:ignore import debug # pyflakes:ignore
from unittest.mock import patch from pathlib import Path
from unittest.mock import call, patch
from django.conf import settings
from django.db import IntegrityError from django.db import IntegrityError
from django.test.utils import override_settings from django.test.utils import override_settings
from django.utils import timezone from django.utils import timezone
@ -16,7 +18,8 @@ from ietf.person.models import Person
from ietf.doc.factories import DocumentFactory, WgRfcFactory, WgDraftFactory from ietf.doc.factories import DocumentFactory, WgRfcFactory, WgDraftFactory
from ietf.doc.models import State, DocumentActionHolder, DocumentAuthor from ietf.doc.models import State, DocumentActionHolder, DocumentAuthor
from ietf.doc.utils import (update_action_holders, add_state_change_event, update_documentauthors, from ietf.doc.utils import (update_action_holders, add_state_change_event, update_documentauthors,
fuzzy_find_documents, rebuild_reference_relations, build_file_urls) fuzzy_find_documents, rebuild_reference_relations, build_file_urls,
ensure_draft_bibxml_path_exists, update_or_create_draft_bibxml_file)
from ietf.utils.draft import Draft, PlaintextDraft from ietf.utils.draft import Draft, PlaintextDraft
from ietf.utils.xmldraft import XMLDraft from ietf.utils.xmldraft import XMLDraft
@ -484,3 +487,49 @@ class RebuildReferenceRelationsTests(TestCase):
(self.updated.name, 'updates'), (self.updated.name, 'updates'),
] ]
) )
class DraftBibxmlTests(TestCase):
    """Tests for the helpers that create and refresh draft bibxml reference files."""

    # Extend the list inherited from TestCase with BIBXML_BASE_PATH.
    settings_temp_path_overrides = TestCase.settings_temp_path_overrides + ["BIBXML_BASE_PATH"]

    def _new_bibxml_dir(self):
        """Create the bibxml-ids directory, failing if it already exists, and return it."""
        ids_dir = Path(settings.BIBXML_BASE_PATH) / "bibxml-ids"
        ids_dir.mkdir(exist_ok=False)  # expect to start with a clean slate
        return ids_dir

    def _assert_generated_once(self, mock_bibxml_for_draft, doc, rev):
        """Check that bibxml_for_draft was called exactly once, with (doc, rev)."""
        self.assertEqual(mock_bibxml_for_draft.call_count, 1)
        self.assertEqual(mock_bibxml_for_draft.call_args, call(doc, rev))

    def test_ensure_draft_bibxml_path_exists(self):
        ids_dir = Path(settings.BIBXML_BASE_PATH) / "bibxml-ids"
        self.assertFalse(ids_dir.exists())
        ensure_draft_bibxml_path_exists()
        self.assertTrue(ids_dir.is_dir())  # false if does not exist or is not dir

    @patch("ietf.doc.utils.bibxml_for_draft", return_value="This\ris\nmy\r\nbibxml")
    def test_create_draft_bibxml_file(self, mock_bibxml_for_draft):
        ids_dir = self._new_bibxml_dir()
        doc = DocumentFactory()
        # we're pretending it's rev 26
        target = ids_dir / f"reference.I-D.{doc.name}-26.xml"

        update_or_create_draft_bibxml_file(doc, "26")

        self._assert_generated_once(mock_bibxml_for_draft, doc, "26")
        self.assertEqual(target.read_text(), "This\nis\nmy\nbibxml")

    @patch("ietf.doc.utils.bibxml_for_draft", return_value="This\ris\nmy\r\nbibxml")
    def test_update_draft_bibxml_file(self, mock_bibxml_for_draft):
        ids_dir = self._new_bibxml_dir()
        doc = DocumentFactory()
        # we're pretending it's rev 26
        target = ids_dir / f"reference.I-D.{doc.name}-26.xml"
        target.write_text("Old data")

        # stale content should be replaced
        update_or_create_draft_bibxml_file(doc, "26")
        self._assert_generated_once(mock_bibxml_for_draft, doc, "26")
        self.assertEqual(target.read_text(), "This\nis\nmy\nbibxml")

        # content differing only by leading/trailing whitespace should be left alone
        mock_bibxml_for_draft.reset_mock()
        mock_bibxml_for_draft.return_value = "  \n This\nis\nmy\nbibxml  "
        update_or_create_draft_bibxml_file(doc, "26")
        self._assert_generated_once(mock_bibxml_for_draft, doc, "26")
        self.assertEqual(target.read_text(), "This\nis\nmy\nbibxml")

View file

@ -1413,3 +1413,20 @@ def investigate_fragment(name_fragment):
unverifiable_collections=unverifiable_collections, unverifiable_collections=unverifiable_collections,
unexpected=unexpected, unexpected=unexpected,
) )
def update_or_create_draft_bibxml_file(doc, rev):
    """Write the bibxml reference file for revision `rev` of draft `doc` if needed.

    The bibxml text has its line endings normalized to "\\n" and is written to
    reference.I-D.<name>-<rev>.xml in the "bibxml-ids" directory under
    settings.BIBXML_BASE_PATH. Nothing is written when the existing file
    already matches, ignoring leading/trailing whitespace; a file that cannot
    be read is treated as empty.
    """
    log.assertion("doc.type_id == 'draft'")
    fresh_text = re.sub(r"\r\n?", r"\n", bibxml_for_draft(doc, rev))
    target = Path(settings.BIBXML_BASE_PATH, "bibxml-ids", f"reference.I-D.{doc.name}-{rev}.xml")
    try:
        current_text = target.read_text(encoding="utf8")
    except IOError:
        current_text = ""
    if fresh_text.strip() == current_text.strip():
        return
    log.log(f"Writing {target}")
    target.write_text(fresh_text, encoding="utf8")
def ensure_draft_bibxml_path_exists():
    """Make sure the "bibxml-ids" directory under settings.BIBXML_BASE_PATH exists.

    Missing parent directories, including BIBXML_BASE_PATH itself, are created
    as well, so a base path that has not been provisioned yet does not raise
    FileNotFoundError. Does nothing when the directory is already present.
    """
    (Path(settings.BIBXML_BASE_PATH) / "bibxml-ids").mkdir(parents=True, exist_ok=True)

View file

@ -221,6 +221,16 @@ class Command(BaseCommand):
), ),
) )
PeriodicTask.objects.get_or_create(
name="Generate I-D bibxml files",
task="ietf.doc.tasks.generate_draft_bibxml_files_task",
defaults=dict(
enabled=False,
crontab=self.crontabs["hourly"],
description="Generate draft bibxml files for the last week's drafts",
),
)
def show_tasks(self): def show_tasks(self):
for label, crontab in self.crontabs.items(): for label, crontab in self.crontabs.items():
tasks = PeriodicTask.objects.filter(crontab=crontab).order_by( tasks = PeriodicTask.objects.filter(crontab=crontab).order_by(