# Copyright The IETF Trust 2024, All Rights Reserved
#
# Celery task definitions
#
import datetime
import io
import requests

from celery import shared_task

from django.conf import settings
from django.utils import timezone

from ietf.sync import iana
from ietf.sync import rfceditor
from ietf.sync.rfceditor import MIN_QUEUE_RESULTS, parse_queue, update_drafts_from_queue
from ietf.utils import log
from ietf.utils.timezone import date_today


@shared_task
def rfc_editor_index_update_task(full_index=False):
    """Update metadata from the RFC index

    Default is to examine only changes in the past 365 days. Call with full_index=True to update
    the full RFC index.
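
    For example, to force a full update via Celery's standard task API:

        rfc_editor_index_update_task.delay(full_index=True)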

    According to comments on the original script, a year's worth took about 20s on production as of
    August 2022.

    The original rfc-editor-index-update script had a long-disabled provision for running the
    rebuild_reference_relations scripts after the update. That has not been brought over
    at all because it should be implemented as its own task if it is needed.
    """
    skip_date = None if full_index else date_today() - datetime.timedelta(days=365)
    log.log(
        "Updating document metadata from RFC index going back to {since}, from {url}".format(
            since=skip_date if skip_date is not None else "the beginning",
            url=settings.RFC_EDITOR_INDEX_URL,
        )
    )
    try:
        response = requests.get(
            settings.RFC_EDITOR_INDEX_URL,
            timeout=30,  # seconds
        )
    except requests.Timeout as exc:
        log.log(f'GET request timed out retrieving RFC editor index: {exc}')
        return  # failed
    rfc_index_xml = response.text
    index_data = rfceditor.parse_index(io.StringIO(rfc_index_xml))
    try:
        response = requests.get(
            settings.RFC_EDITOR_ERRATA_JSON_URL,
            timeout=30,  # seconds
        )
    except requests.Timeout as exc:
        log.log(f'GET request timed out retrieving RFC editor errata: {exc}')
        return  # failed
    errata_data = response.json()
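    # Sanity checks: an implausibly short index or errata list likely means a
    # truncated or failed download, so bail out rather than update from it.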
    if len(index_data) < rfceditor.MIN_INDEX_RESULTS:
        log.log("Not enough index entries, only %s" % len(index_data))
        return  # failed
    if len(errata_data) < rfceditor.MIN_ERRATA_RESULTS:
        log.log("Not enough errata entries, only %s" % len(errata_data))
        return  # failed
    for rfc_number, changes, doc, rfc_published in rfceditor.update_docs_from_rfc_index(
        index_data, errata_data, skip_older_than_date=skip_date
    ):
        for c in changes:
            log.log("RFC%s, %s: %s" % (rfc_number, doc.name, c))


@shared_task
def rfc_editor_queue_updates_task():
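    """Update draft states from the RFC Editor queue"""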
log.log(f"Updating RFC Editor queue states from {settings.RFC_EDITOR_QUEUE_URL}")
|
|
try:
|
|
response = requests.get(
|
|
settings.RFC_EDITOR_QUEUE_URL,
|
|
timeout=30, # seconds
|
|
)
|
|
except requests.Timeout as exc:
|
|
log.log(f"GET request timed out retrieving RFC editor queue: {exc}")
|
|
return # failed
|
|
drafts, warnings = parse_queue(io.StringIO(response.text))
|
|
for w in warnings:
|
|
log.log(f"Warning: {w}")
|
|
|
|
if len(drafts) < MIN_QUEUE_RESULTS:
|
|
log.log("Not enough results, only %s" % len(drafts))
|
|
return # failed
|
|
|
|
changed, warnings = update_drafts_from_queue(drafts)
|
|
for w in warnings:
|
|
log.log(f"Warning: {w}")
|
|
|
|
for c in changed:
|
|
log.log(f"Updated {c}")
|
|
|
|
|
|


@shared_task
def iana_changes_update_task():
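    """Fetch recent change records from IANA and add them to document histories"""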
    # Compensate for clock skew so that a change which happened just now is
    # not missed because our request interval is slightly off.
    CLOCK_SKEW_COMPENSATION = 5  # seconds

    # The interface actually accepts a 24-hour window, but that gets into
    # trouble with daylight saving time transitions, so stay at 23 hours.
    MAX_INTERVAL_ACCEPTED_BY_IANA = datetime.timedelta(hours=23)

    start = (
        timezone.now()
        - datetime.timedelta(hours=23)
        + datetime.timedelta(seconds=CLOCK_SKEW_COMPENSATION)
    )
    end = start + datetime.timedelta(hours=23)
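    # Net effect: each run covers the window (now - 23h + 5s) .. (now + 5s).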

    t = start
    while t < end:
        # The IANA server doesn't allow us to fetch more than a certain
        # period, so loop over the requested period and make multiple
        # requests if necessary.
        text = iana.fetch_changes_json(
            settings.IANA_SYNC_CHANGES_URL, t, min(end, t + MAX_INTERVAL_ACCEPTED_BY_IANA)
        )
        log.log(f"Retrieved the JSON: {text}")

        changes = iana.parse_changes_json(text)
        added_events, warnings = iana.update_history_with_changes(
            changes, send_email=True
        )

        for e in added_events:
            log.log(
                f"Added event for {e.doc_id} {e.time}: {e.desc} (parsed json: {e.json})"
            )

        for w in warnings:
            log.log(f"WARNING: {w}")

        t += MAX_INTERVAL_ACCEPTED_BY_IANA


@shared_task
def iana_protocols_update_task():
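    """Update history for RFCs listed on the IANA protocols page"""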
    # Earliest date for which we have data suitable to update (was described as
    # "this needs to be the date where this tool is first deployed" in the original
    # iana-protocols-updates script)
    rfc_must_published_later_than = datetime.datetime(
        2012,
        11,
        26,
        tzinfo=datetime.timezone.utc,
    )

    try:
        response = requests.get(
            settings.IANA_SYNC_PROTOCOLS_URL,
            timeout=30,
        )
    except requests.Timeout as exc:
        log.log(f'GET request timed out retrieving IANA protocols page: {exc}')
        return

    rfc_numbers = iana.parse_protocol_page(response.text)

    def batched(l, n):
        """Split list l up in batches of max size n.
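
        Example:
            >>> list(batched([1, 2, 3, 4, 5], 2))
            [[1, 2], [3, 4], [5]]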

        For Python 3.12 or later, replace this with itertools.batched()
        (note that itertools.batched yields tuples rather than list slices).
        """
        return (l[i:i + n] for i in range(0, len(l), n))

    for batch in batched(rfc_numbers, 100):
        updated = iana.update_rfc_log_from_protocol_page(
            batch,
            rfc_must_published_later_than,
        )
        for d in updated:
            log.log("Added history entry for %s" % d.display_name())