diff --git a/ietf/bin/rfc-editor-index-updates b/ietf/bin/rfc-editor-index-updates new file mode 100755 index 000000000..1d2c9ee6c --- /dev/null +++ b/ietf/bin/rfc-editor-index-updates @@ -0,0 +1,30 @@ +#!/usr/bin/env python + +import os, sys, re, json, datetime +import syslog + +syslog.openlog(os.path.basename(__file__), syslog.LOG_PID, syslog.LOG_LOCAL0) + +# boilerplate +basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +sys.path = [ basedir ] + sys.path + +from ietf import settings +from django.core import management +management.setup_environ(settings) + + +from ietf.sync.rfceditor import * + +syslog.syslog("Updating document metadata from RFC index from %s" % INDEX_URL) + +response = fetch_index_xml(INDEX_URL) +data = parse_index(response) + +if len(data) < MIN_INDEX_RESULTS: + syslog.syslog("Not enough results, only %s" % len(data)) + sys.exit(1) + +changed = update_docs_from_rfc_index(data) +for c in changed: + syslog.syslog(c) diff --git a/ietf/bin/rfc-editor-queue-updates b/ietf/bin/rfc-editor-queue-updates new file mode 100755 index 000000000..22562a629 --- /dev/null +++ b/ietf/bin/rfc-editor-queue-updates @@ -0,0 +1,35 @@ +#!/usr/bin/env python + +import os, sys, re, json, datetime +import syslog + +syslog.openlog(os.path.basename(__file__), syslog.LOG_PID, syslog.LOG_LOCAL0) + +# boilerplate +basedir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) +sys.path = [ basedir ] + sys.path + +from ietf import settings +from django.core import management +management.setup_environ(settings) + + +from ietf.sync.rfceditor import * + +syslog.syslog("Updating RFC Editor queue states from %s" % QUEUE_URL) + +response = fetch_queue_xml(QUEUE_URL) +drafts, warnings = parse_queue(response) +for w in warnings: + syslog.syslog(u"WARNING: %s" % w) + +if len(drafts) < MIN_QUEUE_RESULTS: + syslog.syslog("Not enough results, only %s" % len(drafts)) + sys.exit(1) + +changed, warnings = update_drafts_from_queue(drafts) +for w in 
warnings: + syslog.syslog(u"WARNING: %s" % w) + +for c in changed: + syslog.syslog(u"Updated %s" % c) diff --git a/ietf/idrfc/idrfc_wrapper.py b/ietf/idrfc/idrfc_wrapper.py index 48e806d58..2c1a136c2 100644 --- a/ietf/idrfc/idrfc_wrapper.py +++ b/ietf/idrfc/idrfc_wrapper.py @@ -125,9 +125,9 @@ class IdWrapper: if settings.USE_DB_REDESIGN_PROXY_CLASSES: s = self._draft.get_state("draft-rfceditor") if s: - # extract possible extra states - tags = self._draft.tags.filter(slug__in=("iana-crd", "ref", "missref")) - return " ".join([s.name] + [t.slug.replace("-crd", "").upper() for t in tags]) + # extract possible extra annotations + tags = self._draft.tags.filter(slug__in=("iana", "ref")) + return "*".join([s.name] + [t.slug.upper() for t in tags]) else: return None diff --git a/ietf/idrfc/mirror_rfc_editor_queue.py b/ietf/idrfc/mirror_rfc_editor_queue.py deleted file mode 100644 index 0faf941cb..000000000 --- a/ietf/idrfc/mirror_rfc_editor_queue.py +++ /dev/null @@ -1,293 +0,0 @@ -# Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies). -# All rights reserved. Contact: Pasi Eronen -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided -# with the distribution. -# -# * Neither the name of the Nokia Corporation and/or its -# subsidiary(-ies) nor the names of its contributors may be used -# to endorse or promote products derived from this software -# without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from ietf import settings -from django.core import management -management.setup_environ(settings) -from django import db - -from xml.dom import pulldom, Node -import re -import urllib2 -from datetime import datetime -import socket -import sys - -QUEUE_URL = "http://www.rfc-editor.org/queue2.xml" -TABLE = "rfc_editor_queue_mirror" -REF_TABLE = "rfc_editor_queue_mirror_refs" - -log_data = "" -def log(line): - global log_data - if __name__ == '__main__' and len(sys.argv) > 1: - print line - else: - log_data += line + "\n" - -def parse(response): - def getChildText(parentNode, tagName): - for node in parentNode.childNodes: - if node.nodeType == Node.ELEMENT_NODE and node.localName == tagName: - return node.firstChild.data - return None - - events = pulldom.parse(response) - drafts = [] - refs = [] - for (event, node) in events: - if event == pulldom.START_ELEMENT and node.tagName == "entry": - events.expandNode(node) - node.normalize() - draft_name = getChildText(node, "draft").strip() - draft_name = re.sub("(-\d\d)?(.txt){1,2}$", "", draft_name) - date_received = getChildText(node, "date-received") - - states = [] - for child in node.childNodes: - if child.nodeType == Node.ELEMENT_NODE and 
child.localName == "state": - states.append(child.firstChild.data) - - has_refs = False - for child in node.childNodes: - if child.nodeType == Node.ELEMENT_NODE and child.localName == "normRef": - ref_name = getChildText(child, "ref-name") - ref_state = getChildText(child, "ref-state") - in_queue = ref_state.startswith("IN-QUEUE") - refs.append([draft_name, ref_name, in_queue, True]) - has_refs = True - if has_refs and not "MISSREF" in states: - states.append("REF") - - if len(states) == 0: - state = "?" - else: - state = " ".join(states) - drafts.append([draft_name, date_received, state, stream]) - - elif event == pulldom.START_ELEMENT and node.tagName == "section": - name = node.getAttribute('name') - if name.startswith("IETF"): - stream = 1 - elif name.startswith("IAB"): - stream = 2 - elif name.startswith("IRTF"): - stream = 3 - elif name.startswith("INDEPENDENT"): - stream = 4 - else: - stream = 0 - log("WARNING: unrecognized section "+name) - return (drafts, refs) - -# Find set of all normative references (whether direct or via some -# other normative reference) -def find_indirect_refs(drafts, refs): - result = [] - draft_names = set() - for draft in drafts: - draft_names.add(draft[0]) - - def recurse(draft_name, ref_set, level): - for (source, destination, in_queue, direct) in refs: - if source == draft_name: - if destination not in ref_set: - ref_set.add(destination) - recurse(destination, ref_set, level+1) - if level == 0: - # Remove self-reference - ref_set.remove(draft_name) - # Remove direct references - for (source, destination, in_queue, direct) in refs: - if source == draft_name: - if destination in ref_set: - ref_set.remove(destination) - # The rest are indirect references - for ref in ref_set: - if draft_name != ref: - result.append([draft_name, ref, ref in draft_names, False]) - - for draft_name in draft_names: - recurse(draft_name, set([draft_name]), 0) - return result - -# Convert filenames to id_document_tags -def find_document_ids(cursor, 
drafts, refs): - draft_ids = {} - drafts2 = [] - for draft in drafts: - cursor.execute("SELECT id_document_tag FROM internet_drafts WHERE filename=%s", [draft[0]]) - row = cursor.fetchone() - if not row: - log("WARNING: cannot find id for "+draft[0]) - else: - draft_ids[draft[0]] = row[0] - drafts2.append([row[0]]+draft[1:]) - refs2 = [] - for ref in refs: - if ref[0] in draft_ids: - refs2.append([draft_ids[ref[0]]]+ref[1:]) - return (drafts2, refs2) - -def parse_all(response): - log("parsing...") - (drafts, refs) = parse(response) - log("got "+ str(len(drafts)) + " drafts and "+str(len(refs))+" direct refs") - - indirect_refs = find_indirect_refs(drafts, refs) - log("found " + str(len(indirect_refs)) + " indirect refs") - refs.extend(indirect_refs) - del(indirect_refs) - - if settings.USE_DB_REDESIGN_PROXY_CLASSES: # note: return before id lookup - return (drafts, refs) - - # convert filenames to id_document_tags - log("connecting to database...") - cursor = db.connection.cursor() - log("finding id_document_tags...") - (drafts, refs) = find_document_ids(cursor, drafts, refs) - cursor.close() - return (drafts, refs) - -def insert_into_database(drafts, refs): - log("connecting to database...") - cursor = db.connection.cursor() - log("removing old data...") - cursor.execute("DELETE FROM "+TABLE) - cursor.execute("DELETE FROM "+REF_TABLE) - log("inserting new data...") - cursor.executemany("INSERT INTO "+TABLE+" (id_document_tag, date_received, state, stream) VALUES (%s, %s, %s, %s)", drafts) - cursor.execute("DELETE FROM "+REF_TABLE) - cursor.executemany("INSERT INTO "+REF_TABLE+" (source, destination, in_queue, direct) VALUES (%s, %s, %s, %s)", refs) - cursor.close() - db.connection._commit() - db.connection.close() - -import django.db.transaction - -def get_rfc_tag_mapping(): - """Return dict with RFC Editor state name -> DocTagName""" - from ietf.name.models import DocTagName - from ietf.name.utils import name - - return { - 'IANA': name(DocTagName, 'iana-crd', 
'IANA coordination', "RFC-Editor/IANA Registration Coordination"), - 'REF': name(DocTagName, 'ref', 'Holding for references', "Holding for normative reference"), - 'MISSREF': name(DocTagName, 'missref', 'Missing references', "Awaiting missing normative reference"), - } - -def get_rfc_state_mapping(): - """Return dict with RFC Editor state name -> State""" - from ietf.doc.models import State, StateType - t = StateType.objects.get(slug="draft-rfceditor") - return { - 'AUTH': State.objects.get_or_create(type=t, slug='auth', name='AUTH', desc="Awaiting author action")[0], - 'AUTH48': State.objects.get_or_create(type=t, slug='auth48', name="AUTH48", desc="Awaiting final author approval")[0], - 'AUTH48-DONE': State.objects.get_or_create(type=t, slug='auth48done', name="AUTH48-DONE", desc="Final approvals are complete")[0], - 'EDIT': State.objects.get_or_create(type=t, slug='edit', name='EDIT', desc="Approved by the stream manager (e.g., IESG, IAB, IRSG, ISE), awaiting processing and publishing")[0], - 'IANA': State.objects.get_or_create(type=t, slug='iana-crd', name='IANA', desc="RFC-Editor/IANA Registration Coordination")[0], - 'IESG': State.objects.get_or_create(type=t, slug='iesg', name='IESG', desc="Holding for IESG action")[0], - 'ISR': State.objects.get_or_create(type=t, slug='isr', name='ISR', desc="Independent Submission Review by the ISE ")[0], - 'ISR-AUTH': State.objects.get_or_create(type=t, slug='isr-auth', name='ISR-AUTH', desc="Independent Submission awaiting author update, or in discussion between author and ISE")[0], - 'REF': State.objects.get_or_create(type=t, slug='ref', name='REF', desc="Holding for normative reference")[0], - 'RFC-EDITOR': State.objects.get_or_create(type=t, slug='rfc-edit', name='RFC-EDITOR', desc="Awaiting final RFC Editor review before AUTH48")[0], - 'TO': State.objects.get_or_create(type=t, slug='timeout', name='TO', desc="Time-out period during which the IESG reviews document for conflict/concurrence with other IETF working group 
work")[0], - 'MISSREF': State.objects.get_or_create(type=t, slug='missref', name='MISSREF', desc="Awaiting missing normative reference")[0], - } - - -@django.db.transaction.commit_on_success -def insert_into_databaseREDESIGN(drafts, refs): - from ietf.doc.models import Document - from ietf.name.models import DocTagName - - tags = get_rfc_tag_mapping() - state_map = get_rfc_state_mapping() - - rfc_editor_tags = tags.values() - - log("removing old data...") - for d in Document.objects.filter(states__type="draft-rfceditor").distinct(): - d.tags.remove(*rfc_editor_tags) - d.unset_state("draft-rfceditor") - - log("inserting new data...") - - for name, date_received, state_info, stream_id in drafts: - try: - d = Document.objects.get(name=name) - except Document.DoesNotExist: - log("unknown document %s" % name) - continue - - state_list = state_info.split(" ") - if state_list: - state = state_list[0] - # For now, ignore the '*R...' that's appeared for some states. - # FIXME : see if we need to add some refinement for this. 
- if '*' in state: - state = state.split("*")[0] - # first is state - d.set_state(state_map[state]) - - # remainding are tags - for x in state_list[1:]: - d.tags.add(tags[x]) - -if settings.USE_DB_REDESIGN_PROXY_CLASSES: - insert_into_database = insert_into_databaseREDESIGN - - -if __name__ == '__main__': - try: - log("output from mirror_rfc_editor_queue.py:\n") - log("time: "+str(datetime.now())) - log("host: "+socket.gethostname()) - log("url: "+QUEUE_URL) - - log("downloading...") - socket.setdefaulttimeout(30) - response = urllib2.urlopen(QUEUE_URL) - - (drafts, refs) = parse_all(response) - if len(drafts) < 10 or len(refs) < 10: - raise Exception('not enough data') - - insert_into_database(drafts, refs) - - log("all done!") - if log_data.find("WARNING") < 0: - log_data = "" - finally: - if len(log_data) > 0: - print log_data diff --git a/ietf/idrfc/mirror_rfc_index.py b/ietf/idrfc/mirror_rfc_index.py deleted file mode 100644 index afb81cec0..000000000 --- a/ietf/idrfc/mirror_rfc_index.py +++ /dev/null @@ -1,365 +0,0 @@ -# Copyright (C) 2009-2010 Nokia Corporation and/or its subsidiary(-ies). -# All rights reserved. Contact: Pasi Eronen -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# -# * Redistributions in binary form must reproduce the above -# copyright notice, this list of conditions and the following -# disclaimer in the documentation and/or other materials provided -# with the distribution. -# -# * Neither the name of the Nokia Corporation and/or its -# subsidiary(-ies) nor the names of its contributors may be used -# to endorse or promote products derived from this software -# without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from ietf import settings -from django.core import management -management.setup_environ(settings) -from django import db - -from xml.dom import pulldom, Node -import re -import urllib2 -from datetime import datetime, date, timedelta -import socket -import sys - -INDEX_URL = "http://www.rfc-editor.org/rfc/rfc-index.xml" -TABLE = "rfc_index_mirror" - -log_data = "" -def log(line): - global log_data - if __name__ == '__main__' and len(sys.argv) > 1: - print line - else: - log_data += line + "\n" - -# python before 2.7 doesn't have the total_seconds method on datetime.timedelta. 
-def total_seconds(td): - return (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / 10**6 - -def parse(response): - def getChildText(parentNode, tagName): - for node in parentNode.childNodes: - if node.nodeType == Node.ELEMENT_NODE and node.localName == tagName: - return node.firstChild.data - return None - - def getDocList(parentNode, tagName): - l = [] - for u in parentNode.getElementsByTagName(tagName): - for d in u.getElementsByTagName("doc-id"): - l.append(d.firstChild.data) - if len(l) == 0: - return None - else: - return ",".join(l) - - also_list = {} - data = [] - events = pulldom.parse(response) - for (event, node) in events: - if event == pulldom.START_ELEMENT and node.tagName in ["bcp-entry", "fyi-entry", "std-entry"]: - events.expandNode(node) - node.normalize() - bcpid = getChildText(node, "doc-id") - doclist = getDocList(node, "is-also") - if doclist: - for docid in doclist.split(","): - if docid in also_list: - also_list[docid].append(bcpid) - else: - also_list[docid] = [bcpid] - - elif event == pulldom.START_ELEMENT and node.tagName == "rfc-entry": - events.expandNode(node) - node.normalize() - rfc_number = int(getChildText(node, "doc-id")[3:]) - title = getChildText(node, "title") - - l = [] - for author in node.getElementsByTagName("author"): - l.append(getChildText(author, "name")) - authors = "; ".join(l) - - d = node.getElementsByTagName("date")[0] - year = int(getChildText(d, "year")) - month = getChildText(d, "month") - month = ["January","February","March","April","May","June","July","August","September","October","November","December"].index(month)+1 - rfc_published_date = ("%d-%02d-01" % (year, month)) - - current_status = getChildText(node, "current-status").title() - - updates = getDocList(node, "updates") - updated_by = getDocList(node, "updated-by") - obsoletes = getDocList(node, "obsoletes") - obsoleted_by = getDocList(node, "obsoleted-by") - stream = getChildText(node, "stream") - wg = getChildText(node, "wg_acronym") - 
if wg and ((wg == "NON WORKING GROUP") or len(wg) > 15): - wg = None - - l = [] - for format in node.getElementsByTagName("format"): - l.append(getChildText(format, "file-format")) - file_formats = (",".join(l)).lower() - - draft = getChildText(node, "draft") - if draft and re.search("-\d\d$", draft): - draft = draft[0:-3] - - if len(node.getElementsByTagName("errata-url")) > 0: - has_errata = 1 - else: - has_errata = 0 - - data.append([rfc_number,title,authors,rfc_published_date,current_status,updates,updated_by,obsoletes,obsoleted_by,None,draft,has_errata,stream,wg,file_formats]) - - for d in data: - k = "RFC%04d" % d[0] - if k in also_list: - d[9] = ",".join(also_list[k]) - return data - -def insert_to_database(data): - log("connecting to database...") - cursor = db.connection.cursor() - log("removing old data...") - cursor.execute("DELETE FROM "+TABLE) - log("inserting new data...") - cursor.executemany("INSERT INTO "+TABLE+" (rfc_number, title, authors, rfc_published_date, current_status,updates,updated_by,obsoletes,obsoleted_by,also,draft,has_errata,stream,wg,file_formats) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)", data) - cursor.close() - db.connection._commit() - db.connection.close() - -def get_std_level_mapping(): - from ietf.name.models import StdLevelName - from ietf.name.utils import name - return { - "Standard": name(StdLevelName, "std", "Standard"), - "Draft Standard": name(StdLevelName, "ds", "Draft Standard"), - "Proposed Standard": name(StdLevelName, "ps", "Proposed Standard"), - "Informational": name(StdLevelName, "inf", "Informational"), - "Experimental": name(StdLevelName, "exp", "Experimental"), - "Best Current Practice": name(StdLevelName, "bcp", "Best Current Practice"), - "Historic": name(StdLevelName, "hist", "Historic"), - "Unknown": name(StdLevelName, "unkn", "Unknown"), - } - -def get_stream_mapping(): - from ietf.name.models import StreamName - from ietf.name.utils import name - - return { - "IETF": 
name(StreamName, "ietf", "IETF", desc="IETF stream", order=1), - "INDEPENDENT": name(StreamName, "ise", "ISE", desc="Independent Submission Editor stream", order=2), - "IRTF": name(StreamName, "irtf", "IRTF", desc="Independent Submission Editor stream", order=3), - "IAB": name(StreamName, "iab", "IAB", desc="IAB stream", order=4), - "Legacy": name(StreamName, "legacy", "Legacy", desc="Legacy stream", order=5), - } - - -import django.db.transaction - -@django.db.transaction.commit_on_success -def insert_to_databaseREDESIGN(data): - from ietf.person.models import Person - from ietf.doc.models import Document, DocAlias, DocEvent, RelatedDocument, State, save_document_in_history - from ietf.group.models import Group - from ietf.name.models import DocTagName, DocRelationshipName - from ietf.name.utils import name - - system = Person.objects.get(name="(System)") - std_level_mapping = get_std_level_mapping() - stream_mapping = get_stream_mapping() - tag_has_errata = name(DocTagName, 'errata', "Has errata") - relationship_obsoletes = name(DocRelationshipName, "obs", "Obsoletes") - relationship_updates = name(DocRelationshipName, "updates", "Updates") - - skip_older_than_date = (date.today() - timedelta(days=365)).strftime("%Y-%m-%d") - - log("updating data...") - for d in data: - rfc_number, title, authors, rfc_published_date, current_status, updates, updated_by, obsoletes, obsoleted_by, also, draft, has_errata, stream, wg, file_formats = d - - if rfc_published_date < skip_older_than_date: - # speed up the process by skipping old entries - continue - - # we assume two things can happen: we get a new RFC, or an - # attribute has been updated at the RFC Editor (RFC Editor - # attributes currently take precedence over our local - # attributes) - - # make sure we got the document and alias - created = False - doc = None - name = "rfc%s" % rfc_number - a = DocAlias.objects.filter(name=name) - if a: - doc = a[0].document - else: - if draft: - try: - doc = 
Document.objects.get(name=draft) - except Document.DoesNotExist: - pass - - if not doc: - created = True - log("created document %s" % name) - doc = Document.objects.create(name=name) - - # add alias - DocAlias.objects.create(name=name, document=doc) - if not created: - created = True - log("created alias %s to %s" % (name, doc.name)) - - - # check attributes - changed_attributes = {} - changed_states = [] - created_relations = [] - other_changes = False - if title != doc.title: - changed_attributes["title"] = title - - if std_level_mapping[current_status] != doc.std_level: - changed_attributes["std_level"] = std_level_mapping[current_status] - - if doc.get_state_slug() != "rfc": - changed_states.append(State.objects.get(type="draft", slug="rfc")) - - if doc.stream != stream_mapping[stream]: - changed_attributes["stream"] = stream_mapping[stream] - - if not doc.group and wg: - changed_attributes["group"] = Group.objects.get(acronym=wg) - - if not doc.latest_event(type="published_rfc"): - e = DocEvent(doc=doc, type="published_rfc") - pubdate = datetime.strptime(rfc_published_date, "%Y-%m-%d") - # unfortunately, pubdate doesn't include the correct day - # at the moment because the data only has month/year, so - # try to deduce it - synthesized = datetime.now() - if abs(pubdate - synthesized) > timedelta(days=60): - synthesized = pubdate - else: - direction = -1 if total_seconds(pubdate - synthesized) < 0 else +1 - while synthesized.month != pubdate.month or synthesized.year != pubdate.year: - synthesized += timedelta(days=direction) - e.time = synthesized - e.by = system - e.desc = "RFC published" - e.save() - other_changes = True - - if doc.get_state_slug("draft-iesg") == "rfcqueue": - changed_states.append(State.objects.get(type="draft-iesg", slug="pub")) - - def parse_relation_list(s): - if not s: - return [] - res = [] - for x in s.split(","): - if x[:3] in ("NIC", "IEN", "STD", "RTR"): - # try translating this to RFCs that we can handle - # sensibly; otherwise 
we'll have to ignore them - l = DocAlias.objects.filter(name__startswith="rfc", document__docalias__name=x.lower()) - else: - l = DocAlias.objects.filter(name=x.lower()) - - for a in l: - if a not in res: - res.append(a) - return res - - for x in parse_relation_list(obsoletes): - if not RelatedDocument.objects.filter(source=doc, target=x, relationship=relationship_obsoletes): - created_relations.append(RelatedDocument(source=doc, target=x, relationship=relationship_obsoletes)) - - for x in parse_relation_list(updates): - if not RelatedDocument.objects.filter(source=doc, target=x, relationship=relationship_updates): - created_relations.append(RelatedDocument(source=doc, target=x, relationship=relationship_updates)) - - if also: - for a in also.lower().split(","): - if not DocAlias.objects.filter(name=a): - DocAlias.objects.create(name=a, document=doc) - other_changes = True - - if has_errata: - if not doc.tags.filter(pk=tag_has_errata.pk): - changed_attributes["tags"] = list(doc.tags.all()) + [tag_has_errata] - else: - if doc.tags.filter(pk=tag_has_errata.pk): - changed_attributes["tags"] = set(doc.tags.all()) - set([tag_has_errata]) - - if changed_attributes or changed_states or created_relations or other_changes: - # apply changes - save_document_in_history(doc) - for k, v in changed_attributes.iteritems(): - setattr(doc, k, v) - - for s in changed_states: - doc.set_state(s) - - for o in created_relations: - o.save() - - doc.time = datetime.now() - doc.save() - - if not created: - log("%s changed" % name) - - -if settings.USE_DB_REDESIGN_PROXY_CLASSES: - insert_to_database = insert_to_databaseREDESIGN - -if __name__ == '__main__': - try: - log("output from mirror_rfc_index.py:\n") - log("time: "+str(datetime.now())) - log("host: "+socket.gethostname()) - log("url: "+INDEX_URL) - - log("downloading...") - socket.setdefaulttimeout(30) - response = urllib2.urlopen(INDEX_URL) - log("parsing...") - data = parse(response) - - log("got " + str(len(data)) + " entries") - 
if len(data) < 5000: - raise Exception('not enough data') - - insert_to_database(data) - - log("all done!") - log_data = "" - - finally: - if len(log_data) > 0: - print log_data diff --git a/ietf/settings.py b/ietf/settings.py index 4b6eefa8b..2e1e6b124 100644 --- a/ietf/settings.py +++ b/ietf/settings.py @@ -159,6 +159,7 @@ INSTALLED_APPS = ( 'ietf.ietfworkflows', 'ietf.wgchairs', 'ietf.wgcharter', + 'ietf.sync', 'ietf.community', ) diff --git a/ietf/sync/__init__.py b/ietf/sync/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/ietf/sync/models.py b/ietf/sync/models.py new file mode 100644 index 000000000..e69de29bb diff --git a/ietf/sync/rfceditor.py b/ietf/sync/rfceditor.py new file mode 100644 index 000000000..0dbc08ff4 --- /dev/null +++ b/ietf/sync/rfceditor.py @@ -0,0 +1,446 @@ +import re, urllib2, json, email, socket +from xml.dom import pulldom, Node + +from django.utils.http import urlquote + +from ietf.utils.mail import send_mail_text + +from ietf.doc.models import * +from ietf.person.models import * +from ietf.name.models import * +from ietf.doc.utils import add_state_change_event + +QUEUE_URL = "http://www.rfc-editor.org/queue2.xml" +INDEX_URL = "http://www.rfc-editor.org/rfc/rfc-index.xml" + +MIN_QUEUE_RESULTS = 10 +MIN_INDEX_RESULTS = 5000 + +# Python < 2.7 doesn't have the total_seconds method on datetime.timedelta. 
+def total_seconds(td): + return (td.microseconds + (td.seconds + td.days * 24 * 3600) * 10**6) / 10**6 + +def get_child_text(parent_node, tag_name): + for node in parent_node.childNodes: + if node.nodeType == Node.ELEMENT_NODE and node.localName == tag_name: + return node.firstChild.data + return None + + +def fetch_queue_xml(url): + socket.setdefaulttimeout(30) + return urllib2.urlopen(url) + +def parse_queue(response): + events = pulldom.parse(response) + drafts = [] + warnings = [] + + for event, node in events: + if event == pulldom.START_ELEMENT and node.tagName == "entry": + events.expandNode(node) + node.normalize() + draft_name = get_child_text(node, "draft").strip() + draft_name = re.sub("(-\d\d)?(.txt){1,2}$", "", draft_name) + date_received = get_child_text(node, "date-received") + + state = "" + tags = [] + missref_generation = "" + for child in node.childNodes: + if child.nodeType == Node.ELEMENT_NODE and child.localName == "state": + state = child.firstChild.data + # state has some extra annotations encoded, parse + # them out + if '*R' in state: + tags.append("ref") + state = state.replace("*R", "") + if '*A' in state: + tags.append("iana") + state = state.replace("*A", "") + m = re.search(r"\(([0-9]+)G\)", state) + if m: + missref_generation = m.group(1) + state = state.replace("(%sG)" % missref_generation, "") + + # AUTH48 link + auth48 = "" + for child in node.childNodes: + if child.nodeType == Node.ELEMENT_NODE and child.localName == "auth48-url": + auth48 = child.firstChild.data + + # cluster link (if it ever gets implemented) + cluster = "" + for child in node.childNodes: + if child.nodeType == Node.ELEMENT_NODE and child.localName == "cluster-url": + cluster = child.firstChild.data + + refs = [] + for child in node.childNodes: + if child.nodeType == Node.ELEMENT_NODE and child.localName == "normRef": + ref_name = get_child_text(child, "ref-name") + ref_state = get_child_text(child, "ref-state") + in_queue = ref_state.startswith("IN-QUEUE") + 
refs.append((ref_name, ref_state, in_queue)) + + drafts.append((draft_name, date_received, state, tags, missref_generation, stream, auth48, cluster, refs)) + + elif event == pulldom.START_ELEMENT and node.tagName == "section": + name = node.getAttribute('name') + if name.startswith("IETF"): + stream = "ietf" + elif name.startswith("IAB"): + stream = "iab" + elif name.startswith("IRTF"): + stream = "irtf" + elif name.startswith("INDEPENDENT"): + stream = "ise" + else: + stream = None + warnings.append("unrecognized section " + name) + + return drafts, warnings + +def update_drafts_from_queue(drafts): + tag_mapping = { + 'IANA': DocTagName.objects.get(slug='iana'), + 'REF': DocTagName.objects.get(slug='ref') + } + + slookup = dict((s.slug, s) + for s in State.objects.filter(type=StateType.objects.get(slug="draft-rfceditor"))) + state_mapping = { + 'AUTH': slookup['auth'], + 'AUTH48': slookup['auth48'], + 'AUTH48-DONE': slookup['auth48-done'], + 'EDIT': slookup['edit'], + 'IANA': slookup['iana'], + 'IESG': slookup['iesg'], + 'ISR': slookup['isr'], + 'ISR-AUTH': slookup['isr-auth'], + 'REF': slookup['ref'], + 'RFC-EDITOR': slookup['rfc-edit'], + 'TO': slookup['timeout'], + 'MISSREF': slookup['missref'], + } + + system = Person.objects.get(name="(System)") + + warnings = [] + + names = [t[0] for t in drafts] + + drafts_in_db = dict((d.name, d) + for d in Document.objects.filter(type="draft", docalias__name__in=names)) + + changed = set() + + for name, date_received, state, tags, missref_generation, stream, auth48, cluster, refs in drafts: + if name not in drafts_in_db: + warnings.append("unknown document %s" % name) + continue + + if not state or state not in state_mapping: + warnings.append("unknown state '%s'" % state) + continue + + d = drafts_in_db[name] + + prev_state = d.get_state("draft-rfceditor") + next_state = state_mapping[state] + + # check if we've noted it's been received + if d.get_state_slug("draft-iesg") == "ann" and not prev_state and not 
d.latest_event(DocEvent, type="rfc_editor_received_announcement"): + e = DocEvent(doc=d, by=system, type="rfc_editor_received_announcement") + e.desc = "Announcement was received by RFC Editor" + e.save() + send_mail_text(None, "iesg-secretary@ietf.org", None, + '%s in RFC Editor queue' % d.name, + 'The announcement for %s has been received by the RFC Editor.' % d.name) + + + if prev_state != next_state: + save_document_in_history(d) + + d.set_state(next_state) + + e = add_state_change_event(d, system, prev_state, next_state) + + if auth48: + e.desc = re.sub(r"(<b>.*</b>)", r'<a href="%s">\1</a>' % auth48, e.desc) + e.save() + + changed.add(name) + + t = DocTagName.objects.filter(slug__in=tags) + if set(t) != set(d.tags.all()): + d.tags = t + changed.add(name) + + + # remove tags and states for those not in the queue anymore + for d in Document.objects.exclude(docalias__name__in=names).filter(states__type="draft-rfceditor").distinct(): + d.tags.remove(*tag_mapping.values()) + d.unset_state("draft-rfceditor") + # we do not add a history entry here - most likely we already + # have something that explains what happened + changed.add(d.name) + + return changed, warnings + + +def fetch_index_xml(url): + socket.setdefaulttimeout(30) + return urllib2.urlopen(url) + +def parse_index(response): + def getDocList(parentNode, tagName): + l = [] + for u in parentNode.getElementsByTagName(tagName): + for d in u.getElementsByTagName("doc-id"): + l.append(d.firstChild.data) + return l + + also_list = {} + data = [] + events = pulldom.parse(response) + for event, node in events: + if event == pulldom.START_ELEMENT and node.tagName in ["bcp-entry", "fyi-entry", "std-entry"]: + events.expandNode(node) + node.normalize() + bcpid = get_child_text(node, "doc-id") + doclist = getDocList(node, "is-also") + for docid in doclist: + if docid in also_list: + also_list[docid].append(bcpid) + else: + also_list[docid] = [bcpid] + + elif event == pulldom.START_ELEMENT and node.tagName == "rfc-entry": + 
#skip_older_than_date = date.today() - timedelta(days=365)
def update_docs_from_rfc_index(data, skip_older_than_date=None):
    """Apply parsed RFC index data to the documents in the database.

    data is the list of tuples produced by parse_index().  Entries
    published before skip_older_than_date, if given, are skipped to
    speed up the periodic sync.  Returns a list of human-readable
    strings describing every change made.  RFC Editor attributes take
    precedence over locally stored attributes.
    """
    std_level_mapping = {
        "Standard": StdLevelName.objects.get(slug="std"),
        "Draft Standard": StdLevelName.objects.get(slug="ds"),
        "Proposed Standard": StdLevelName.objects.get(slug="ps"),
        "Informational": StdLevelName.objects.get(slug="inf"),
        "Experimental": StdLevelName.objects.get(slug="exp"),
        "Best Current Practice": StdLevelName.objects.get(slug="bcp"),
        "Historic": StdLevelName.objects.get(slug="hist"),
        "Unknown": StdLevelName.objects.get(slug="unkn"),
    }

    stream_mapping = {
        "IETF": StreamName.objects.get(slug="ietf"),
        "INDEPENDENT": StreamName.objects.get(slug="ise"),
        "IRTF": StreamName.objects.get(slug="irtf"),
        "IAB": StreamName.objects.get(slug="iab"),
        "Legacy": StreamName.objects.get(slug="legacy"),
    }

    tag_has_errata = DocTagName.objects.get(slug='errata')
    relationship_obsoletes = DocRelationshipName.objects.get(slug="obs")
    relationship_updates = DocRelationshipName.objects.get(slug="updates")

    system = Person.objects.get(name="(System)")

    results = []

    for rfc_number, title, authors, rfc_published_date, current_status, updates, updated_by, obsoletes, obsoleted_by, also, draft, has_errata, stream, wg, file_formats, pages, abstract in data:

        if skip_older_than_date and rfc_published_date < skip_older_than_date:
            # speed up the process by skipping old entries
            continue

        # we assume two things can happen: we get a new RFC, or an
        # attribute has been updated at the RFC Editor (RFC Editor
        # attributes take precedence over our local attributes)

        # make sure we got the document and alias
        created = False
        doc = None
        name = "rfc%s" % rfc_number
        a = DocAlias.objects.filter(name=name).select_related("document")
        if a:
            doc = a[0].document
        else:
            # no rfcNNNN alias yet; try to locate the underlying draft
            if draft:
                try:
                    doc = Document.objects.get(name=draft)
                except Document.DoesNotExist:
                    pass

            if not doc:
                results.append("created document %s" % name)
                doc = Document.objects.get_or_create(name=name)[0]

            # add alias
            DocAlias.objects.get_or_create(name=name, document=doc)
            results.append("created alias %s to %s" % (name, doc.name))
            created = True

        # check attributes
        changed_attributes = {}
        changed_states = []
        created_relations = []
        other_changes = False
        if title != doc.title:
            changed_attributes["title"] = title

        if abstract and abstract != doc.abstract:
            changed_attributes["abstract"] = abstract

        # FIX: parse_index() leaves pages == "" when the entry has no
        # ASCII format, and int("") raises ValueError
        if pages and int(pages) != doc.pages:
            changed_attributes["pages"] = int(pages)

        if std_level_mapping[current_status] != doc.std_level:
            changed_attributes["std_level"] = std_level_mapping[current_status]

        if doc.get_state_slug() != "rfc":
            changed_states.append(State.objects.get(type="draft", slug="rfc"))

        if doc.stream != stream_mapping[stream]:
            changed_attributes["stream"] = stream_mapping[stream]

        if not doc.group and wg:
            changed_attributes["group"] = Group.objects.get(acronym=wg)

        if not doc.latest_event(type="published_rfc"):
            e = DocEvent(doc=doc, type="published_rfc")
            # unfortunately, rfc_published_date doesn't include the correct day
            # at the moment because the data only has month/year, so
            # try to deduce it
            d = datetime.datetime.combine(rfc_published_date, datetime.time())
            synthesized = datetime.datetime.now()
            if abs(d - synthesized) > datetime.timedelta(days=60):
                # too far off to be "this month": trust the index date
                synthesized = d
            else:
                # walk day by day until we land inside the index's month
                direction = -1 if total_seconds(d - synthesized) < 0 else +1
                while synthesized.month != d.month or synthesized.year != d.year:
                    synthesized += datetime.timedelta(days=direction)

            e.time = synthesized
            e.by = system
            e.desc = "RFC published"
            e.save()
            other_changes = True

            results.append("Added RFC published event: %s" % e.time.strftime("%Y-%m-%d"))

        for t in ("draft-iesg", "draft-stream-iab", "draft-stream-irtf", "draft-stream-ise"):
            if doc.get_state_slug(t) != "pub":
                changed_states.append(State.objects.get(type=t, slug="pub"))

        def parse_relation_list(ids):
            # map index doc-ids to DocAlias objects, de-duplicated,
            # preserving order
            res = []
            for x in ids:
                if x[:3] in ("NIC", "IEN", "STD", "RTR"):
                    # try translating this to RFCs that we can handle
                    # sensibly; otherwise we'll have to ignore them
                    aliases = DocAlias.objects.filter(name__startswith="rfc", document__docalias__name=x.lower())
                else:
                    aliases = DocAlias.objects.filter(name=x.lower())

                for a in aliases:
                    if a not in res:
                        res.append(a)
            return res

        for x in parse_relation_list(obsoletes):
            if not RelatedDocument.objects.filter(source=doc, target=x, relationship=relationship_obsoletes):
                created_relations.append(RelatedDocument(source=doc, target=x, relationship=relationship_obsoletes))

        for x in parse_relation_list(updates):
            if not RelatedDocument.objects.filter(source=doc, target=x, relationship=relationship_updates):
                created_relations.append(RelatedDocument(source=doc, target=x, relationship=relationship_updates))

        # sub-series (BCP/FYI/STD) aliases pointing at this RFC
        if also:
            for a in also:
                a = a.lower()
                if not DocAlias.objects.filter(name=a):
                    DocAlias.objects.create(name=a, document=doc)
                    other_changes = True
                    results.append("Created alias %s to %s" % (a, doc.name))

        if has_errata:
            if not doc.tags.filter(pk=tag_has_errata.pk):
                changed_attributes["tags"] = list(doc.tags.all()) + [tag_has_errata]
        else:
            if doc.tags.filter(pk=tag_has_errata.pk):
                changed_attributes["tags"] = set(doc.tags.all()) - set([tag_has_errata])

        if changed_attributes or changed_states or created_relations or other_changes:
            # apply changes
            save_document_in_history(doc)
            for k, v in changed_attributes.iteritems():
                setattr(doc, k, v)
                results.append("Changed %s to %s on %s" % (k, v, doc.name))

            for s in changed_states:
                doc.set_state(s)
                results.append("Set state %s on %s" % (s, doc.name))

            for o in created_relations:
                o.save()
                results.append("Created %s" % o)

            doc.time = datetime.datetime.now()
            doc.save()

    return results