diff --git a/ietf/utils/rfcmarkup.py b/ietf/utils/rfcmarkup.py
new file mode 100644
index 000000000..4536900f9
--- /dev/null
+++ b/ietf/utils/rfcmarkup.py
@@ -0,0 +1,379 @@
+import re
+import cgi
+import urllib
+
+def markup(text, path=".", script="", extra=""):
+
+        # ------------------------------------------------------------------------
+        # Start of markup handling
+
+        # Convert \r which is not followed or preceded by a \n to \n
+        # (in case this is a mac document)
+        text = re.sub("([^\n])\r([^\n])", "\g<1>\n\g<2>", text)
+        # Strip \r (in case this is a ms format document):
+        text = text.replace("\r","")
+
+        # -------------
+        # Normalization
+
+        # Remove whitespace at the end of lines
+        text = re.sub("[\t ]+\n", "\n", text)
+
+        # Remove whitespace (including formfeeds) at the end of the document.
+        # (Trailing formfeeds will result in trailing blank pages.)
+        text = re.sub("[\t \r\n\f]+$", "\n", text)
+
+        text = text.expandtabs()
+
+        # Remove extra blank lines at the start of the document
+        text = re.sub("^\n*", "", text, 1)
+
+        # Fix up page breaks:
+        # \f should aways be preceeded and followed by \n
+        text = re.sub("([^\n])\f", "\g<1>\n\f", text)
+        text = re.sub("\f([^\n])", "\f\n\g<1>", text)
+
+        # [Page nn] should be followed by \n\f\n
+        text = re.sub("(?i)(\[Page [0-9ivxlc]+\])[\n\f\t ]*(\n *[^\n\f\t ])", "\g<1>\n\f\g<2>", text)
+
+        # Normalize indentation
+        linestarts = re.findall("(?m)^([ ]*)\S", text);
+        prefixlen = 72
+        for start in linestarts:
+            if len(start) < prefixlen:
+                prefixlen = len(start)
+        if prefixlen:
+            text = re.sub("\n"+(" "*prefixlen), "\n", text)
+
+        # reference name tag markup
+        reference = {}
+        ref_url = {}
+
+        ## Locate the start of the References section as the first reference
+        ## definition after the last reference usage
+        ## Incomplete 05 Aug 2010 17:05:27 XXXX Complete this!!
+
+        ##ref_usages = re.findall("(\W)(\[)([-\w.]+)((, ?[-\w.]+)*\])", text)
+        ref_defs = re.findall("(?sm)^( *\n *)\[([-\w.]+?)\]( +)(.*?)(\n *)$", text)
+
+        ##ref_pos = [ match.start() for match in ref_usages ]
+        ##def_pos = [ match.start() for match in ref_defs ]
+        ##ref_pos = [ pos for pos in ref_pos if not pos in ref_defs ]
+        ##last_ref_pos = ref_pos[-1] if ref_pos else None
+
+        #sys.stderr.write("ref_defs: %s\n" % repr(ref_defs))
+        for tuple in ref_defs:
+            title_match = re.search("(?sm)^(.*?(\"[^\"]+?\").+?|.*?(,[^,]+?,)[^,]+?)$", tuple[3])
+            if title_match:
+                reftitle = title_match.group(2) or title_match.group(3).strip("[ ,]+")
+                # Get rid of page break information inside the title
+                reftitle = re.sub("(?s)\n\n\S+.*\n\n", "", reftitle)
+                reftitle = cgi.escape(reftitle, quote=True)
+                reftitle = re.sub("[\n\t ]+", " ", reftitle) # Remove newlines and tabs
+                reference[tuple[1]] = reftitle
+            url_match = re.search(r"(http|https|ftp)://\S+", tuple[3])
+            if url_match:
+                ref_url[tuple[1]] = url_match.group(0)
+
+        # -------------
+        # escape any html significant characters
+        text = cgi.escape(text);
+
+
+        # -------------
+        # Adding markup
+
+        text = "
"+text+"
" + + # Typewriter-style underline: + text = re.sub("_[\b](.)", "\g<1>", text) + + # Line number markup goes here + + + # Obsoletes: ... markup + + def rfclist_replace(keyword, text): + def replacement(match): + group = list(match.groups("")) + group[3] = re.sub("\d+", """\">\g<0>""" % (script, extra), group[3]) + if group[8]: + group[8] = re.sub("\d+", """\">\g<0>""" % (script, extra), group[8]) + else: + group[8] = "" + return "\n%s%s%s\n%s%s" % (group[0], group[3], group[5], group[7], group[8]) + text = re.sub("\n(%s( RFCs| RFC)?: ?( RFCs| RFC)?)(( \d+,| \d+)+)(.*)\n(( *)((\d+, )*(\d+)))*" % keyword, replacement, text, 1) + return text + + text = rfclist_replace("Obsoletes", text) + text = rfclist_replace("Updates", text) + + lines = text.splitlines(True) + head = "".join(lines[:28]) + rest = "".join(lines[28:]) + + # title markup + head = re.sub("""(?im)(([12][0-9][0-9][0-9]|^Obsoletes.*|^Category: (Standards Track|Informational|Experimental|Best Current Practice)) *\n\n+ +)([A-Z][^\n]+)$""", """\g<1>\g<4>""", head, 1) + head = re.sub("""(?i)()(\n +)([^<\n]+)\n""", """\g<1>\g<2>\g<3>\n""", head, 1) + head = re.sub("""(?i)()(\n +)([^<\n]+)\n""", """\g<1>\g<2>\g<3>\n""", head, 1) + + text = head + rest + + # http link markup + # link crossing a line. Not permitting ":" after the line break will + # result in some URLs broken across lines not being recognized, but + # will on the other hand correctly handle a series of URL listed line + # by line, one on each line. + # Link crossing a line, where the continuation contains '.' or '/' + text = re.sub("(?im)(\s|^|[^=]\"|\()((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?)(\n +)([A-Za-z0-9_./@%&?#~=-]+[./][A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])([.,)\"\s]|$)", + "\g<1>\g<6>\">\g<2>\g<5>\g<6>\">\g<6>\g<7>", text) + text = re.sub("(?im)(<)((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?)(\n +)([A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])(>)", + "\g<1>\g<6>\">\g<2>\g<5>\g<6>\">\g<6>\g<7>", text) + # Link crossing a line, where first line ends in '-' or '/' + text = re.sub("(?im)(\s|^|[^=]\"|\()((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?[-/])(\n +)([A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])([.,)\"\s]|$)", + "\g<1>\g<6>\">\g<2>\g<5>\g<6>\">\g<6>\g<7>", text) + text = re.sub("(?im)(<)((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?)(\n +)([A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])(>)", + "\g<1>\g<6>\">\g<2>\g<5>\g<6>\">\g<6>\g<7>", text) + # link crossing a line, enclosed in "<" ... ">" + text = re.sub("(?im)<((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?)(\n +)([A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])>", + "<\g<1>\g<5>\">\g<1>\g<4>\g<5>\">\g<5>>", text) + text = re.sub("(?im)(<)((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?)(\n +)([A-Za-z0-9_./@%&;?#~=-]+[A-Za-z0-9_/@%&;?#~=-])(>)", + "\g<1>\g<6>\">\g<2>\g<5>\g<6>\">\g<6>\g<7>", text) + # link crossing two lines, enclosed in "<" ... 
">" + text = re.sub("(?im)<((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?)(\n +)([A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])(\n +)([A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])>", + "<\g<1>\g<5>\g<7>\">\g<1>\g<4>\g<5>\g<7>\">\g<5>\g<6>\g<5>\g<7>\">\g<7>>", text) + text = re.sub("(?im)(<)((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?)(\n +)([A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])(\n +)([A-Za-z0-9_./@%&;?#~=-]+[A-Za-z0-9_/@%&;?#~=-])(>)", + "\g<1>\g<6>\g<8>\">\g<2>\g<5>\g<6>\g<8>\">\g<6>\g<7>\g<6>\g<8>\">\g<8>\g<9>", text) + # link on a single line + text = re.sub("(?im)(\s|^|[^=]\"|<|\()((http|https|ftp)://[:A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])([.,)\"\s]|>|$)", + "\g<1>\">\g<2>\g<4>", text) +# # Special case for licensing boilerplate +# text = text.replace('http://trustee.ietf.org/\n license-info', +# 'http://trustee.ietf.org/\n licence-info') + + # undo markup if RFC2606 domain + text = re.sub("""(?i)(.*?)""", "\g<5>", text) + + # draft markup + # draft name crossing line break + text = re.sub("([^/#=\?\w-])(draft-([-a-zA-Z0-9]+-)?)(\n +)([-a-zA-Z0-9]+[a-zA-Z0-9](.txt)?)", + "\g<1>\g<5>\">\g<2>\g<4>\g<5>\">\g<5>" % (script, extra, script, extra), text) + # draft name on one line (but don't mess with what we just did above) + text = re.sub("([^/#=\?\w>=-])(draft-[-a-zA-Z0-9]+[a-zA-Z0-9](.txt)?)", + "\g<1>\">\g<2>" % (script, extra), text) + + # rfc markup + # rfc and number on the same line + text = re.sub("""(?i)([^[/\w-])(rfc([- ]?))([0-9]+)(\W)""", + """\g<1>\">\g<2>\g<4>\g<5>""" % (script, extra), text) + # rfc and number on separate lines + text = re.sub("(?i)([^[/\w-])(rfc([-]?))(\n +)([0-9]+)(\W)", + "\g<1>\">\g<2>\g<4>\">\g<5>\g<6>" % (script, extra, script, extra), text) + # spelled out Request For Comments markup + text = re.sub("(?i)(\s)(Request\s+For\s+Comments\s+\([^)]+\)\s+)([0-9]+)", + "\g<1>\g<2>\">\g<3>" % (script, extra), text) + # bcp markup + text = re.sub("(?i)([^[/\w-])(bcp([- ]?))([0-9]+)(\W)", + "\g<1>\">\g<2>\g<4>\g<5>" % (script, extra), text) + text = re.sub("(?i)([^[/\w-])(bcp([-]?))(\n +)([0-9]+)(\W)", + "\g<1>\">\g<2>\g<4>\">\g<5>\g<6>" % (script, extra, script, extra), text) + + def workinprogress_replacement(match): + g1 = match.group(1) + g2 = match.group(2) + g3 = match.group(3) + # eliminate embedded hyperlinks in text we'll use as anchor text + g4 = match.group(4) + g4 = re.sub("(.+?)", "\g<1>", g4) + g4url = urllib.quote_plus(g4) + g5 = match.group(5) + return """%s[%s]%s%s%s""" % (g1, g2, g2, g3, g4url, g4, g5) + + text = re.sub("(\n *\n *)\[([-\w.]+)\](\s+.*?)(\".+\")(,\s+Work\s+in\s+Progress.)", workinprogress_replacement, text) + text = re.sub("(\n *\n *)\[([-\w.]+)\](\s)", "\g<1>[\">\g<2>]\g<3>", text) + + text = re.sub("(\n *\n *)\[(RFC [-\w.]+)\](\s)", "\g<1>[\">\g<2>]\g<3>", text) + + ref_targets = re.findall('%s%s""" % (pre, beg, script, extra, rfcnum, reference[tag], tag, end) + else: + return """%s%s%s%s""" % (pre, beg, script, extra, rfcnum , tag, end) + else: + if tag in ref_targets: + if tag in reference: + return """%s%s%s%s""" % (pre, beg, tag, reference[tag], tag, end) + else: + return """%s%s%s%s""" % (pre, beg, tag, tag, end) + else: + return match.group(0) + + # Group: 1 2 3 45 + text = re.sub("(\W)(\[)([-\w.]+)((, ?[-\w.]+)*\])", reference_replacement, text) + text = re.sub("(\W)(\[)(RFC [0-9]+)((, ?RFC [0-9]+)*\])", reference_replacement, text) + while True: + old = text + text = re.sub("(\W)(\[(?:.*?, ?)+)([-\w.]+)((, ?[-\w.]+)*\])", reference_replacement, text) + if text == old: + break + while True: + old = text 
+ text = re.sub("(\W)(\[(?:.*?, ?)+)(RFC [-\w.]+)((, ?RFC [-\w.]+)*\])", reference_replacement, text) + if text == old: + break + + # greying out the page headers and footers + text = re.sub("\n(.+\[Page \w+\])\n\f\n(.+)\n", """\n\g<1>\n\f\n\g<2>\n""", text) + + # contents link markup: section links + # 1 2 3 4 5 6 7 + text = re.sub("(?m)^(\s*)(\d+(\.\d+)*)(\.?[ ]+)(.*[^ .])( *\. ?\.)(.*[0-9])$", """\g<1>\g<2>\g<4>\g<5>\g<6>\g<7>""", text) + text = re.sub("(?m)^(\s*)(Appendix |)([A-Z](\.\d+)*)(\.?[ ]+)(.*[^ .])( *\. ?\.)(.*[0-9])$", """\g<1>\g<2>\g<3>\g<5>\g<6>\g<7>\g<8>""", text) + + # page number markup + multidoc_separator = "========================================================================" + if re.search(multidoc_separator, text): + parts = re.split(multidoc_separator, text) + for i in range(len(parts)): + parts[i] = re.sub("(?si)(\f)([^\f]*\[Page (\w+)\])", "\g<1>
\" class=\"anchor-target\">
\g<2>"%{"page": "page-%s"%(i+1)}, parts[i]) + parts[i] = re.sub("(?i)(\. ?\. +|\. \. \.|\.\.\. *)([0-9ivxlc]+)( *\n)", "\g<1>\">\g<2>\g<3>"%{"page": "page-%s"%(i+1)}, parts[i]) + text = multidoc_separator.join(parts) + else: + # page name tag markup + text = re.sub("(?si)(\f)([^\f]*\[Page (\w+)\])", "\g<1>
\" class=\"anchor-target\">
\g<2>", text) + # contents link markup: page numbers + text = re.sub("(?i)(\. ?\. +|\. \. \.|\.\.\. *)([0-9ivxlc]+)( *\n)", "\g<1>\">\g<2>\g<3>", text) + + # section number tag markup + def section_anchor_replacement(match): + # exclude TOC entries + mstring = match.group(0) + if " \. \. " in mstring or "\.\.\." in mstring: + return mstring + + level = len(re.findall("[^\.]+", match.group(1)))+1 + if level > 6: + level = 6 + html = """%s%s""" % (level, match.group(1), match.group(1), match.group(1), match.group(3)) + html = html.replace("\n", """
\n""" % level) + return html + + + text = re.sub("(?im)^(\d+(\.\d+)*)(\.?[ ]+\S.*?(\n +\w+.*)?( |$))", section_anchor_replacement, text) + #text = re.sub("(?i)(\n *\n *)(\d+(\.\d+)*)(\.?[ ].*)", section_replacement, text) + # section number link markup + text = re.sub("(?i)(section\s)(\d+(\.\d+)*)", "\">\g<1>\g<2>", text) + text = re.sub("(?i)(section)\n(\s+)(\d+(\.\d+)*)", "\">\g<1>\n\g<2>\">\g<3>", text) + + # Special cases for licensing boilerplate + text = text.replace('Section 4.e of the Trust Legal Provisions', + 'Section 4.e of the Trust Legal Provisions') + + while True: + old = text + text = re.sub("(?i)(sections\s(.*?(,\s|\s?-\s?|\sthrough\s|\sor\s|\sto\s|,?\sand\s))*)(\d+(\.\d+)*)", "\g<1>\">\g<4>", text) + if text == old: + break + + # appendix number tag markup + def appendix_replacement(match): + # exclude TOC entries + mstring = match.group(0) + if " \. \. " in mstring or "\.\.\." in mstring: + return mstring + + txt = match.group(4) + num = match.group(2).rstrip('.') + if num != match.group(2): + txt = "." + txt + level = len(re.findall("[^\.]+", num))+1 + if level > 6: + level = 6 + return """%s%s%s""" % (level, num, num, match.group(1), num, txt) + + text = re.sub("(?m)^(Appendix |)([A-Z](\.|\.\d+)+)(\.?[ ].*)$", appendix_replacement, text) + #text = re.sub("(?i)(\n *\n *)(\d+(\.\d+)*)(\.?[ ].*)", appendix_replacement, text) + # appendix number link markup + text = re.sub(" ([Aa]ppendix\s)([A-Z](\.\d+)*)", " \">\g<1>\g<2>", text) + text = re.sub(" ([Aa]ppendix)\n(\s+)([A-Z](\.\d+)*)", " \">\g<1>\n\g<2>\">\g<3>", text) + +# # section x of draft-y markup +# text = re.sub("(?i)(section)\s(\d+(\.\d+)*)(\.?\s+(of|in)\s+)(draft-[-.a-zA-Z0-9]+[a-zA-Z0-9])", ".txt#section-\g<2>\">\g<1> \g<2>\g<4>\g<6>\g<7>" % (script, extra, rfcs), text) +# # draft-y, section x markup +# text = re.sub("(?i)(draft-[-.a-zA-Z0-9]+[a-zA-Z0-9])(,?\s)(section)\s(\d+(\.\d+)*)", ".txt#section-\g<5>\">\g<1>\g<2>\g<3>\g<4> \g<5>" % (script, extra, rfcs), text) +# # [draft-y], section x markup +# text = re.sub("(?i)\[\"]+\">(draft-[-.a-zA-Z0-9]+[a-zA-Z0-9])\](,?\s)\"]*\">(section)\s(\d+(\.\d+)*)", ".txt#section-\g<5>\">[\g<1>\g<2>]\g<3>\g<4> \g<5>" % (script, extra, rfcs), text) + + for n in ['rfc', 'bcp', 'fyi', 'std']: + # section x of rfc y markup + text = re.sub("(?i)]*>(section)\s(\d+(\.\d+)*)(\.?\s+(of|in)\s+)]*>(%s[- ]?)([0-9]+)"%n, + "\g<1>\">\g<2> \g<3>\g<5>\g<8>\g<9>" % (script, extra, n), text) + text = re.sub("(?i)]*>(section)(\n\s+)]*>(\d+(\.\d+)*)(\.?\s+(of|in)\s+)]*>(%s[- ]?)([0-9]+)"%n, + "\g<1>\">\g<2>\g<3>\g<1>\">\g<4>\g<5>\g<9>\g<10>" % (script, extra, n, script, extra, n), text) + # appendix x of rfc y markup + text = re.sub("(?i)]*>(appendix)\s([A-Z](\.\d+)*)(\.?\s+(of|in)\s+)]*>(%s[- ]?)([0-9]+)"%n, + "\g<1>\">\g<2> \g<3>\g<5>\g<8>\g<9>" % (script, extra, n), text) + text = re.sub("(?i)]*>(appendix)(\n\s+)]*>([A-Z]+(\.\d+)*)(\.?\s+(of|in)\s+)]*>(%s[- ]?)([0-9]+)"%n, + "\g<1>\">\g<2>\g<3>\g<1>\">\g<4>\g<5>\g<9>\g<10>" % (script, extra, n, script, extra, n), text) + + # rfc y, section x markup + text = re.sub("(?i)]*>(%s[- ]?)([0-9]+)(,?\s+)]*>(section)\s?(([^<]*))"%n, + "\g<5>\">\g<2>\g<3>\g<4>\g<6> \g<7>" % (script, extra, n), text) + # rfc y, appendix x markup + text = re.sub("(?i)]*>(%s[- ]?)([0-9]+)(,?\s+)]*>(appendix)\s?(([^<]*))"%n, + "\g<5>\">\g<2>\g<3>\g<4>\g<6> \g<7>" % (script, extra, n), text) + + # section x of? 
[rfc y] markup + text = re.sub("(?i)]*>(section)\s(\d+(\.\d+)*)(\.?\s+(of\s+|in\s+)?)\[]*>(%s[- ]?)([0-9]+)\]"%n, + "\g<1>\">\g<2> \g<3>\g<5>[\g<8>\g<9>]" % (script, extra, n), text) + text = re.sub("(?i)]*>(section)(\n\s+)]*>(\d+(\.\d+)*)(\.?\s+(of\s+|in\s+)?)\[]*>(%s[- ]?)([0-9]+)\]"%n, + "\g<1>\">\g<2>\g<3>\g<1>\">\g<4>\g<6>[\g<9>\g<10>]" % (script, extra, n, script, extra, n), text) + # appendix x of? [rfc y] markup + text = re.sub("(?i)]*>(appendix)\s([A-Z](\.\d+)*)(\.?\s+(of\s+|in\s+)?)\[]*>(%s[- ]?)([0-9]+)\]"%n, + "\g<1>\">\g<2> \g<3>\g<5>[\g<8>\g<9>]" % (script, extra, n), text) + text = re.sub("(?i)]*>(appendix)(\n\s+)]*>([A-Z](\.\d+)*)(\.?\s+(of\s+|in\s+)?)\[]*>(%s[- ]?)([0-9]+)\]"%n, + "\g<1>\">\g<2>\g<3>\g<1>\">\g<4>\g<6>[\g<9>\g<10>]" % (script, extra, n, script, extra, n), text) + + # [rfc y], section x markup + text = re.sub("(?i)\[\"]+)\"[^>]*>(%s[- ]?)([0-9]+)\](,?\s+)\"]*)\"[^>]*>(section)\s(\d+(\.\d+)*)"%n, + "\g<5>\">[\g<2>\g<3>]\g<4>\g<6> \g<7>" % (script, extra, n), text) + # [rfc y], appendix x markup + text = re.sub("(?i)\[\"]+)\"[^>]*>(%s[- ]?)([0-9]+)\](,?\s+)\"]*)\"[^>]*>(appendix)\s([A-Z](\.\d+)*)"%n, + "\g<5>\">[\g<2>\g<3>]\g<4>\g<6> \g<7>" % (script, extra, n), text) + + + # remove section link for section x.x (of|in) + old = text + text = re.sub("(?i)]*>(section\s)(\d+(\.\d+)*)(\.?[a-z]*\s+(of|in)\s+)(\[?)]*)>(.*)(\]?)", + '\g<1>\g<2>\g<4>\g<6>>\g<9>\g<10>', text) + text = re.sub('(?i)(\[?)]*)>(.*?)(\]?,\s+)]*>(section\s)(\d+(\.\d+)*)', + '\g<1>>\g<4>\g<5>\g<6>\g<7>', text) + + # Special fix for referring to the trust legal provisons in + # boilerplate text: + text = re.sub("(?i)]*>(section\s)(\d+(\.\d+)*)(\.?[a-z]*\s+(of|in)\s*\n\s*the Trust Legal Provisions)", + '\g<1>\g<2>\g<4>', text) + + # + #text = re.sub("\f", "
", text) + text = re.sub("\n?\f\n?", "\n
", text)
+
+        # restore indentation
+        if prefixlen:
+            text = re.sub("\n", "\n"+(" "*prefixlen), text)
+
+        if path:
+            text = re.sub("%s\?(rfc|bcp|std)=" % script, "%s/\g<1>" % path, text)
+            text = re.sub("%s\?draft=" % script, "%s/" % path, text)
+
+        return text
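
Usage note (not part of the patch): a minimal sketch of how the new helper might be
called, assuming it is given the plain text of an RFC or draft. The file name and the
"path" value below are illustrative only. markup() returns an HTML fragment, and when
"path" is set the generated "script?rfc=NNN" / "script?draft=..." style links are
rewritten into "path/rfcNNN" / "path/draft-..." style URLs (per the final re.sub
calls above).

    # Hypothetical usage sketch -- not part of the committed module.
    from ietf.utils.rfcmarkup import markup

    with open("rfc2119.txt") as f:      # illustrative input file
        rfc_text = f.read()

    # "path" makes generated links path-relative (e.g. /doc/html/rfc2119);
    # "script" and "extra" are left at their defaults here.
    html_fragment = markup(rfc_text, path="/doc/html", script="", extra="")

    print(html_fragment[:200])          # prints the start of the marked-up HTML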