diff --git a/ietf/utils/rfcmarkup.py b/ietf/utils/rfcmarkup.py new file mode 100644 index 000000000..4536900f9 --- /dev/null +++ b/ietf/utils/rfcmarkup.py @@ -0,0 +1,379 @@ +import re +import cgi +import urllib + +def markup(text, path=".", script="", extra=""): + + # ------------------------------------------------------------------------ + # Start of markup handling + # markup() takes the plain text of a document in 'text' and returns it with markup added. + # 'script' and 'extra' are interpolated into the link templates further down; 'path', when non-empty, is used by the final step to turn "script?rfc=" style link prefixes into path-based ones. + # NOTE(review): in this copy several replacement templates contain no %s placeholder although '% (script, extra)' is applied to them, and the re.sub call inside workinprogress_replacement is cut off; the markup inside those literals looks to have been lost -- confirm against the repository version before relying on this text. + + # Convert \r which is not followed or preceded by a \n to \n + # (in case this is a Mac document) + text = re.sub("([^\n])\r([^\n])", "\g<1>\n\g<2>", text) + # Strip \r (in case this is an MS-format document): + text = text.replace("\r","") + + # ------------- + # Normalization + + # Remove whitespace at the end of lines + text = re.sub("[\t ]+\n", "\n", text) + + # Remove whitespace (including formfeeds) at the end of the document. + # (Trailing formfeeds will result in trailing blank pages.) + text = re.sub("[\t \r\n\f]+$", "\n", text) + + text = text.expandtabs() + + # Remove extra blank lines at the start of the document + text = re.sub("^\n*", "", text, 1) + + # Fix up page breaks: + # \f should always be preceded and followed by \n + text = re.sub("([^\n])\f", "\g<1>\n\f", text) + text = re.sub("\f([^\n])", "\f\n\g<1>", text) + + # [Page nn] should be followed by \n\f\n + text = re.sub("(?i)(\[Page [0-9ivxlc]+\])[\n\f\t ]*(\n *[^\n\f\t ])", "\g<1>\n\f\g<2>", text) + + # Normalize indentation + # (prefixlen ends up as the shortest run of leading spaces found in front of a non-whitespace character, capped at 72; that many spaces are removed after each newline here and added back near the end of the function) + linestarts = re.findall("(?m)^([ ]*)\S", text); + prefixlen = 72 + for start in linestarts: + if len(start) < prefixlen: + prefixlen = len(start) + if prefixlen: + text = re.sub("\n"+(" "*prefixlen), "\n", text) + + # reference name tag markup + # (reference: tag -> escaped title text; ref_url: tag -> first http/https/ftp URL found in the definition text) + reference = {} + ref_url = {} + + ## Locate the start of the References section as the first reference + ## definition after the last reference usage + ## Incomplete 05 Aug 2010 17:05:27 XXXX Complete this!! 
+ + ##ref_usages = re.findall("(\W)(\[)([-\w.]+)((, ?[-\w.]+)*\])", text) + ref_defs = re.findall("(?sm)^( *\n *)\[([-\w.]+?)\]( +)(.*?)(\n *)$", text) + + ##ref_pos = [ match.start() for match in ref_usages ] + ##def_pos = [ match.start() for match in ref_defs ] + ##ref_pos = [ pos for pos in ref_pos if not pos in ref_defs ] + ##last_ref_pos = ref_pos[-1] if ref_pos else None + + #sys.stderr.write("ref_defs: %s\n" % repr(ref_defs)) + # Each ref_defs entry is a tuple of the regex groups above: index 1 is the reference tag, index 3 the definition text. + # NOTE(review): the loop variable 'tuple' shadows the builtin of the same name inside this function. + for tuple in ref_defs: + # The title is taken from a double-quoted string if one matches, otherwise from a comma-delimited field + title_match = re.search("(?sm)^(.*?(\"[^\"]+?\").+?|.*?(,[^,]+?,)[^,]+?)$", tuple[3]) + if title_match: + reftitle = title_match.group(2) or title_match.group(3).strip("[ ,]+") + # Get rid of page break information inside the title + reftitle = re.sub("(?s)\n\n\S+.*\n\n", "", reftitle) + reftitle = cgi.escape(reftitle, quote=True) + reftitle = re.sub("[\n\t ]+", " ", reftitle) # Collapse runs of newlines, tabs and spaces into one space + reference[tuple[1]] = reftitle + url_match = re.search(r"(http|https|ftp)://\S+", tuple[3]) + if url_match: + ref_url[tuple[1]] = url_match.group(0) + + # ------------- + # escape any html significant characters + text = cgi.escape(text); + + + # ------------- + # Adding markup + + text = "
"+text+"" + + # Typewriter-style underline: drop each underscore+backspace pair and keep the character after it + text = re.sub("_[\b](.)", "\g<1>", text) + + # Line number markup goes here + + + # Obsoletes: ... markup + # (rfclist_replace rewrites only the first matching "keyword:" line, since count=1 is passed to re.sub; the per-number substitution is applied to the number list and to the numbers on an optional following line) + + def rfclist_replace(keyword, text): + def replacement(match): + group = list(match.groups("")) + group[3] = re.sub("\d+", """\">\g<0>""" % (script, extra), group[3]) + if group[8]: + group[8] = re.sub("\d+", """\">\g<0>""" % (script, extra), group[8]) + else: + group[8] = "" + return "\n%s%s%s\n%s%s" % (group[0], group[3], group[5], group[7], group[8]) + text = re.sub("\n(%s( RFCs| RFC)?: ?( RFCs| RFC)?)(( \d+,| \d+)+)(.*)\n(( *)((\d+, )*(\d+)))*" % keyword, replacement, text, 1) + return text + + text = rfclist_replace("Obsoletes", text) + text = rfclist_replace("Updates", text) + + lines = text.splitlines(True) + head = "".join(lines[:28]) + rest = "".join(lines[28:]) + + # title markup (only the first 28 lines, held in 'head', are searched; 'rest' is re-attached unchanged) + head = re.sub("""(?im)(([12][0-9][0-9][0-9]|^Obsoletes.*|^Category: (Standards Track|Informational|Experimental|Best Current Practice)) *\n\n+ +)([A-Z][^\n]+)$""", """\g<1>\g<4>""", head, 1) + head = re.sub("""(?i)()(\n +)([^<\n]+)\n""", """\g<1>\g<2>\g<3>\n""", head, 1) + head = re.sub("""(?i)()(\n +)([^<\n]+)\n""", """\g<1>\g<2>\g<3>\n""", head, 1) + + text = head + rest + + # http link markup + # link crossing a line. Not permitting ":" after the line break will + # result in some URLs broken across lines not being recognized, but + # will on the other hand correctly handle a series of URLs listed line + # by line, one on each line. + # Link crossing a line, where the continuation contains '.' 
or '/' + text = re.sub("(?im)(\s|^|[^=]\"|\()((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?)(\n +)([A-Za-z0-9_./@%&?#~=-]+[./][A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])([.,)\"\s]|$)", + "\g<1>\g<6>\">\g<2>\g<5>\g<6>\">\g<6>\g<7>", text) + text = re.sub("(?im)(<)((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?)(\n +)([A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])(>)", + "\g<1>\g<6>\">\g<2>\g<5>\g<6>\">\g<6>\g<7>", text) + # Link crossing a line, where first line ends in '-' or '/' + text = re.sub("(?im)(\s|^|[^=]\"|\()((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?[-/])(\n +)([A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])([.,)\"\s]|$)", + "\g<1>\g<6>\">\g<2>\g<5>\g<6>\">\g<6>\g<7>", text) + text = re.sub("(?im)(<)((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?)(\n +)([A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])(>)", + "\g<1>\g<6>\">\g<2>\g<5>\g<6>\">\g<6>\g<7>", text) + # link crossing a line, enclosed in "<" ... ">" + text = re.sub("(?im)<((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?)(\n +)([A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])>", + "<\g<1>\g<5>\">\g<1>\g<4>\g<5>\">\g<5>>", text) + text = re.sub("(?im)(<)((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?)(\n +)([A-Za-z0-9_./@%&;?#~=-]+[A-Za-z0-9_/@%&;?#~=-])(>)", + "\g<1>\g<6>\">\g<2>\g<5>\g<6>\">\g<6>\g<7>", text) + # link crossing two lines, enclosed in "<" ... 
">" + text = re.sub("(?im)<((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?)(\n +)([A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])(\n +)([A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])>", + "<\g<1>\g<5>\g<7>\">\g<1>\g<4>\g<5>\g<7>\">\g<5>\g<6>\g<5>\g<7>\">\g<7>>", text) + text = re.sub("(?im)(<)((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?)(\n +)([A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])(\n +)([A-Za-z0-9_./@%&;?#~=-]+[A-Za-z0-9_/@%&;?#~=-])(>)", + "\g<1>\g<6>\g<8>\">\g<2>\g<5>\g<6>\g<8>\">\g<6>\g<7>\g<6>\g<8>\">\g<8>\g<9>", text) + # link on a single line + text = re.sub("(?im)(\s|^|[^=]\"|<|\()((http|https|ftp)://[:A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])([.,)\"\s]|>|$)", + "\g<1>\">\g<2>\g<4>", text) +# # Special case for licensing boilerplate +# text = text.replace('http://trustee.ietf.org/\n license-info', +# 'http://trustee.ietf.org/\n licence-info') + + # undo markup if RFC2606 domain + text = re.sub("""(?i)(.*?)""", "\g<5>", text) + + # draft markup + # draft name crossing line break + text = re.sub("([^/#=\?\w-])(draft-([-a-zA-Z0-9]+-)?)(\n +)([-a-zA-Z0-9]+[a-zA-Z0-9](.txt)?)", + "\g<1>\g<5>\">\g<2>\g<4>\g<5>\">\g<5>" % (script, extra, script, extra), text) + # draft name on one line (but don't mess with what we just did above) + text = re.sub("([^/#=\?\w>=-])(draft-[-a-zA-Z0-9]+[a-zA-Z0-9](.txt)?)", + "\g<1>\">\g<2>" % (script, extra), text) + + # rfc markup + # rfc and number on the same line + text = re.sub("""(?i)([^[/\w-])(rfc([- ]?))([0-9]+)(\W)""", + """\g<1>\">\g<2>\g<4>\g<5>""" % (script, extra), text) + # rfc and number on separate lines + text = re.sub("(?i)([^[/\w-])(rfc([-]?))(\n +)([0-9]+)(\W)", + "\g<1>\">\g<2>\g<4>\">\g<5>\g<6>" % (script, extra, script, extra), text) + # spelled out Request For Comments markup + text = re.sub("(?i)(\s)(Request\s+For\s+Comments\s+\([^)]+\)\s+)([0-9]+)", + "\g<1>\g<2>\">\g<3>" % (script, extra), text) + # bcp markup + text = re.sub("(?i)([^[/\w-])(bcp([- ]?))([0-9]+)(\W)", + "\g<1>\">\g<2>\g<4>\g<5>" % 
(script, extra), text) + text = re.sub("(?i)([^[/\w-])(bcp([-]?))(\n +)([0-9]+)(\W)", + "\g<1>\">\g<2>\g<4>\">\g<5>\g<6>" % (script, extra, script, extra), text) + + def workinprogress_replacement(match): + g1 = match.group(1) + g2 = match.group(2) + g3 = match.group(3) + # eliminate embedded hyperlinks in text we'll use as anchor text + g4 = match.group(4) + g4 = re.sub("
", text) + + # restore the indentation removed during normalization (prefixlen spaces are added after every newline) + if prefixlen: + text = re.sub("\n", "\n"+(" "*prefixlen), text) + + # When path is non-empty, replace "script?rfc=" (likewise bcp, std) with "path/rfc" and "script?draft=" with "path/". + # NOTE(review): script is interpolated into the pattern unescaped, so any regex metacharacters in it would be interpreted -- confirm callers pass plain strings. + if path: + text = re.sub("%s\?(rfc|bcp|std)=" % script, "%s/\g<1>" % path, text) + text = re.sub("%s\?draft=" % script, "%s/" % path, text) + + return text