Added rfcmarkup code. Will probably be migrated to a separate lib in time.

- Legacy-Id: 12994
This commit is contained in:
Henrik Levkowetz 2017-03-11 14:02:16 +00:00
parent 58dae78a43
commit 319a4850d6

379
ietf/utils/rfcmarkup.py Normal file
View file

@ -0,0 +1,379 @@
import re
import cgi
import urllib
def markup(text, path=".", script="", extra=""):
# ------------------------------------------------------------------------
# Start of markup handling
# Convert \r which is not followed or preceded by a \n to \n
# (in case this is a mac document)
text = re.sub("([^\n])\r([^\n])", "\g<1>\n\g<2>", text)
# Strip \r (in case this is a ms format document):
text = text.replace("\r","")
# -------------
# Normalization
# Remove whitespace at the end of lines
text = re.sub("[\t ]+\n", "\n", text)
# Remove whitespace (including formfeeds) at the end of the document.
# (Trailing formfeeds will result in trailing blank pages.)
text = re.sub("[\t \r\n\f]+$", "\n", text)
text = text.expandtabs()
# Remove extra blank lines at the start of the document
text = re.sub("^\n*", "", text, 1)
# Fix up page breaks:
# \f should aways be preceeded and followed by \n
text = re.sub("([^\n])\f", "\g<1>\n\f", text)
text = re.sub("\f([^\n])", "\f\n\g<1>", text)
# [Page nn] should be followed by \n\f\n
text = re.sub("(?i)(\[Page [0-9ivxlc]+\])[\n\f\t ]*(\n *[^\n\f\t ])", "\g<1>\n\f\g<2>", text)
# Normalize indentation
linestarts = re.findall("(?m)^([ ]*)\S", text);
prefixlen = 72
for start in linestarts:
if len(start) < prefixlen:
prefixlen = len(start)
if prefixlen:
text = re.sub("\n"+(" "*prefixlen), "\n", text)
# reference name tag markup
reference = {}
ref_url = {}
## Locate the start of the References section as the first reference
## definition after the last reference usage
## Incomplete 05 Aug 2010 17:05:27 XXXX Complete this!!
##ref_usages = re.findall("(\W)(\[)([-\w.]+)((, ?[-\w.]+)*\])", text)
ref_defs = re.findall("(?sm)^( *\n *)\[([-\w.]+?)\]( +)(.*?)(\n *)$", text)
##ref_pos = [ match.start() for match in ref_usages ]
##def_pos = [ match.start() for match in ref_defs ]
##ref_pos = [ pos for pos in ref_pos if not pos in ref_defs ]
##last_ref_pos = ref_pos[-1] if ref_pos else None
#sys.stderr.write("ref_defs: %s\n" % repr(ref_defs))
for tuple in ref_defs:
title_match = re.search("(?sm)^(.*?(\"[^\"]+?\").+?|.*?(,[^,]+?,)[^,]+?)$", tuple[3])
if title_match:
reftitle = title_match.group(2) or title_match.group(3).strip("[ ,]+")
# Get rid of page break information inside the title
reftitle = re.sub("(?s)\n\n\S+.*\n\n", "", reftitle)
reftitle = cgi.escape(reftitle, quote=True)
reftitle = re.sub("[\n\t ]+", " ", reftitle) # Remove newlines and tabs
reference[tuple[1]] = reftitle
url_match = re.search(r"(http|https|ftp)://\S+", tuple[3])
if url_match:
ref_url[tuple[1]] = url_match.group(0)
# -------------
# escape any html significant characters
text = cgi.escape(text);
# -------------
# Adding markup
text = "<pre>"+text+"</pre>"
# Typewriter-style underline:
text = re.sub("_[\b](.)", "<u>\g<1></u>", text)
# Line number markup goes here
# Obsoletes: ... markup
def rfclist_replace(keyword, text):
def replacement(match):
group = list(match.groups(""))
group[3] = re.sub("\d+", """<a href=\"%s?%srfc=\g<0>\">\g<0></a>""" % (script, extra), group[3])
if group[8]:
group[8] = re.sub("\d+", """<a href=\"%s?%srfc=\g<0>\">\g<0></a>""" % (script, extra), group[8])
else:
group[8] = ""
return "\n%s%s%s\n%s%s" % (group[0], group[3], group[5], group[7], group[8])
text = re.sub("\n(%s( RFCs| RFC)?: ?( RFCs| RFC)?)(( \d+,| \d+)+)(.*)\n(( *)((\d+, )*(\d+)))*" % keyword, replacement, text, 1)
return text
text = rfclist_replace("Obsoletes", text)
text = rfclist_replace("Updates", text)
lines = text.splitlines(True)
head = "".join(lines[:28])
rest = "".join(lines[28:])
# title markup
head = re.sub("""(?im)(([12][0-9][0-9][0-9]|^Obsoletes.*|^Category: (Standards Track|Informational|Experimental|Best Current Practice)) *\n\n+ +)([A-Z][^\n]+)$""", """\g<1><span class=\"h1\">\g<4></span>""", head, 1)
head = re.sub("""(?i)(<span class="h1".+</span>)(\n +)([^<\n]+)\n""", """\g<1>\g<2><span class="h1">\g<3></span>\n""", head, 1)
head = re.sub("""(?i)(<span class="h1".+</span>)(\n +)([^<\n]+)\n""", """\g<1>\g<2><span class="h1">\g<3></span>\n""", head, 1)
text = head + rest
# http link markup
# link crossing a line. Not permitting ":" after the line break will
# result in some URLs broken across lines not being recognized, but
# will on the other hand correctly handle a series of URL listed line
# by line, one on each line.
# Link crossing a line, where the continuation contains '.' or '/'
text = re.sub("(?im)(\s|^|[^=]\"|\()((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?)(\n +)([A-Za-z0-9_./@%&?#~=-]+[./][A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])([.,)\"\s]|$)",
"\g<1><a href=\"\g<2>\g<6>\">\g<2></a>\g<5><a href=\"\g<2>\g<6>\">\g<6></a>\g<7>", text)
text = re.sub("(?im)(&lt;)((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?)(\n +)([A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])(&gt;)",
"\g<1><a href=\"\g<2>\g<6>\">\g<2></a>\g<5><a href=\"\g<2>\g<6>\">\g<6></a>\g<7>", text)
# Link crossing a line, where first line ends in '-' or '/'
text = re.sub("(?im)(\s|^|[^=]\"|\()((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?[-/])(\n +)([A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])([.,)\"\s]|$)",
"\g<1><a href=\"\g<2>\g<6>\">\g<2></a>\g<5><a href=\"\g<2>\g<6>\">\g<6></a>\g<7>", text)
text = re.sub("(?im)(&lt;)((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?)(\n +)([A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])(&gt;)",
"\g<1><a href=\"\g<2>\g<6>\">\g<2></a>\g<5><a href=\"\g<2>\g<6>\">\g<6></a>\g<7>", text)
# link crossing a line, enclosed in "<" ... ">"
text = re.sub("(?im)<((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?)(\n +)([A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])>",
"<\g<1><a href=\"\g<1>\g<5>\">\g<1></a>\g<4><a href=\"\g<1>\g<5>\">\g<5></a>>", text)
text = re.sub("(?im)(&lt;)((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?)(\n +)([A-Za-z0-9_./@%&;?#~=-]+[A-Za-z0-9_/@%&;?#~=-])(&gt;)",
"\g<1><a href=\"\g<2>\g<6>\">\g<2></a>\g<5><a href=\"\g<2>\g<6>\">\g<6></a>\g<7>", text)
# link crossing two lines, enclosed in "<" ... ">"
text = re.sub("(?im)<((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?)(\n +)([A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])(\n +)([A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])>",
"<\g<1><a href=\"\g<1>\g<5>\g<7>\">\g<1></a>\g<4><a href=\"\g<1>\g<5>\g<7>\">\g<5></a>\g<6><a href=\"\g<1>\g<5>\g<7>\">\g<7></a>>", text)
text = re.sub("(?im)(&lt;)((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?)(\n +)([A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])(\n +)([A-Za-z0-9_./@%&;?#~=-]+[A-Za-z0-9_/@%&;?#~=-])(&gt;)",
"\g<1><a href=\"\g<2>\g<6>\g<8>\">\g<2></a>\g<5><a href=\"\g<2>\g<6>\g<8>\">\g<6></a>\g<7><a href=\"\g<2>\g<6>\g<8>\">\g<8></a>\g<9>", text)
# link on a single line
text = re.sub("(?im)(\s|^|[^=]\"|&lt;|\()((http|https|ftp)://[:A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])([.,)\"\s]|&gt;|$)",
"\g<1><a href=\"\g<2>\">\g<2></a>\g<4>", text)
# # Special case for licensing boilerplate
# text = text.replace('<a href="http://trustee.ietf.org/">http://trustee.ietf.org/</a>\n license-info',
# '<a href="http://trustee.ietf.org/licence-info">http://trustee.ietf.org/</a>\n <a href="http://trustee.ietf.org/licence-info">licence-info</a>')
# undo markup if RFC2606 domain
text = re.sub("""(?i)<a href="[a-z]*?://([a-z0-9_-]+?\.)?example(\.(com|org|net))?(/.*?)?">(.*?)</a>""", "\g<5>", text)
# draft markup
# draft name crossing line break
text = re.sub("([^/#=\?\w-])(draft-([-a-zA-Z0-9]+-)?)(\n +)([-a-zA-Z0-9]+[a-zA-Z0-9](.txt)?)",
"\g<1><a href=\"%s?%sdraft=\g<2>\g<5>\">\g<2></a>\g<4><a href=\"%s?%sdraft=\g<2>\g<5>\">\g<5></a>" % (script, extra, script, extra), text)
# draft name on one line (but don't mess with what we just did above)
text = re.sub("([^/#=\?\w>=-])(draft-[-a-zA-Z0-9]+[a-zA-Z0-9](.txt)?)",
"\g<1><a href=\"%s?%sdraft=\g<2>\">\g<2></a>" % (script, extra), text)
# rfc markup
# rfc and number on the same line
text = re.sub("""(?i)([^[/\w-])(rfc([- ]?))([0-9]+)(\W)""",
"""\g<1><a href=\"%s?%srfc=\g<4>\">\g<2>\g<4></a>\g<5>""" % (script, extra), text)
# rfc and number on separate lines
text = re.sub("(?i)([^[/\w-])(rfc([-]?))(\n +)([0-9]+)(\W)",
"\g<1><a href=\"%s?%srfc=\g<5>\">\g<2></a>\g<4><a href=\"%s?%srfc=\g<5>\">\g<5></a>\g<6>" % (script, extra, script, extra), text)
# spelled out Request For Comments markup
text = re.sub("(?i)(\s)(Request\s+For\s+Comments\s+\([^)]+\)\s+)([0-9]+)",
"\g<1>\g<2><a href=\"%s?%srfc=\g<3>\">\g<3></a>" % (script, extra), text)
# bcp markup
text = re.sub("(?i)([^[/\w-])(bcp([- ]?))([0-9]+)(\W)",
"\g<1><a href=\"%s?%sbcp=\g<4>\">\g<2>\g<4></a>\g<5>" % (script, extra), text)
text = re.sub("(?i)([^[/\w-])(bcp([-]?))(\n +)([0-9]+)(\W)",
"\g<1><a href=\"%s?%sbcp=\g<5>\">\g<2></a>\g<4><a href=\"%s?%sbcp=\g<5>\">\g<5></a>\g<6>" % (script, extra, script, extra), text)
def workinprogress_replacement(match):
g1 = match.group(1)
g2 = match.group(2)
g3 = match.group(3)
# eliminate embedded hyperlinks in text we'll use as anchor text
g4 = match.group(4)
g4 = re.sub("<a.+?>(.+?)</a>", "\g<1>", g4)
g4url = urllib.quote_plus(g4)
g5 = match.group(5)
return """%s[<a id=\"ref-%s\">%s</a>]%s<a style=\"text-decoration: none\" href='https://www.google.com/search?sitesearch=tools.ietf.org%%2Fhtml%%2F&amp;q=inurl:draft-+%s'>%s</a>%s""" % (g1, g2, g2, g3, g4url, g4, g5)
text = re.sub("(\n *\n *)\[([-\w.]+)\](\s+.*?)(\".+\")(,\s+Work\s+in\s+Progress.)", workinprogress_replacement, text)
text = re.sub("(\n *\n *)\[([-\w.]+)\](\s)", "\g<1>[<a id=\"ref-\g<2>\">\g<2></a>]\g<3>", text)
text = re.sub("(\n *\n *)\[(RFC [-\w.]+)\](\s)", "\g<1>[<a id=\"ref-\g<2>\">\g<2></a>]\g<3>", text)
ref_targets = re.findall('<a id="ref-(.*?)"', text)
# reference link markup
def reference_replacement(match):
pre = match.group(1)
beg = match.group(2)
tag = match.group(3)
end = match.group(4)
isrfc = re.match("(?i)^rfc[ -]?([0-9]+)$", tag)
if isrfc:
rfcnum = isrfc.group(1)
if tag in reference:
return """%s%s<a href="%s?%srfc=%s" title="%s">%s</a>%s""" % (pre, beg, script, extra, rfcnum, reference[tag], tag, end)
else:
return """%s%s<a href="%s?%srfc=%s">%s</a>%s""" % (pre, beg, script, extra, rfcnum , tag, end)
else:
if tag in ref_targets:
if tag in reference:
return """%s%s<a href="#ref-%s" title="%s">%s</a>%s""" % (pre, beg, tag, reference[tag], tag, end)
else:
return """%s%s<a href="#ref-%s">%s</a>%s""" % (pre, beg, tag, tag, end)
else:
return match.group(0)
# Group: 1 2 3 45
text = re.sub("(\W)(\[)([-\w.]+)((, ?[-\w.]+)*\])", reference_replacement, text)
text = re.sub("(\W)(\[)(RFC [0-9]+)((, ?RFC [0-9]+)*\])", reference_replacement, text)
while True:
old = text
text = re.sub("(\W)(\[(?:<a.*?>.*?</a>, ?)+)([-\w.]+)((, ?[-\w.]+)*\])", reference_replacement, text)
if text == old:
break
while True:
old = text
text = re.sub("(\W)(\[(?:<a.*?>.*?</a>, ?)+)(RFC [-\w.]+)((, ?RFC [-\w.]+)*\])", reference_replacement, text)
if text == old:
break
# greying out the page headers and footers
text = re.sub("\n(.+\[Page \w+\])\n\f\n(.+)\n", """\n<span class="grey">\g<1></span>\n\f\n<span class="grey">\g<2></span>\n""", text)
# contents link markup: section links
# 1 2 3 4 5 6 7
text = re.sub("(?m)^(\s*)(\d+(\.\d+)*)(\.?[ ]+)(.*[^ .])( *\. ?\.)(.*[0-9])$", """\g<1><a href="#section-\g<2>">\g<2></a>\g<4>\g<5>\g<6>\g<7>""", text)
text = re.sub("(?m)^(\s*)(Appendix |)([A-Z](\.\d+)*)(\.?[ ]+)(.*[^ .])( *\. ?\.)(.*[0-9])$", """\g<1><a href="#appendix-\g<3>">\g<2>\g<3></a>\g<5>\g<6>\g<7>\g<8>""", text)
# page number markup
multidoc_separator = "========================================================================"
if re.search(multidoc_separator, text):
parts = re.split(multidoc_separator, text)
for i in range(len(parts)):
parts[i] = re.sub("(?si)(\f)([^\f]*\[Page (\w+)\])", "\g<1><div id=\"%(page)s-\g<3>\" class=\"anchor-target\"></div>\g<2>"%{"page": "page-%s"%(i+1)}, parts[i])
parts[i] = re.sub("(?i)(\. ?\. +|\. \. \.|\.\.\. *)([0-9ivxlc]+)( *\n)", "\g<1><a href=\"#%(page)s-\g<2>\">\g<2></a>\g<3>"%{"page": "page-%s"%(i+1)}, parts[i])
text = multidoc_separator.join(parts)
else:
# page name tag markup
text = re.sub("(?si)(\f)([^\f]*\[Page (\w+)\])", "\g<1><div id=\"page-\g<3>\" class=\"anchor-target\"></div>\g<2>", text)
# contents link markup: page numbers
text = re.sub("(?i)(\. ?\. +|\. \. \.|\.\.\. *)([0-9ivxlc]+)( *\n)", "\g<1><a href=\"#page-\g<2>\">\g<2></a>\g<3>", text)
# section number tag markup
def section_anchor_replacement(match):
# exclude TOC entries
mstring = match.group(0)
if " \. \. " in mstring or "\.\.\." in mstring:
return mstring
level = len(re.findall("[^\.]+", match.group(1)))+1
if level > 6:
level = 6
html = """<span class="h%s anchor-target"><a class=\"selflink\" id=\"section-%s\" href=\"#section-%s\">%s</a>%s</span>""" % (level, match.group(1), match.group(1), match.group(1), match.group(3))
html = html.replace("\n", """</span>\n<span class="h%s anchor-target">""" % level)
return html
text = re.sub("(?im)^(\d+(\.\d+)*)(\.?[ ]+\S.*?(\n +\w+.*)?( |$))", section_anchor_replacement, text)
#text = re.sub("(?i)(\n *\n *)(\d+(\.\d+)*)(\.?[ ].*)", section_replacement, text)
# section number link markup
text = re.sub("(?i)(section\s)(\d+(\.\d+)*)", "<a href=\"#section-\g<2>\">\g<1>\g<2></a>", text)
text = re.sub("(?i)(section)\n(\s+)(\d+(\.\d+)*)", "<a href=\"#section-\g<3>\">\g<1></a>\n\g<2><a href=\"#section-\g<3>\">\g<3></a>", text)
# Special cases for licensing boilerplate
text = text.replace('<a href="#section-4">Section 4</a>.e of the Trust Legal Provisions',
'Section 4.e of the <a href="https://trustee.ietf.org/license-info">Trust Legal Provisions</a>')
while True:
old = text
text = re.sub("(?i)(sections\s(<a.*?>.*?</a>(,\s|\s?-\s?|\sthrough\s|\sor\s|\sto\s|,?\sand\s))*)(\d+(\.\d+)*)", "\g<1><a href=\"#section-\g<4>\">\g<4></a>", text)
if text == old:
break
# appendix number tag markup
def appendix_replacement(match):
# exclude TOC entries
mstring = match.group(0)
if " \. \. " in mstring or "\.\.\." in mstring:
return mstring
txt = match.group(4)
num = match.group(2).rstrip('.')
if num != match.group(2):
txt = "." + txt
level = len(re.findall("[^\.]+", num))+1
if level > 6:
level = 6
return """<span class="h%s anchortarget"><a class=\"selflink\" id=\"appendix-%s\" href=\"#appendix-%s\">%s%s</a>%s</span>""" % (level, num, num, match.group(1), num, txt)
text = re.sub("(?m)^(Appendix |)([A-Z](\.|\.\d+)+)(\.?[ ].*)$", appendix_replacement, text)
#text = re.sub("(?i)(\n *\n *)(\d+(\.\d+)*)(\.?[ ].*)", appendix_replacement, text)
# appendix number link markup
text = re.sub(" ([Aa]ppendix\s)([A-Z](\.\d+)*)", " <a href=\"#appendix-\g<2>\">\g<1>\g<2></a>", text)
text = re.sub(" ([Aa]ppendix)\n(\s+)([A-Z](\.\d+)*)", " <a href=\"#appendix-\g<3>\">\g<1></a>\n\g<2><a href=\"#appendix-\g<3>\">\g<3></a>", text)
# # section x of draft-y markup
# text = re.sub("(?i)<a href=\"[^\"]*\">(section)\s(\d+(\.\d+)*)</a>(\.?\s+(of|in)\s+)<a href=\"[^\"]*\">(draft-[-.a-zA-Z0-9]+[a-zA-Z0-9])</a>", "<a href=\"%s?%surl=%s/rfc\g<7>.txt#section-\g<2>\">\g<1>&nbsp;\g<2>\g<4>\g<6>\g<7></a>" % (script, extra, rfcs), text)
# # draft-y, section x markup
# text = re.sub("(?i)<a href=\"[^\"]*\">(draft-[-.a-zA-Z0-9]+[a-zA-Z0-9])</a>(,?\s)<a href=\"[^\"]*\">(section)\s(\d+(\.\d+)*)</a>", "<a href=\"%s?%surl=%s/rfc\g<2>.txt#section-\g<5>\">\g<1>\g<2>\g<3>\g<4>&nbsp;\g<5></a>" % (script, extra, rfcs), text)
# # [draft-y], section x markup
# text = re.sub("(?i)\[<a href=\"[^>\"]+\">(draft-[-.a-zA-Z0-9]+[a-zA-Z0-9])</a>\](,?\s)<a href=\"[^>\"]*\">(section)\s(\d+(\.\d+)*)</a>", "<a href=\"%s?%surl=%s/rfc\g<2>.txt#section-\g<5>\">[\g<1>\g<2>]\g<3>\g<4>&nbsp;\g<5></a>" % (script, extra, rfcs), text)
for n in ['rfc', 'bcp', 'fyi', 'std']:
# section x of rfc y markup
text = re.sub("(?i)<a href=\"([^\"]*)\"[^>]*>(section)\s(\d+(\.\d+)*)</a>(\.?\s+(of|in)\s+)<a href=\"([^\"]*)\"[^>]*>(%s[- ]?)([0-9]+)</a>"%n,
"<a href=\"%s?%s%s=\g<9>\g<1>\">\g<2>&nbsp;\g<3>\g<5>\g<8>\g<9></a>" % (script, extra, n), text)
text = re.sub("(?i)<a href=\"([^\"]*)\"[^>]*>(section)</a>(\n\s+)<a href=\"(?:[^\"]*)\"[^>]*>(\d+(\.\d+)*)</a>(\.?\s+(of|in)\s+)<a href=\"([^\"]*)\"[^>]*>(%s[- ]?)([0-9]+)</a>"%n,
"<a href=\"%s?%s%s=\g<10>\g<1>\">\g<2></a>\g<3><a href=\"%s?%s%s=\g<10>\g<1>\">\g<4>\g<5>\g<9>\g<10></a>" % (script, extra, n, script, extra, n), text)
# appendix x of rfc y markup
text = re.sub("(?i)<a href=\"([^\"]*)\"[^>]*>(appendix)\s([A-Z](\.\d+)*)</a>(\.?\s+(of|in)\s+)<a href=\"([^\"]*)\"[^>]*>(%s[- ]?)([0-9]+)</a>"%n,
"<a href=\"%s?%s%s=\g<9>\g<1>\">\g<2>&nbsp;\g<3>\g<5>\g<8>\g<9></a>" % (script, extra, n), text)
text = re.sub("(?i)<a href=\"([^\"]*)\"[^>]*>(appendix)</a>(\n\s+)<a href=\"(?:[^\"]*)\"[^>]*>([A-Z]+(\.\d+)*)</a>(\.?\s+(of|in)\s+)<a href=\"([^\"]*)\"[^>]*>(%s[- ]?)([0-9]+)</a>"%n,
"<a href=\"%s?%s%s=\g<10>\g<1>\">\g<2></a>\g<3><a href=\"%s?%s%s=\g<10>\g<1>\">\g<4>\g<5>\g<9>\g<10></a>" % (script, extra, n, script, extra, n), text)
# rfc y, section x markup
text = re.sub("(?i)<a href=\"([^\"]*)\"[^>]*>(%s[- ]?)([0-9]+)</a>(,?\s+)<a href=\"([^\"]*)\"[^>]*>(section)\s?(([^<]*))</a>"%n,
"<a href=\"%s?%s%s=\g<3>\g<5>\">\g<2>\g<3>\g<4>\g<6>&nbsp;\g<7></a>" % (script, extra, n), text)
# rfc y, appendix x markup
text = re.sub("(?i)<a href=\"([^\"]*)\"[^>]*>(%s[- ]?)([0-9]+)</a>(,?\s+)<a href=\"([^\"]*)\"[^>]*>(appendix)\s?(([^<]*))</a>"%n,
"<a href=\"%s?%s%s=\g<3>\g<5>\">\g<2>\g<3>\g<4>\g<6>&nbsp;\g<7></a>" % (script, extra, n), text)
# section x of? [rfc y] markup
text = re.sub("(?i)<a href=\"([^\"]*)\"[^>]*>(section)\s(\d+(\.\d+)*)</a>(\.?\s+(of\s+|in\s+)?)\[<a href=\"([^\"]*)\"[^>]*>(%s[- ]?)([0-9]+)</a>\]"%n,
"<a href=\"%s?%s%s=\g<9>\g<1>\">\g<2>&nbsp;\g<3>\g<5>[\g<8>\g<9>]</a>" % (script, extra, n), text)
text = re.sub("(?i)<a href=\"([^\"]*)\"[^>]*>(section)</a>(\n\s+)<a href=\"(?:[^\"]*)\"[^>]*>(\d+(\.\d+)*)</a>(\.?\s+(of\s+|in\s+)?)\[<a href=\"([^\"]*)\"[^>]*>(%s[- ]?)([0-9]+)</a>\]"%n,
"<a href=\"%s?%s%s=\g<10>\g<1>\">\g<2></a>\g<3><a href=\"%s?%s%s=\g<10>\g<1>\">\g<4>\g<6>[\g<9>\g<10>]</a>" % (script, extra, n, script, extra, n), text)
# appendix x of? [rfc y] markup
text = re.sub("(?i)<a href=\"([^\"]*)\"[^>]*>(appendix)\s([A-Z](\.\d+)*)</a>(\.?\s+(of\s+|in\s+)?)\[<a href=\"([^\"]*)\"[^>]*>(%s[- ]?)([0-9]+)</a>\]"%n,
"<a href=\"%s?%s%s=\g<9>\g<1>\">\g<2>&nbsp;\g<3>\g<5>[\g<8>\g<9>]</a>" % (script, extra, n), text)
text = re.sub("(?i)<a href=\"([^\"]*)\"[^>]*>(appendix)</a>(\n\s+)<a href=\"(?:[^\"]*)\"[^>]*>([A-Z](\.\d+)*)</a>(\.?\s+(of\s+|in\s+)?)\[<a href=\"([^\"]*)\"[^>]*>(%s[- ]?)([0-9]+)</a>\]"%n,
"<a href=\"%s?%s%s=\g<10>\g<1>\">\g<2></a>\g<3><a href=\"%s?%s%s=\g<10>\g<1>\">\g<4>\g<6>[\g<9>\g<10>]</a>" % (script, extra, n, script, extra, n), text)
# [rfc y], section x markup
text = re.sub("(?i)\[<a href=\"([^>\"]+)\"[^>]*>(%s[- ]?)([0-9]+)</a>\](,?\s+)<a href=\"([^>\"]*)\"[^>]*>(section)\s(\d+(\.\d+)*)</a>"%n,
"<a href=\"%s?%s%s=\g<3>\g<5>\">[\g<2>\g<3>]\g<4>\g<6>&nbsp;\g<7></a>" % (script, extra, n), text)
# [rfc y], appendix x markup
text = re.sub("(?i)\[<a href=\"([^>\"]+)\"[^>]*>(%s[- ]?)([0-9]+)</a>\](,?\s+)<a href=\"([^>\"]*)\"[^>]*>(appendix)\s([A-Z](\.\d+)*)</a>"%n,
"<a href=\"%s?%s%s=\g<3>\g<5>\">[\g<2>\g<3>]\g<4>\g<6>&nbsp;\g<7></a>" % (script, extra, n), text)
# remove section link for section x.x (of|in) <something else>
old = text
text = re.sub("(?i)<a href=\"[^\"]*\"[^>]*>(section\s)(\d+(\.\d+)*)</a>(\.?[a-z]*\s+(of|in)\s+)(\[?)<a href=\"([^\"]*)\"([^>]*)>(.*)</a>(\]?)",
'\g<1>\g<2>\g<4>\g<6><a href="\g<7>"\g<8>>\g<9></a>\g<10>', text)
text = re.sub('(?i)(\[?)<a href="([^"]*#ref[^"]*)"([^>]*)>(.*?)</a>(\]?,\s+)<a href="[^"]*"[^>]*>(section\s)(\d+(\.\d+)*)</a>',
'\g<1><a href="\g<2>"\g<3>>\g<4></a>\g<5>\g<6>\g<7>', text)
# Special fix for referring to the trust legal provisons in
# boilerplate text:
text = re.sub("(?i)<a href=\"[^\"]*\"[^>]*>(section\s)(\d+(\.\d+)*)</a>(\.?[a-z]*\s+(of|in)\s*\n\s*the Trust Legal Provisions)",
'\g<1>\g<2>\g<4>', text)
#
#text = re.sub("\f", "<div class=\"newpage\" />", text)
text = re.sub("\n?\f\n?", "</pre>\n<hr class='noprint' style='width: 78ex; margin-left: 0;'/><!--NewPage--><pre class='newpage'>", text)
# restore indentation
if prefixlen:
text = re.sub("\n", "\n"+(" "*prefixlen), text)
if path:
text = re.sub("%s\?(rfc|bcp|std)=" % script, "%s/\g<1>" % path, text)
text = re.sub("%s\?draft=" % script, "%s/" % path, text)
return text