From 7f512b488933816ad827b646e539d64ee36d0b78 Mon Sep 17 00:00:00 2001
From: Henrik Levkowetz
Date: Mon, 11 Jun 2007 07:47:56 +0000
Subject: [PATCH] make soup2text convert numeric character codes (e.g., "&#39;") too. - Legacy-Id: 306

---
 ietf/utils/soup2text.py | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/ietf/utils/soup2text.py b/ietf/utils/soup2text.py
index b28ef4f34..3e7a50af0 100755
--- a/ietf/utils/soup2text.py
+++ b/ietf/utils/soup2text.py
@@ -13,7 +13,31 @@ pre_tags = ["pre"]
 entities = [("&lt;", "<"), ("&gt;", ">"),
             ("&quot;", '"'), ("&apos;", "'"),
             ("&nbsp;", " "),
-            ("&amp;", "&"), ]
+            ("&amp;", "&"), ] # ampersand last
+
+def unescape(text):
+    # Unescape character codes (if possible)
+    start = 0
+    while True:
+        try:
+            pos = text.index("&#", start)
+        except ValueError:
+            break
+        match = re.match("&#\d+;", text[pos:])
+        if match:
+            str = match.group()
+            num = int(str[2:-1])
+            if num < 256:
+                text = text[:pos] + chr(num) + text[pos+len(str):]
+                start = pos + 1
+            else:
+                start = pos + len(str)
+        else:
+            start = pos + 2
+    # unescape character entities
+    for entity, char in entities:
+        text = text.replace(entity, char) # replace ampersand last
+    return text
 
 def para(words, pre):
     text = " ".join(words)
@@ -23,8 +47,7 @@ def para(words, pre):
             now = words[i-1]+" "+words[i]
             fix = words[i-1]+words[i]
             text = text.replace(now, fix)
-    for entity, char in entities:
-        text = text.replace(entity, char)
+    text = unescape(text)
     if not pre:
         text = re.sub("[\r\n\t ]+", " ", text)
         text = textwrap.fill(text)