From aba06af322b0cebe20c4f7b45aa982908bdcd794 Mon Sep 17 00:00:00 2001 From: Henrik Levkowetz Date: Tue, 12 Jun 2007 01:32:05 +0000 Subject: [PATCH] Another soup2text() tweak to better avoid indentation at paragraph start. - Legacy-Id: 330 --- ietf/utils/soup2text.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/ietf/utils/soup2text.py b/ietf/utils/soup2text.py index 817dc7b6f..dc8c91b47 100755 --- a/ietf/utils/soup2text.py +++ b/ietf/utils/soup2text.py @@ -43,14 +43,17 @@ def para(words, pre): text = "".join(words) text = unescape(text) if not pre: - text = re.sub("[\r\n\t ]+", " ", text.strip()) + #print "*** Text to be wrapped:" + #print "["+text+"]" + text = re.sub("[\t ]+", " ", text) + text = text.strip("\n") text = textwrap.fill(text) return text def normalize(str): # Normalize whitespace at the beginning and end of the string - str = re.sub("^[ \t\n]+", " ", str) - str = re.sub("[ \t\n]+$", " ", str) + str = re.sub("^[ \t]+", " ", str) + str = re.sub("[ \t]+$", " ", str) # remove comments str = re.sub("(?s)", "", str) # remove xml PIs and metainformation @@ -108,7 +111,7 @@ def soup2text(html): # some preprocessing to handle common pathological cases html = re.sub("
[ \t\n]*(
)+", "

", html) html = re.sub("
([^\n])", r"
\n\1", html) - html = re.sub("(]*>)([^ \t\n])", r"\1 \2", html) + html = re.sub("([^ \t\n])()", r"\1 \2", html) soup = TextSoup(html) return str(soup)