From 712cd8aa17d031e3d5f948794b339a48a7bef435 Mon Sep 17 00:00:00 2001 From: Henrik Levkowetz Date: Tue, 12 Jun 2007 20:23:09 +0000 Subject: [PATCH] Tweak to again avoid space at the beginning of a paragraph. - Legacy-Id: 345 --- ietf/utils/soup2text.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ietf/utils/soup2text.py b/ietf/utils/soup2text.py index 173f4d174..ad86a31c0 100755 --- a/ietf/utils/soup2text.py +++ b/ietf/utils/soup2text.py @@ -55,8 +55,6 @@ def normalize(str): # Normalize whitespace at the beginning and end of the string str = re.sub("^[ \t]+", " ", str) str = re.sub("[ \t]+$", " ", str) - # remove comments - str = re.sub("(?s)<!--.*?-->", "", str) # remove xml PIs and metainformation str = re.sub("<![^>]*>", "", str) str = re.sub("<\?[^>]*\?>", "", str) @@ -87,9 +85,9 @@ def render(node, encoding='latin-1', pre=False): blocks.append(child.text+"\n\n") node.is_block = True else: + words.append(child.text) if child.name in space_tags and not (words and words[-1] and words[-1][-1] in [" ", "\t", "\n"]): words.append(" ") - words.append(child.text) else: raise ValueError("Unexpected node type: '%s'" % child) if words: @@ -111,6 +109,8 @@ class TextSoup(BeautifulSoup): def soup2text(html): # Line ending normalization html = html.replace("\r\n", "\n").replace("\r", "\n") + # remove comments + html = re.sub("(?s)<!--.*?-->", "", html) # some preprocessing to handle common pathological cases html = re.sub("
<br */?>[ \t\n]*(<br */?>)+", "<p/>", html) html = re.sub("<br */?>([^\n])", r"<br />\n\1", html)