From 1cafcf3e9d0361a9ef26ddcc20114ae591c84ae1 Mon Sep 17 00:00:00 2001
From: Henrik Levkowetz <henrik@levkowetz.com>
Date: Mon, 11 Jun 2007 20:28:19 +0000
Subject: [PATCH] Changed approach to space normalization in soup2text(). 
 Plain whitespace stripping followed by reassembly caused too much
 information loss.  Accompanying changes in generic diff files.  - Legacy-Id:
 321

---
 ietf/utils/soup2text.py              | 23 +++++++++++++----------
 test/diff/generic-diff_produced-by-1 |  6 +++---
 test/diff/generic-diff_produced-by-2 |  8 ++++----
 3 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/ietf/utils/soup2text.py b/ietf/utils/soup2text.py
index f93de65f3..9e9a2c1d1 100755
--- a/ietf/utils/soup2text.py
+++ b/ietf/utils/soup2text.py
@@ -39,20 +39,23 @@ def unescape(text):
         text = text.replace(entity, char) # replace ampersand last
     return text
 
-def para(words, pre):    
-    text = " ".join(words)
-    # Fix occasional bad sentence end merges
-    for i in range(1,len(words)):
-        if words[i].startswith(". "):
-            now = words[i-1]+" "+words[i]
-            fix = words[i-1]+words[i]
-            text = text.replace(now, fix)
+def para(words, pre):
+    text = "".join(words)
     text = unescape(text)
     if not pre:
         text = re.sub("[\r\n\t ]+", " ", text)
         text = textwrap.fill(text)  
     return text
 
+def normalize(str):
+    # Normalize whitespace at the beginning and end of the string
+    str = re.sub("^[ \t\n]+", " ", str)
+    str = re.sub("[ \t\n]+$", " ", str)
+    # remove xml PIs and metainformation
+    str = re.sub("<![^>]*>", "", str)
+    str = re.sub("<\?[^>]*\?>", "", str)
+    return str
+
 def render(node, encoding='latin-1', pre=False):
     blocks = []
     words = []
@@ -62,8 +65,8 @@ def render(node, encoding='latin-1', pre=False):
         if isinstance(child, NavigableString):
             str = child.__str__(encoding)
             if str and not node.pre:
-                str = str.strip()
-            if str and not str.startswith("<!") and not str.startswith("<?"):
+                str = normalize(str)
+            if str:
                 words.append(str)
         elif isinstance(child, Tag):
             if child.name in ignore_tags:
diff --git a/test/diff/generic-diff_produced-by-1 b/test/diff/generic-diff_produced-by-1
index a8fb983ed..186be67ff 100644
--- a/test/diff/generic-diff_produced-by-1
+++ b/test/diff/generic-diff_produced-by-1
@@ -1,6 +1,6 @@
 @@ -12,0 +12,5 @@
-+Did you find a bug? Let us know .
++Did you find a bug? Let us know.
 +
-+Any question or suggestion ?
++Any question or suggestion?
 +
-+This page produced by the IETF Secretariat for the IESG
\ No newline at end of file
++This page produced by the IETF Secretariat for the IESG
diff --git a/test/diff/generic-diff_produced-by-2 b/test/diff/generic-diff_produced-by-2
index 2f803726e..a8ac74456 100644
--- a/test/diff/generic-diff_produced-by-2
+++ b/test/diff/generic-diff_produced-by-2
@@ -1,7 +1,7 @@
-@@ -27,0 +23,1 @@
-+Did you find a bug? Let us know .
-@@ -28,0 +25,5 @@
-+Any question or suggestion ?
+@@ -17,0 +17,1 @@
++Did you find a bug? Let us know.
+@@ -18,0 +19,5 @@
++Any question or suggestion?
 +
 +This page produced by the IETF Secretariat for the IESG
 +