From 1cafcf3e9d0361a9ef26ddcc20114ae591c84ae1 Mon Sep 17 00:00:00 2001 From: Henrik Levkowetz <henrik@levkowetz.com> Date: Mon, 11 Jun 2007 20:28:19 +0000 Subject: [PATCH] Changed approach to space normalization in soup2text(). Plain whitespace stripping followed by reassembly caused too large information loss. Accompanying changes in generic diff files. - Legacy-Id: 321 --- ietf/utils/soup2text.py | 23 +++++++++++++---------- test/diff/generic-diff_produced-by-1 | 6 +++--- test/diff/generic-diff_produced-by-2 | 8 ++++---- 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/ietf/utils/soup2text.py b/ietf/utils/soup2text.py index f93de65f3..9e9a2c1d1 100755 --- a/ietf/utils/soup2text.py +++ b/ietf/utils/soup2text.py @@ -39,20 +39,23 @@ def unescape(text): text = text.replace(entity, char) # replace ampersand last return text -def para(words, pre): - text = " ".join(words) - # Fix occasional bad sentence end merges - for i in range(1,len(words)): - if words[i].startswith(". "): - now = words[i-1]+" "+words[i] - fix = words[i-1]+words[i] - text = text.replace(now, fix) +def para(words, pre): + text = "".join(words) text = unescape(text) if not pre: text = re.sub("[\r\n\t ]+", " ", text) text = textwrap.fill(text) return text +def normalize(str): + # Normalize whitespace at the beginning and end of the string + str = re.sub("^[ \t\n]+", " ", str) + str = re.sub("[ \t\n]+$", " ", str) + # remove xml PIs and metainformation + str = re.sub("<![^>]*>", "", str) + str = re.sub("<\?[^>]*\?>", "", str) + return str + def render(node, encoding='latin-1', pre=False): blocks = [] words = [] @@ -62,8 +65,8 @@ def render(node, encoding='latin-1', pre=False): if isinstance(child, NavigableString): str = child.__str__(encoding) if str and not node.pre: - str = str.strip() - if str and not str.startswith("<!") and not str.startswith("<?"): + str = normalize(str) + if str: words.append(str) elif isinstance(child, Tag): if child.name in ignore_tags: diff --git a/test/diff/generic-diff_produced-by-1 b/test/diff/generic-diff_produced-by-1 index a8fb983ed..186be67ff 100644 --- a/test/diff/generic-diff_produced-by-1 +++ b/test/diff/generic-diff_produced-by-1 @@ -1,6 +1,6 @@ @@ -12,0 +12,5 @@ -+Did you find a bug? Let us know . ++Did you find a bug? Let us know. + -+Any question or suggestion ? ++Any question or suggestion? + -+This page produced by the IETF Secretariat for the IESG \ No newline at end of file ++This page produced by the IETF Secretariat for the IESG diff --git a/test/diff/generic-diff_produced-by-2 b/test/diff/generic-diff_produced-by-2 index 2f803726e..a8ac74456 100644 --- a/test/diff/generic-diff_produced-by-2 +++ b/test/diff/generic-diff_produced-by-2 @@ -1,7 +1,7 @@ -@@ -27,0 +23,1 @@ -+Did you find a bug? Let us know . -@@ -28,0 +25,5 @@ -+Any question or suggestion ? +@@ -17,0 +17,1 @@ ++Did you find a bug? Let us know. +@@ -18,0 +19,5 @@ ++Any question or suggestion? + +This page produced by the IETF Secretariat for the IESG +