Changed the approach to whitespace normalization in soup2text(). Plain whitespace stripping followed by reassembly lost too much information. Accompanying changes in the generic diff files.

- Legacy-Id: 321
This commit is contained in:
Henrik Levkowetz 2007-06-11 20:28:19 +00:00
parent 49ee9f8cd7
commit 1cafcf3e9d
3 changed files with 20 additions and 17 deletions

View file

@ -39,20 +39,23 @@ def unescape(text):
text = text.replace(entity, char) # replace ampersand last
return text
def para(words, pre):
text = " ".join(words)
# Fix occasional bad sentence end merges
for i in range(1,len(words)):
if words[i].startswith(". "):
now = words[i-1]+" "+words[i]
fix = words[i-1]+words[i]
text = text.replace(now, fix)
# Assemble the collected text fragments of one block into a single string.
#   words -- list of string fragments; they are concatenated with NO separator,
#            so any spacing between fragments must already be in the fragments.
#   pre   -- truthy when the text is preformatted; in that case whitespace is
#            not collapsed.
# Returns the assembled string.
def para(words, pre):
text = "".join(words)
# Entity replacement is delegated to unescape(), defined earlier in this file.
text = unescape(text)
if not pre:
# Collapse every run of CR, LF, tab or space into one space.
text = re.sub("[\r\n\t ]+", " ", text)
# NOTE(review): indentation is lost in this diff view, so it is unclear whether
# the re-wrap below belongs inside the `if not pre:` branch -- confirm against
# the real file.
text = textwrap.fill(text)
return text
def normalize(str):
    """Normalize edge whitespace and drop markup declarations from a string.

    A run of spaces, tabs or newlines at the very start or very end of
    ``str`` is replaced by a single space rather than removed. Interior
    whitespace is left untouched. Afterwards every ``<!...>`` declaration
    and every ``<?...?>`` processing instruction is deleted.

    The parameter name shadows the builtin ``str``; it is kept unchanged so
    that existing callers (including keyword callers) keep working.

    Returns the cleaned string, which may be empty.
    """
    # Raw strings are used for all patterns so that regex escapes such as
    # ``\?`` reach the regex engine without relying on Python passing
    # unknown string escapes through (which newer Pythons warn about).

    # Normalize whitespace at the beginning and end of the string
    str = re.sub(r"^[ \t\n]+", " ", str)
    str = re.sub(r"[ \t\n]+$", " ", str)
    # Remove xml PIs and metainformation
    str = re.sub(r"<![^>]*>", "", str)
    str = re.sub(r"<\?[^>]*\?>", "", str)
    return str
def render(node, encoding='latin-1', pre=False):
blocks = []
words = []
@ -62,8 +65,8 @@ def render(node, encoding='latin-1', pre=False):
if isinstance(child, NavigableString):
str = child.__str__(encoding)
if str and not node.pre:
str = str.strip()
if str and not str.startswith("<!") and not str.startswith("<?"):
str = normalize(str)
if str:
words.append(str)
elif isinstance(child, Tag):
if child.name in ignore_tags:

View file

@ -1,6 +1,6 @@
@@ -12,0 +12,5 @@
+Did you find a bug? Let us know .
+Did you find a bug? Let us know.
+
+Any question or suggestion ?
+Any question or suggestion?
+
+This page produced by the IETF Secretariat for the IESG
+This page produced by the IETF Secretariat for the IESG

View file

@ -1,7 +1,7 @@
@@ -27,0 +23,1 @@
+Did you find a bug? Let us know .
@@ -28,0 +25,5 @@
+Any question or suggestion ?
@@ -17,0 +17,1 @@
+Did you find a bug? Let us know.
@@ -18,0 +19,5 @@
+Any question or suggestion?
+
+This page produced by the IETF Secretariat for the IESG
+