Changed the approach to whitespace normalization in soup2text(). Plain whitespace stripping followed by reassembly lost too much information. Accompanying changes in the generic diff files.
- Legacy-Id: 321
This commit is contained in:
parent
49ee9f8cd7
commit
1cafcf3e9d
|
@ -40,19 +40,22 @@ def unescape(text):
|
|||
return text
|
||||
|
||||
def para(words, pre):
    """Assemble text fragments into one paragraph string.

    The fragments in *words* are concatenated without a separator and the
    result is passed through unescape().  Unless *pre* is true, every run
    of carriage returns, newlines, tabs and spaces is collapsed to a single
    space and the text is re-wrapped with textwrap.fill().
    """
    # An earlier space-joined string and its sentence-end fix-up loop were
    # overwritten by this assignment before being read, so only this join
    # is kept.
    text = "".join(words)
    text = unescape(text)
    if not pre:
        # NOTE(review): the original text carries no indentation; both
        # statements are assumed to belong to this branch -- confirm.
        text = re.sub("[\r\n\t ]+", " ", text)
        text = textwrap.fill(text)
    return text
|
||||
|
||||
def normalize(str):
    """Normalize edge whitespace and drop XML PIs/metainformation.

    A run of spaces, tabs and newlines at the very beginning or the very
    end of the string is replaced by a single space; whitespace inside the
    string is left alone.  ``<!...>`` and ``<?...?>`` constructs are removed.
    Returns the resulting string.
    """
    # The parameter keeps its historical name for compatibility with
    # callers; work on a local alias so the builtin's name is not rebound
    # repeatedly below.
    text = str
    # Normalize whitespace at the beginning and end of the string
    text = re.sub(r"^[ \t\n]+", " ", text)
    text = re.sub(r"[ \t\n]+$", " ", text)
    # Remove xml PIs and metainformation.  Raw strings keep ``\?`` a regex
    # escape instead of an invalid string-literal escape.
    text = re.sub(r"<![^>]*>", "", text)
    text = re.sub(r"<\?[^>]*\?>", "", text)
    return text
|
||||
|
||||
def render(node, encoding='latin-1', pre=False):
|
||||
blocks = []
|
||||
words = []
|
||||
|
@ -62,8 +65,8 @@ def render(node, encoding='latin-1', pre=False):
|
|||
if isinstance(child, NavigableString):
|
||||
str = child.__str__(encoding)
|
||||
if str and not node.pre:
|
||||
str = str.strip()
|
||||
if str and not str.startswith("<!") and not str.startswith("<?"):
|
||||
str = normalize(str)
|
||||
if str:
|
||||
words.append(str)
|
||||
elif isinstance(child, Tag):
|
||||
if child.name in ignore_tags:
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
@@ -12,0 +12,5 @@
|
||||
+Did you find a bug? Let us know .
|
||||
+Did you find a bug? Let us know.
|
||||
+
|
||||
+Any question or suggestion ?
|
||||
+Any question or suggestion?
|
||||
+
|
||||
+This page produced by the IETF Secretariat for the IESG
|
|
@ -1,7 +1,7 @@
|
|||
@@ -27,0 +23,1 @@
|
||||
+Did you find a bug? Let us know .
|
||||
@@ -28,0 +25,5 @@
|
||||
+Any question or suggestion ?
|
||||
@@ -17,0 +17,1 @@
|
||||
+Did you find a bug? Let us know.
|
||||
@@ -18,0 +19,5 @@
|
||||
+Any question or suggestion?
|
||||
+
|
||||
+This page produced by the IETF Secretariat for the IESG
|
||||
+
|
||||
|
|
Loading…
Reference in a new issue