Changed the approach to space normalization in soup2text(). Plain whitespace stripping followed by reassembly caused too much information loss. Accompanying changes in the generic diff files.
- Legacy-Id: 321
This commit is contained in:
parent
49ee9f8cd7
commit
1cafcf3e9d
|
@ -39,20 +39,23 @@ def unescape(text):
|
||||||
text = text.replace(entity, char) # replace ampersand last
|
text = text.replace(entity, char) # replace ampersand last
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def para(words, pre):
|
def para(words, pre):
|
||||||
text = " ".join(words)
|
text = "".join(words)
|
||||||
# Fix occasional bad sentence end merges
|
|
||||||
for i in range(1,len(words)):
|
|
||||||
if words[i].startswith(". "):
|
|
||||||
now = words[i-1]+" "+words[i]
|
|
||||||
fix = words[i-1]+words[i]
|
|
||||||
text = text.replace(now, fix)
|
|
||||||
text = unescape(text)
|
text = unescape(text)
|
||||||
if not pre:
|
if not pre:
|
||||||
text = re.sub("[\r\n\t ]+", " ", text)
|
text = re.sub("[\r\n\t ]+", " ", text)
|
||||||
text = textwrap.fill(text)
|
text = textwrap.fill(text)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
|
def normalize(str):
|
||||||
|
# Normalize whitespace at the beginning and end of the string
|
||||||
|
str = re.sub("^[ \t\n]+", " ", str)
|
||||||
|
str = re.sub("[ \t\n]+$", " ", str)
|
||||||
|
# remove xml PIs and metainformation
|
||||||
|
str = re.sub("<![^>]*>", "", str)
|
||||||
|
str = re.sub("<\?[^>]*\?>", "", str)
|
||||||
|
return str
|
||||||
|
|
||||||
def render(node, encoding='latin-1', pre=False):
|
def render(node, encoding='latin-1', pre=False):
|
||||||
blocks = []
|
blocks = []
|
||||||
words = []
|
words = []
|
||||||
|
@ -62,8 +65,8 @@ def render(node, encoding='latin-1', pre=False):
|
||||||
if isinstance(child, NavigableString):
|
if isinstance(child, NavigableString):
|
||||||
str = child.__str__(encoding)
|
str = child.__str__(encoding)
|
||||||
if str and not node.pre:
|
if str and not node.pre:
|
||||||
str = str.strip()
|
str = normalize(str)
|
||||||
if str and not str.startswith("<!") and not str.startswith("<?"):
|
if str:
|
||||||
words.append(str)
|
words.append(str)
|
||||||
elif isinstance(child, Tag):
|
elif isinstance(child, Tag):
|
||||||
if child.name in ignore_tags:
|
if child.name in ignore_tags:
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
@@ -12,0 +12,5 @@
|
@@ -12,0 +12,5 @@
|
||||||
+Did you find a bug? Let us know .
|
+Did you find a bug? Let us know.
|
||||||
+
|
+
|
||||||
+Any question or suggestion ?
|
+Any question or suggestion?
|
||||||
+
|
+
|
||||||
+This page produced by the IETF Secretariat for the IESG
|
+This page produced by the IETF Secretariat for the IESG
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
@@ -27,0 +23,1 @@
|
@@ -17,0 +17,1 @@
|
||||||
+Did you find a bug? Let us know .
|
+Did you find a bug? Let us know.
|
||||||
@@ -28,0 +25,5 @@
|
@@ -18,0 +19,5 @@
|
||||||
+Any question or suggestion ?
|
+Any question or suggestion?
|
||||||
+
|
+
|
||||||
+This page produced by the IETF Secretariat for the IESG
|
+This page produced by the IETF Secretariat for the IESG
|
||||||
+
|
+
|
||||||
|
|
Loading…
Reference in a new issue