Two soup2text tweaks.

- Legacy-Id: 324
This commit is contained in:
Henrik Levkowetz 2007-06-11 23:52:51 +00:00
parent 24b3eeb3d3
commit bfcb0e6c78

View file

@@ -97,7 +97,7 @@ class TextSoup(BeautifulSoup):
node = render(self, encoding) node = render(self, encoding)
str = node.text str = node.text
str = re.sub("[ \t]+", " ", str) str = re.sub("[ \t]+", " ", str)
str = re.sub("\n\n+", "\n\n", str) str = re.sub("\n\n+ *", "\n\n", str)
return str return str
def soup2text(html): def soup2text(html):
@@ -105,6 +105,7 @@ def soup2text(html):
html = html.replace("\r\n", "\n").replace("\r", "\n") html = html.replace("\r\n", "\n").replace("\r", "\n")
# some preprocessing to handle common pathological cases # some preprocessing to handle common pathological cases
html = re.sub("<br */?>[ \t\n]*(<br */?>)+", "<p/>", html) html = re.sub("<br */?>[ \t\n]*(<br */?>)+", "<p/>", html)
html = re.sub("<br */?>([^\n])", r"<br />\n\1", html)
soup = TextSoup(html) soup = TextSoup(html)
return str(soup) return str(soup)