Another soup2html() tweak to better avoid indentation at paragraph start.

- Legacy-Id: 330
This commit is contained in:
Henrik Levkowetz 2007-06-12 01:32:05 +00:00
parent 541b041cdc
commit aba06af322

View file

@@ -43,14 +43,17 @@ def para(words, pre):
text = "".join(words) text = "".join(words)
text = unescape(text) text = unescape(text)
if not pre: if not pre:
text = re.sub("[\r\n\t ]+", " ", text.strip()) #print "*** Text to be wrapped:"
#print "["+text+"]"
text = re.sub("[\t ]+", " ", text)
text = text.strip("\n")
text = textwrap.fill(text) text = textwrap.fill(text)
return text return text
def normalize(str): def normalize(str):
# Normalize whitespace at the beginning and end of the string # Normalize whitespace at the beginning and end of the string
str = re.sub("^[ \t\n]+", " ", str) str = re.sub("^[ \t]+", " ", str)
str = re.sub("[ \t\n]+$", " ", str) str = re.sub("[ \t]+$", " ", str)
# remove comments # remove comments
str = re.sub("(?s)<!--.*?-->", "", str) str = re.sub("(?s)<!--.*?-->", "", str)
# remove xml PIs and metainformation # remove xml PIs and metainformation
@@ -108,7 +111,7 @@ def soup2text(html):
# some preprocessing to handle common pathological cases # some preprocessing to handle common pathological cases
html = re.sub("<br */?>[ \t\n]*(<br */?>)+", "<p/>", html) html = re.sub("<br */?>[ \t\n]*(<br */?>)+", "<p/>", html)
html = re.sub("<br */?>([^\n])", r"<br />\n\1", html) html = re.sub("<br */?>([^\n])", r"<br />\n\1", html)
html = re.sub("(<t[hd][^>]*>)([^ \t\n])", r"\1 \2", html) html = re.sub("([^ \t\n])(</t[hd].*?>)", r"\1 \2", html)
soup = TextSoup(html) soup = TextSoup(html)
return str(soup) return str(soup)