Added the ability to pass the fill and pre (preformatted) switches to the soup2text command

- Legacy-Id: 403
This commit is contained in:
Henrik Levkowetz 2007-06-15 13:28:12 +00:00
parent d8866a4495
commit de9a7ddbc4

View file

@ -41,14 +41,15 @@ def unescape(text):
text = text.replace(entity, char) # replace ampersand last text = text.replace(entity, char) # replace ampersand last
return text return text
def para(words, pre): def para(words, pre, fill):
text = "".join(words) text = "".join(words)
text = unescape(text) text = unescape(text)
if not pre: if not pre:
text = text.strip("\n") text = text.strip("\n")
text = text.lstrip() text = text.lstrip()
text = re.sub("[\t\n ]+", " ", text) text = re.sub("[\t\n ]+", " ", text)
text = textwrap.fill(text) if fill:
text = textwrap.fill(text)
return text return text
def normalize(str): def normalize(str):
@ -60,7 +61,7 @@ def normalize(str):
str = re.sub("<\?[^>]*\?>", "", str) str = re.sub("<\?[^>]*\?>", "", str)
return str return str
def render(node, encoding='latin-1', pre=False): def render(node, encoding='latin-1', pre=False, fill=True, clean=True):
blocks = [] blocks = []
words = [] words = []
node.pre = pre or node.name in pre_tags node.pre = pre or node.name in pre_tags
@ -76,11 +77,11 @@ def render(node, encoding='latin-1', pre=False):
if child.name in ignore_tags: if child.name in ignore_tags:
pass pass
else: else:
child = render(child, encoding, node.pre) child = render(child, encoding, node.pre, fill, clean)
if child.text: if child.text:
if child.is_block: if child.is_block:
if words : if words :
blocks.append(para(words, node.pre)+"\n") blocks.append(para(words, node.pre, fill)+"\n")
words = [] words = []
blocks.append(child.text+"\n\n") blocks.append(child.text+"\n\n")
node.is_block = True node.is_block = True
@ -94,22 +95,31 @@ def render(node, encoding='latin-1', pre=False):
else: else:
raise ValueError("Unexpected node type: '%s'" % child) raise ValueError("Unexpected node type: '%s'" % child)
if words: if words:
blocks.append(para(words, node.pre)) blocks.append(para(words, node.pre, fill))
node.text = ''.join(blocks) node.text = ''.join(blocks)
return node return node
class TextSoup(BeautifulSoup): class TextSoup(BeautifulSoup):
def as_text(self, encoding='latin-1', pre=False, fill=True, clean=True):
node = render(self, encoding, pre, fill, clean)
str = node.text
if clean:
str = re.sub("[ \t]+", " ", str)
str = re.sub("\n\n+", "\n\n", str)
return str
def __str__(self, encoding='latin-1', def __str__(self, encoding='latin-1',
prettyPrint=False, indentLevel=0): prettyPrint=False, indentLevel=0):
node = render(self, encoding) node = render(self, encoding, fill=False)
str = node.text str = node.text
str = re.sub("[ \t]+", " ", str) str = re.sub("[ \t]+", " ", str)
str = re.sub("\n\n+", "\n\n", str) str = re.sub("\n\n+", "\n\n", str)
return str return str
def soup2text(html): def soup2text(html, encoding='latin-1', pre=False, fill=True):
# Line ending normalization # Line ending normalization
html = html.replace("\r\n", "\n").replace("\r", "\n") html = html.replace("\r\n", "\n").replace("\r", "\n")
# remove comments # remove comments
@ -118,7 +128,7 @@ def soup2text(html):
html = re.sub("<br */?>[ \t\n]*(<br */?>)+", "<p/>", html) html = re.sub("<br */?>[ \t\n]*(<br */?>)+", "<p/>", html)
html = re.sub("<br */?>([^\n])", r"<br />\n\1", html) html = re.sub("<br */?>([^\n])", r"<br />\n\1", html)
soup = TextSoup(html) soup = TextSoup(html)
return str(soup) return soup.as_text(encoding, pre, fill)
if __name__ == "__main__": if __name__ == "__main__":
import sys import sys