Added the ability to pass the fill and pre (preformatted) switches to the soup2text command
- Legacy-Id: 403
This commit is contained in:
parent
d8866a4495
commit
de9a7ddbc4
|
@ -41,14 +41,15 @@ def unescape(text):
|
||||||
text = text.replace(entity, char) # replace ampersand last
|
text = text.replace(entity, char) # replace ampersand last
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def para(words, pre):
|
def para(words, pre, fill):
|
||||||
text = "".join(words)
|
text = "".join(words)
|
||||||
text = unescape(text)
|
text = unescape(text)
|
||||||
if not pre:
|
if not pre:
|
||||||
text = text.strip("\n")
|
text = text.strip("\n")
|
||||||
text = text.lstrip()
|
text = text.lstrip()
|
||||||
text = re.sub("[\t\n ]+", " ", text)
|
text = re.sub("[\t\n ]+", " ", text)
|
||||||
text = textwrap.fill(text)
|
if fill:
|
||||||
|
text = textwrap.fill(text)
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def normalize(str):
|
def normalize(str):
|
||||||
|
@ -60,7 +61,7 @@ def normalize(str):
|
||||||
str = re.sub("<\?[^>]*\?>", "", str)
|
str = re.sub("<\?[^>]*\?>", "", str)
|
||||||
return str
|
return str
|
||||||
|
|
||||||
def render(node, encoding='latin-1', pre=False):
|
def render(node, encoding='latin-1', pre=False, fill=True, clean=True):
|
||||||
blocks = []
|
blocks = []
|
||||||
words = []
|
words = []
|
||||||
node.pre = pre or node.name in pre_tags
|
node.pre = pre or node.name in pre_tags
|
||||||
|
@ -76,11 +77,11 @@ def render(node, encoding='latin-1', pre=False):
|
||||||
if child.name in ignore_tags:
|
if child.name in ignore_tags:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
child = render(child, encoding, node.pre)
|
child = render(child, encoding, node.pre, fill, clean)
|
||||||
if child.text:
|
if child.text:
|
||||||
if child.is_block:
|
if child.is_block:
|
||||||
if words :
|
if words :
|
||||||
blocks.append(para(words, node.pre)+"\n")
|
blocks.append(para(words, node.pre, fill)+"\n")
|
||||||
words = []
|
words = []
|
||||||
blocks.append(child.text+"\n\n")
|
blocks.append(child.text+"\n\n")
|
||||||
node.is_block = True
|
node.is_block = True
|
||||||
|
@ -94,22 +95,31 @@ def render(node, encoding='latin-1', pre=False):
|
||||||
else:
|
else:
|
||||||
raise ValueError("Unexpected node type: '%s'" % child)
|
raise ValueError("Unexpected node type: '%s'" % child)
|
||||||
if words:
|
if words:
|
||||||
blocks.append(para(words, node.pre))
|
blocks.append(para(words, node.pre, fill))
|
||||||
|
|
||||||
node.text = ''.join(blocks)
|
node.text = ''.join(blocks)
|
||||||
return node
|
return node
|
||||||
|
|
||||||
class TextSoup(BeautifulSoup):
|
class TextSoup(BeautifulSoup):
|
||||||
|
|
||||||
|
def as_text(self, encoding='latin-1', pre=False, fill=True, clean=True):
|
||||||
|
node = render(self, encoding, pre, fill, clean)
|
||||||
|
str = node.text
|
||||||
|
if clean:
|
||||||
|
str = re.sub("[ \t]+", " ", str)
|
||||||
|
str = re.sub("\n\n+", "\n\n", str)
|
||||||
|
return str
|
||||||
|
|
||||||
|
|
||||||
def __str__(self, encoding='latin-1',
|
def __str__(self, encoding='latin-1',
|
||||||
prettyPrint=False, indentLevel=0):
|
prettyPrint=False, indentLevel=0):
|
||||||
node = render(self, encoding)
|
node = render(self, encoding, fill=False)
|
||||||
str = node.text
|
str = node.text
|
||||||
str = re.sub("[ \t]+", " ", str)
|
str = re.sub("[ \t]+", " ", str)
|
||||||
str = re.sub("\n\n+", "\n\n", str)
|
str = re.sub("\n\n+", "\n\n", str)
|
||||||
return str
|
return str
|
||||||
|
|
||||||
def soup2text(html):
|
def soup2text(html, encoding='latin-1', pre=False, fill=True):
|
||||||
# Line ending normalization
|
# Line ending normalization
|
||||||
html = html.replace("\r\n", "\n").replace("\r", "\n")
|
html = html.replace("\r\n", "\n").replace("\r", "\n")
|
||||||
# remove comments
|
# remove comments
|
||||||
|
@ -118,7 +128,7 @@ def soup2text(html):
|
||||||
html = re.sub("<br */?>[ \t\n]*(<br */?>)+", "<p/>", html)
|
html = re.sub("<br */?>[ \t\n]*(<br */?>)+", "<p/>", html)
|
||||||
html = re.sub("<br */?>([^\n])", r"<br />\n\1", html)
|
html = re.sub("<br */?>([^\n])", r"<br />\n\1", html)
|
||||||
soup = TextSoup(html)
|
soup = TextSoup(html)
|
||||||
return str(soup)
|
return soup.as_text(encoding, pre, fill)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
import sys
|
import sys
|
||||||
|
|
Loading…
Reference in a new issue