From de9a7ddbc45d20567b48da1d29af6a7bd38cf94c Mon Sep 17 00:00:00 2001 From: Henrik Levkowetz Date: Fri, 15 Jun 2007 13:28:12 +0000 Subject: [PATCH] Added the ability to give fill and pre(formatted) switches to the soup2text command - Legacy-Id: 403 --- ietf/utils/soup2text.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/ietf/utils/soup2text.py b/ietf/utils/soup2text.py index fe04349c9..311ee3bb2 100755 --- a/ietf/utils/soup2text.py +++ b/ietf/utils/soup2text.py @@ -41,14 +41,15 @@ def unescape(text): text = text.replace(entity, char) # replace ampersand last return text -def para(words, pre): +def para(words, pre, fill): text = "".join(words) text = unescape(text) if not pre: text = text.strip("\n") text = text.lstrip() text = re.sub("[\t\n ]+", " ", text) - text = textwrap.fill(text) + if fill: + text = textwrap.fill(text) return text def normalize(str): @@ -60,7 +61,7 @@ def normalize(str): str = re.sub("<\?[^>]*\?>", "", str) return str -def render(node, encoding='latin-1', pre=False): +def render(node, encoding='latin-1', pre=False, fill=True, clean=True): blocks = [] words = [] node.pre = pre or node.name in pre_tags @@ -76,11 +77,11 @@ def render(node, encoding='latin-1', pre=False): if child.name in ignore_tags: pass else: - child = render(child, encoding, node.pre) + child = render(child, encoding, node.pre, fill, clean) if child.text: if child.is_block: if words : - blocks.append(para(words, node.pre)+"\n") + blocks.append(para(words, node.pre, fill)+"\n") words = [] blocks.append(child.text+"\n\n") node.is_block = True @@ -94,22 +95,31 @@ def render(node, encoding='latin-1', pre=False): else: raise ValueError("Unexpected node type: '%s'" % child) if words: - blocks.append(para(words, node.pre)) + blocks.append(para(words, node.pre, fill)) node.text = ''.join(blocks) return node class TextSoup(BeautifulSoup): + def as_text(self, encoding='latin-1', pre=False, fill=True, clean=True): + node = render(self, encoding, pre, fill, clean) + str = node.text + if clean: + str = re.sub("[ \t]+", " ", str) + str = re.sub("\n\n+", "\n\n", str) + return str + + def __str__(self, encoding='latin-1', prettyPrint=False, indentLevel=0): - node = render(self, encoding) + node = render(self, encoding, fill=False) str = node.text str = re.sub("[ \t]+", " ", str) str = re.sub("\n\n+", "\n\n", str) return str -def soup2text(html): +def soup2text(html, encoding='latin-1', pre=False, fill=True): # Line ending normalization html = html.replace("\r\n", "\n").replace("\r", "\n") # remove comments @@ -118,7 +128,7 @@ def soup2text(html): html = re.sub("
[ \t\n]*(
)+", "

", html) html = re.sub("
([^\n])", r"
\n\1", html) soup = TextSoup(html) - return str(soup) + return soup.as_text(encoding, pre, fill) if __name__ == "__main__": import sys