# Tag names that render() flags as block-level (it sets node.is_block from
# membership in this list).
block_tags = ["[document]", "html", "body", "div", "blockquote", "table",
              "tr", "p", "pre", "h1", "h2", "h3", "h4", "h5", "h6", ]
# NOTE(review): not referenced in the visible part of this file; presumably
# the tags whose content is left out of the text output -- confirm in render().
ignore_tags = ["head", "script", "style"]
# Tag names that render() flags as preformatted (it sets node.pre from
# membership in this list).
pre_tags = ["pre"]
# (entity, replacement) pairs that para() applies in list order.  "&amp;" must
# stay last so that input such as "&amp;lt;" is decoded exactly once (to
# "&lt;") instead of twice (to "<").
# NOTE(review): the entity names were reconstructed from their replacement
# characters after the literals had been HTML-decoded in transit; confirm the
# apostrophe entry ("&apos;" vs. "&#39;") against the upstream file.
entities = [("&lt;", "<"), ("&gt;", ">"),
            ("&quot;", '"'), ("&apos;", "'"),
            ("&nbsp;", " "),
            ("&amp;", "&"), ]


def para(words, pre):
    """Join *words* into one paragraph of plain text.

    The words are joined with single spaces and every entity listed in the
    module-level ``entities`` table is replaced by its literal character.

    words -- iterable of strings making up the paragraph.
    pre   -- if true, the text is returned with its whitespace untouched;
             if false, every run of CR/LF/TAB/space is collapsed to one
             space and the result is re-wrapped with textwrap.fill() at
             its default width.

    Returns the resulting string (empty when *words* is empty).
    """
    text = " ".join(words)
    for entity, char in entities:
        text = text.replace(entity, char)
    if not pre:
        # Normalise whitespace first so that line breaks inherited from the
        # HTML source do not survive into the re-filled paragraph.
        text = re.sub("[\r\n\t ]+", " ", text)
        text = textwrap.fill(text)
    return text
str.startswith("