145 lines
4.7 KiB
Python
Executable file
145 lines
4.7 KiB
Python
Executable file
#!/usr/bin/env python
|
|
# Copyright The IETF Trust 2007, All Rights Reserved
|
|
|
|
import re
|
|
import textwrap
|
|
try:
|
|
from ietf.contrib.BeautifulSoup import Tag, BeautifulSoup, NavigableString
|
|
except:
|
|
from BeautifulSoup import Tag, BeautifulSoup, NavigableString
|
|
|
|
block_tags = ["[document]", "html", "body", "div", "blockquote", "table", "tr", "p", "pre", "h1", "h2", "h3", "h4", "h5", "h6", "li", "option"]
|
|
space_tags = ["th", "td"]
|
|
break_tags = ["br"]
|
|
ignore_tags = ["head", "script", "style"]
|
|
pre_tags = ["pre", "option"]
|
|
entities = [("<", "<"), (">", ">"),
|
|
(""", '"'), ("'", "'"),
|
|
(" ", " "),
|
|
("&", "&"), ] # ampersand last
|
|
|
|
def unescape(text):
|
|
# Unescape character codes (if possible)
|
|
start = 0
|
|
while True:
|
|
try:
|
|
pos = text.index("&#", start)
|
|
except ValueError:
|
|
break
|
|
match = re.match("&#\d+;", text[pos:])
|
|
if match:
|
|
str = match.group()
|
|
num = int(str[2:-1])
|
|
if num < 256:
|
|
text = text[:pos] + chr(num) + text[pos+len(str):]
|
|
start = pos + 1
|
|
else:
|
|
start = pos + len(str)
|
|
else:
|
|
start = pos + 2
|
|
# unescape character entities
|
|
for entity, char in entities:
|
|
text = text.replace(entity, char) # replace ampersand last
|
|
return text
|
|
|
|
def para(words, pre, fill):
|
|
text = "".join(words)
|
|
text = unescape(text)
|
|
if not pre:
|
|
text = text.strip("\n")
|
|
text = text.lstrip()
|
|
text = re.sub("[\t\n ]+", " ", text)
|
|
if fill:
|
|
text = textwrap.fill(text)
|
|
return text
|
|
|
|
def normalize(str):
|
|
# Normalize whitespace at the beginning and end of the string
|
|
str = re.sub("^[ \t]+", " ", str)
|
|
str = re.sub("[ \t]+$", " ", str)
|
|
# remove xml PIs and metainformation
|
|
str = re.sub("<![^>]*>", "", str)
|
|
str = re.sub("<\?[^>]*\?>", "", str)
|
|
return str
|
|
|
|
def render(node, encoding='latin-1', pre=False, fill=True, clean=True):
|
|
blocks = []
|
|
words = []
|
|
node.pre = pre or node.name in pre_tags
|
|
node.is_block = node.name in block_tags
|
|
for child in node:
|
|
if isinstance(child, NavigableString):
|
|
str = child.__str__(encoding)
|
|
if str and not node.pre:
|
|
str = normalize(str)
|
|
if str:
|
|
words.append(str)
|
|
elif isinstance(child, Tag):
|
|
if child.name in ignore_tags:
|
|
pass
|
|
else:
|
|
child = render(child, encoding, node.pre, fill, clean)
|
|
if child.text:
|
|
if child.is_block:
|
|
if words :
|
|
blocks.append(para(words, node.pre, fill)+"\n")
|
|
words = []
|
|
blocks.append(child.text+"\n\n")
|
|
node.is_block = True
|
|
else:
|
|
words.append(child.text)
|
|
if child.text[-1] not in [" ", "\t", "\n"]:
|
|
if child.name in space_tags:
|
|
words.append(" ")
|
|
if child.name in break_tags:
|
|
words.append("\n")
|
|
else:
|
|
raise ValueError("Unexpected node type: '%s'" % child)
|
|
if words:
|
|
blocks.append(para(words, node.pre, fill))
|
|
|
|
node.text = ''.join(blocks)
|
|
return node
|
|
|
|
class TextSoup(BeautifulSoup):
|
|
|
|
def as_text(self, encoding='latin-1', pre=False, fill=True, clean=True):
|
|
node = render(self, encoding, pre, fill, clean)
|
|
str = node.text
|
|
if clean:
|
|
str = re.sub("[ \t]+", " ", str)
|
|
str = re.sub("\n\n+", "\n\n", str)
|
|
return str
|
|
|
|
|
|
def __str__(self, encoding='latin-1',
|
|
prettyPrint=False, indentLevel=0):
|
|
node = render(self, encoding, fill=False)
|
|
str = node.text
|
|
str = re.sub("[ \t]+", " ", str)
|
|
str = re.sub("\n\n+", "\n\n", str)
|
|
return str
|
|
|
|
def soup2text(html, encoding='latin-1', pre=False, fill=True):
|
|
# Line ending normalization
|
|
html = html.replace("\r\n", "\n").replace("\r", "\n")
|
|
# remove comments
|
|
html = re.sub("(?s)<!--.*?-->", "", html)
|
|
# some preprocessing to handle common pathological cases
|
|
html = re.sub("<br */?>[ \t\n]*(<br */?>)+", "<p/>", html)
|
|
html = re.sub("<br */?>([^\n])", r"<br />\n\1", html)
|
|
soup = TextSoup(html)
|
|
return soup.as_text(encoding, pre, fill)
|
|
|
|
if __name__ == "__main__":
|
|
import sys
|
|
import urllib2 as urllib
|
|
for arg in sys.argv[1:]:
|
|
if arg[:6] in ["http:/", "https:", "ftp://"]:
|
|
file = urllib.urlopen(arg)
|
|
else:
|
|
file = open(arg)
|
|
html = file.read()
|
|
file.close()
|
|
print soup2text(html)
|