diff --git a/ietf/utils/soup2text.py b/ietf/utils/soup2text.py
index 9c6f9f33b..fe04349c9 100755
--- a/ietf/utils/soup2text.py
+++ b/ietf/utils/soup2text.py
@@ -7,10 +7,11 @@ try:
 except:
     from BeautifulSoup import Tag, BeautifulSoup, NavigableString
 
-block_tags = ["[document]", "html", "body", "div", "blockquote", "table", "tr", "p", "pre", "h1", "h2", "h3", "h4", "h5", "h6", "li"]
-space_tags = ["th", "td", "br"]
+block_tags = ["[document]", "html", "body", "div", "blockquote", "table", "tr", "p", "pre", "h1", "h2", "h3", "h4", "h5", "h6", "li", "option"]
+space_tags = ["th", "td"]
+break_tags = ["br"]
 ignore_tags = ["head", "script", "style"]
-pre_tags = ["pre"]
+pre_tags = ["pre", "option"]
 entities = [("&lt;", "<"), ("&gt;", ">"),
             ("&quot;", '"'), ("&apos;", "'"),
             ("&nbsp;", " "),
@@ -85,8 +86,11 @@ def render(node, encoding='latin-1', pre=False):
                         node.is_block = True
                     else:
                         words.append(child.text)
-                        if child.name in space_tags and not (words and words[-1] and words[-1][-1] in [" ", "\t", "\n"]):
-                            words.append(" ")
+                        if child.text[-1] not in [" ", "\t", "\n"]:
+                            if child.name in space_tags:
+                                words.append(" ")
+                            if child.name in break_tags:
+                                words.append("\n")
         else:
             raise ValueError("Unexpected node type: '%s'" % child)
     if words:
diff --git a/test/check_url b/test/check_url
new file mode 100755
index 000000000..f3f7b8333
--- /dev/null
+++ b/test/check_url
@@ -0,0 +1,54 @@
+#!/usr/bin/env python
+
+# For each test URL given on the command line, fetch both URLs recorded for it
+# and print a unified diff of their html2text renderings.
+
+import sys
+import os
+
+# Warning: The following code assumes that this file is located in the svn
+# checkout directory, and hasn't been moved:
+ietfpath = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(ietfpath)
+
+os.environ["DJANGO_SETTINGS_MODULE"] = "ietf.settings"
+
+from ietf.utils.soup2text import soup2text as html2text
+from difflib import unified_diff
+import urllib2 as urllib
+from ietf.tests import read_testurls
+
+django_server = os.environ.get("DJANGO_SERVER", "http://merlot.tools.ietf.org:31415")
+# str.rstrip() returns a new string, so the result has to be kept.
+django_server = django_server.rstrip("/")
+
+# Collect the test tuples from every testurl(s).list below the checkout.
+testtuples = []
+for root, dirs, files in os.walk(ietfpath):
+    if "testurl.list" in files:
+        testtuples += read_testurls(root+"/testurl.list")
+    if "testurls.list" in files:
+        testtuples += read_testurls(root+"/testurls.list")
+# Index the tuples by their second field, which is what sys.argv is matched on.
+testurls = dict([ (entry[1], entry) for entry in testtuples ])
+
+def fetch(url):
+    """Return the body of the resource at url."""
+    response = urllib.urlopen(url)
+    try:
+        return response.read()
+    finally:
+        # Close the connection even when read() raises.
+        response.close()
+
+for url in sys.argv[1:]:
+    entry = testurls[url]
+    # Only entries with a third field carry a second URL to compare against.
+    if len(entry) > 2:
+        url1 = entry[2]
+        url2 = django_server + entry[1]
+        print "Fetching %s ..." % url1
+        text1 = html2text(fetch(url1)).split("\n")
+        print "Fetching %s ..." % url2
+        text2 = html2text(fetch(url2)).split("\n")
+        print "\n".join(unified_diff(text1, text2, url1, url2, "", "", 3, lineterm=""))