diff --git a/ietf/tests.py b/ietf/tests.py index 8237b9c3b..51d37680f 100644 --- a/ietf/tests.py +++ b/ietf/tests.py @@ -178,7 +178,8 @@ class UrlTestCase(TestCase): print "OK cmp %s" % (url) else: contextlines = 0 - diff = "\n".join(unified_diff(goodtext, testtext, master, url, "", "", contextlines, lineterm="")) + difflist = list(unified_diff(goodtext, testtext, master, url, "", "", contextlines, lineterm="")) + diff = "\n".join(difflist) for chunk in self.diffchunks: #print "*** Checking for chunk:", chunk[:24] while re.search(chunk, diff): @@ -201,7 +202,9 @@ class UrlTestCase(TestCase): print "OK cmp %s" % (url) else: print "Diff: %s" % (url) - print diff + print "\n".join(difflist[:100]) + if len(difflist) > 100: + print "... (skipping %s lines of diff)" % (len(difflist)-100) else: print "OK cmp %s" % (url) diff --git a/ietf/utils/soup2text.py b/ietf/utils/soup2text.py index dc8c91b47..28e11d862 100755 --- a/ietf/utils/soup2text.py +++ b/ietf/utils/soup2text.py @@ -7,7 +7,8 @@ try: except: from BeautifulSoup import Tag, BeautifulSoup, NavigableString -block_tags = ["[document]", "html", "body", "div", "blockquote", "table", "tr", "p", "pre", "h1", "h2", "h3", "h4", "h5", "h6", ] +block_tags = ["[document]", "html", "body", "div", "blockquote", "table", "tr", "p", "pre", "h1", "h2", "h3", "h4", "h5", "h6", "li"] +space_tags = ["th", "td", "br"] ignore_tags = ["head", "script", "style"] pre_tags = ["pre"] entities = [("<", "<"), (">", ">"), @@ -86,7 +87,10 @@ def render(node, encoding='latin-1', pre=False): blocks.append(child.text+"\n\n") node.is_block = True else: - words.append(child.text) + if child.text: + if child.name in space_tags and not words[-1][-1] in [" ", "\t", "\n"]: + words.append(" ") + words.append(child.text) else: raise ValueError("Unexpected node type: '%s'" % child) if words: @@ -111,7 +115,6 @@ def soup2text(html): # some preprocessing to handle common pathological cases html = re.sub("
[ \t\n]*(
)+", "

", html) html = re.sub("
([^\n])", r"
\n\1", html) - html = re.sub("([^ \t\n])()", r"\1 \2", html) soup = TextSoup(html) return str(soup)