Only print the first 100 lines of a long diff. New soup2text code for spacing associated with certain tags.

- Legacy-Id: 337
This commit is contained in:
Henrik Levkowetz 2007-06-12 17:52:07 +00:00
parent 9b4b6c5297
commit dd37257c0c
2 changed files with 11 additions and 5 deletions

View file

@ -178,7 +178,8 @@ class UrlTestCase(TestCase):
print "OK cmp %s" % (url)
else:
contextlines = 0
diff = "\n".join(unified_diff(goodtext, testtext, master, url, "", "", contextlines, lineterm=""))
difflist = list(unified_diff(goodtext, testtext, master, url, "", "", contextlines, lineterm=""))
diff = "\n".join(difflist)
for chunk in self.diffchunks:
#print "*** Checking for chunk:", chunk[:24]
while re.search(chunk, diff):
@ -201,7 +202,9 @@ class UrlTestCase(TestCase):
print "OK cmp %s" % (url)
else:
print "Diff: %s" % (url)
print diff
print "\n".join(difflist[:100])
if len(difflist) > 100:
print "... (skipping %s lines of diff)" % (len(difflist)-100)
else:
print "OK cmp %s" % (url)

View file

@ -7,7 +7,8 @@ try:
except:
from BeautifulSoup import Tag, BeautifulSoup, NavigableString
block_tags = ["[document]", "html", "body", "div", "blockquote", "table", "tr", "p", "pre", "h1", "h2", "h3", "h4", "h5", "h6", ]
block_tags = ["[document]", "html", "body", "div", "blockquote", "table", "tr", "p", "pre", "h1", "h2", "h3", "h4", "h5", "h6", "li"]
space_tags = ["th", "td", "br"]
ignore_tags = ["head", "script", "style"]
pre_tags = ["pre"]
entities = [("&lt;", "<"), ("&gt;", ">"),
@ -86,7 +87,10 @@ def render(node, encoding='latin-1', pre=False):
blocks.append(child.text+"\n\n")
node.is_block = True
else:
words.append(child.text)
if child.text:
if child.name in space_tags and not words[-1][-1] in [" ", "\t", "\n"]:
words.append(" ")
words.append(child.text)
else:
raise ValueError("Unexpected node type: '%s'" % child)
if words:
@ -111,7 +115,6 @@ def soup2text(html):
# some preprocessing to handle common pathological cases
html = re.sub("<br */?>[ \t\n]*(<br */?>)+", "<p/>", html)
html = re.sub("<br */?>([^\n])", r"<br />\n\1", html)
html = re.sub("([^ \t\n])(</t[hd].*?>)", r"\1 \2", html)
soup = TextSoup(html)
return str(soup)