Fix occasional bad sentence end merges in ietf/utils/soup2text.py.

Remove some now unneeded exceptions from ietf/testurl.list
 - Legacy-Id: 302
This commit is contained in:
Henrik Levkowetz 2007-06-11 04:22:29 +00:00
parent ac288c2d09
commit 9b78963547
2 changed files with 14 additions and 11 deletions

View file

@ -3,14 +3,9 @@
200,404 /accounts/
200,302 /accounts/password_change/
200,302 /accounts/profile/
200,404 /idtracker/status/
200,404 /idtracker/last_call/
skip /my/
skip /idindex/
skip /idindex/showdocs/all/date/
skip /idindex/showdocs/all/name/
200,404 /liaisons/
200,404 /liaisons/managers/
200,404 /mailinglists/area_lists/
200,404 /mailinglists/nonwg_lists/

View file

@ -2,7 +2,10 @@
import re
import textwrap
try:
from ietf.contrib.BeautifulSoup import Tag, BeautifulSoup, NavigableString
except:
from BeautifulSoup import Tag, BeautifulSoup, NavigableString
block_tags = ["[document]", "html", "body", "div", "blockquote", "table", "tr", "p", "pre", "h1", "h2", "h3", "h4", "h5", "h6", ]
ignore_tags = ["head", "script", "style"]
@ -14,6 +17,12 @@ entities = [("&lt;", "<"), ("&gt;", ">"),
def para(words, pre):
text = " ".join(words)
# Fix occasional bad sentence end merges
for i in range(1,len(words)):
if words[i].startswith(". "):
now = words[i-1]+" "+words[i]
fix = words[i-1]+words[i]
text = text.replace(now, fix)
for entity, char in entities:
text = text.replace(entity, char)
if not pre:
@ -80,6 +89,5 @@ if __name__ == "__main__":
else:
file = open(arg)
html = file.read()
file.close
soup = TextSoup(html)
print str(soup)
file.close()
print soup2text(html)