diff --git a/ietf/testurl.list b/ietf/testurl.list index 037b640de..2dd246d0c 100644 --- a/ietf/testurl.list +++ b/ietf/testurl.list @@ -3,14 +3,9 @@ 200,404 /accounts/ 200,302 /accounts/password_change/ 200,302 /accounts/profile/ -200,404 /idtracker/status/ -200,404 /idtracker/last_call/ skip /my/ skip /idindex/ skip /idindex/showdocs/all/date/ skip /idindex/showdocs/all/name/ -200,404 /liaisons/ -200,404 /liaisons/managers/ -200,404 /mailinglists/area_lists/ -200,404 /mailinglists/nonwg_lists/ + diff --git a/ietf/utils/soup2text.py b/ietf/utils/soup2text.py index c86484077..7a4bef05d 100755 --- a/ietf/utils/soup2text.py +++ b/ietf/utils/soup2text.py @@ -2,7 +2,10 @@ import re import textwrap -from ietf.contrib.BeautifulSoup import Tag, BeautifulSoup, NavigableString +try: + from ietf.contrib.BeautifulSoup import Tag, BeautifulSoup, NavigableString +except: + from BeautifulSoup import Tag, BeautifulSoup, NavigableString block_tags = ["[document]", "html", "body", "div", "blockquote", "table", "tr", "p", "pre", "h1", "h2", "h3", "h4", "h5", "h6", ] ignore_tags = ["head", "script", "style"] @@ -12,8 +15,14 @@ entities = [("<", "<"), (">", ">"), (" ", " "), ("&", "&"), ] -def para(words, pre): +def para(words, pre): text = " ".join(words) + # Fix occasional bad sentence end merges + for i in range(1,len(words)): + if words[i].startswith(". "): + now = words[i-1]+" "+words[i] + fix = words[i-1]+words[i] + text = text.replace(now, fix) for entity, char in entities: text = text.replace(entity, char) if not pre: @@ -80,6 +89,5 @@ if __name__ == "__main__": else: file = open(arg) html = file.read() - file.close - soup = TextSoup(html) - print str(soup) + file.close() + print soup2text(html)