From 9b7896354715f01cc32b9d9bf2a403fded1a0e20 Mon Sep 17 00:00:00 2001 From: Henrik Levkowetz Date: Mon, 11 Jun 2007 04:22:29 +0000 Subject: [PATCH] Fix occasional bad sentence end merges in ietf/utils/soup2text.py. Remove some now unneeded exceptions from ietf/testurl.list - Legacy-Id: 302 --- ietf/testurl.list | 7 +------ ietf/utils/soup2text.py | 18 +++++++++++++----- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/ietf/testurl.list b/ietf/testurl.list index 037b640de..2dd246d0c 100644 --- a/ietf/testurl.list +++ b/ietf/testurl.list @@ -3,14 +3,9 @@ 200,404 /accounts/ 200,302 /accounts/password_change/ 200,302 /accounts/profile/ -200,404 /idtracker/status/ -200,404 /idtracker/last_call/ skip /my/ skip /idindex/ skip /idindex/showdocs/all/date/ skip /idindex/showdocs/all/name/ -200,404 /liaisons/ -200,404 /liaisons/managers/ -200,404 /mailinglists/area_lists/ -200,404 /mailinglists/nonwg_lists/ + diff --git a/ietf/utils/soup2text.py b/ietf/utils/soup2text.py index c86484077..7a4bef05d 100755 --- a/ietf/utils/soup2text.py +++ b/ietf/utils/soup2text.py @@ -2,7 +2,10 @@ import re import textwrap -from ietf.contrib.BeautifulSoup import Tag, BeautifulSoup, NavigableString +try: + from ietf.contrib.BeautifulSoup import Tag, BeautifulSoup, NavigableString +except: + from BeautifulSoup import Tag, BeautifulSoup, NavigableString block_tags = ["[document]", "html", "body", "div", "blockquote", "table", "tr", "p", "pre", "h1", "h2", "h3", "h4", "h5", "h6", ] ignore_tags = ["head", "script", "style"] @@ -12,8 +15,14 @@ entities = [("<", "<"), (">", ">"), (" ", " "), ("&", "&"), ] -def para(words, pre): +def para(words, pre): text = " ".join(words) + # Fix occasional bad sentence end merges + for i in range(1,len(words)): + if words[i].startswith(". 
"): + now = words[i-1]+" "+words[i] + fix = words[i-1]+words[i] + text = text.replace(now, fix) for entity, char in entities: text = text.replace(entity, char) if not pre: @@ -80,6 +89,5 @@ if __name__ == "__main__": else: file = open(arg) html = file.read() - file.close - soup = TextSoup(html) - print str(soup) + file.close() + print soup2text(html)