Fix occasional bad sentence end merges in ietf/utils/soup2text.py.
Remove some now unneeded exceptions from ietf/testurl.list - Legacy-Id: 302
This commit is contained in:
parent
ac288c2d09
commit
9b78963547
|
@ -3,14 +3,9 @@
|
|||
200,404 /accounts/
|
||||
200,302 /accounts/password_change/
|
||||
200,302 /accounts/profile/
|
||||
200,404 /idtracker/status/
|
||||
200,404 /idtracker/last_call/
|
||||
skip /my/
|
||||
skip /idindex/
|
||||
skip /idindex/showdocs/all/date/
|
||||
skip /idindex/showdocs/all/name/
|
||||
200,404 /liaisons/
|
||||
200,404 /liaisons/managers/
|
||||
200,404 /mailinglists/area_lists/
|
||||
200,404 /mailinglists/nonwg_lists/
|
||||
|
||||
|
||||
|
|
|
@ -2,7 +2,10 @@
|
|||
|
||||
import re
|
||||
import textwrap
|
||||
try:
|
||||
from ietf.contrib.BeautifulSoup import Tag, BeautifulSoup, NavigableString
|
||||
except:
|
||||
from BeautifulSoup import Tag, BeautifulSoup, NavigableString
|
||||
|
||||
block_tags = ["[document]", "html", "body", "div", "blockquote", "table", "tr", "p", "pre", "h1", "h2", "h3", "h4", "h5", "h6", ]
|
||||
ignore_tags = ["head", "script", "style"]
|
||||
|
@ -14,6 +17,12 @@ entities = [("&lt;", "<"), ("&gt;", ">"),
|
|||
|
||||
def para(words, pre):
|
||||
text = " ".join(words)
|
||||
# Fix occasional bad sentence end merges
|
||||
for i in range(1,len(words)):
|
||||
if words[i].startswith(". "):
|
||||
now = words[i-1]+" "+words[i]
|
||||
fix = words[i-1]+words[i]
|
||||
text = text.replace(now, fix)
|
||||
for entity, char in entities:
|
||||
text = text.replace(entity, char)
|
||||
if not pre:
|
||||
|
@ -80,6 +89,5 @@ if __name__ == "__main__":
|
|||
else:
|
||||
file = open(arg)
|
||||
html = file.read()
|
||||
file.close
|
||||
soup = TextSoup(html)
|
||||
print str(soup)
|
||||
file.close()
|
||||
print soup2text(html)
|
||||
|
|
Loading…
Reference in a new issue