From a7a6d956af93a55af86e1461e94dbb4122fff6ca Mon Sep 17 00:00:00 2001
From: Henrik Levkowetz
Date: Mon, 11 Jun 2007 03:36:08 +0000
Subject: [PATCH] Adding a fix in soup2text for a common pathological case: <br><br> used instead of <p>
to indicate paragraph breaks. This changes the failed diff for /iesg/telechat/detail/354/ to show only three differences, where two are whitespace differences and one shows a difference between '@ietf.org. The' and '@ietf.org . The' and is an artifact of the text extraction. Will look at fixing that next.
 - Legacy-Id: 300
---
 ietf/utils/soup2text.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ietf/utils/soup2text.py b/ietf/utils/soup2text.py
index bae2bd321..c86484077 100755
--- a/ietf/utils/soup2text.py
+++ b/ietf/utils/soup2text.py
@@ -66,6 +66,8 @@ class TextSoup(BeautifulSoup):
         return str
 
 def soup2text(html):
+    # some preprocessing to handle common pathological cases
+    html = re.sub("<br>[ \t\r\n]*(<br>)+", "<p/>", html)
     soup = TextSoup(html)
     return str(soup)
 