Make soup2text convert numeric character codes (e.g., "&#39;") too.
- Legacy-Id: 306
This commit is contained in:
parent
94734fab6d
commit
7f512b4889
|
@ -13,7 +13,31 @@ pre_tags = ["pre"]
|
||||||
# Named character entities and the literal text each one stands for.
# The pairs are applied in list order with str.replace(), so "&amp;" must
# stay last: replacing it earlier would create new "&...;" sequences that
# the following pairs would then decode a second time.
entities = [("&lt;", "<"), ("&gt;", ">"),
            ("&quot;", '"'), ("&apos;", "'"),
            ("&nbsp;", " "),
            ("&amp;", "&"), ]  # ampersand last
|
||||||
|
|
||||||
|
def unescape(text):
    """Replace character references in *text* with literal characters.

    Two passes are made, in this order:

    1. Decimal numeric references of the form ``&#NNN;`` are replaced by
       ``chr(NNN)`` when ``NNN`` is below 256.  References with a larger
       value, and ``&#`` sequences that are not followed by digits and a
       semicolon, are left exactly as written.
    2. Every ``(entity, char)`` pair in the module-level ``entities`` list
       is applied with ``str.replace``, in list order.

    NOTE: because numeric codes are decoded before the ``entities`` pass,
    an ``&`` produced by the first pass can join the text after it and be
    replaced again by the second pass.

    Returns the converted string.
    """
    def _decode(match):
        num = int(match.group(1))
        # Only codes below 256 are converted; anything else is kept verbatim.
        return chr(num) if num < 256 else match.group(0)

    # Unescape character codes (if possible).  A single left-to-right
    # substitution never rescans replaced text, so a decoded "&" cannot
    # start a new numeric reference.
    text = re.sub(r"&#(\d+);", _decode, text)

    # Unescape character entities; the table lists the ampersand last.
    for entity, char in entities:
        text = text.replace(entity, char)
    return text
|
||||||
|
|
||||||
def para(words, pre):
|
def para(words, pre):
|
||||||
text = " ".join(words)
|
text = " ".join(words)
|
||||||
|
@ -23,8 +47,7 @@ def para(words, pre):
|
||||||
now = words[i-1]+" "+words[i]
|
now = words[i-1]+" "+words[i]
|
||||||
fix = words[i-1]+words[i]
|
fix = words[i-1]+words[i]
|
||||||
text = text.replace(now, fix)
|
text = text.replace(now, fix)
|
||||||
for entity, char in entities:
|
text = unescape(text)
|
||||||
text = text.replace(entity, char)
|
|
||||||
if not pre:
|
if not pre:
|
||||||
text = re.sub("[\r\n\t ]+", " ", text)
|
text = re.sub("[\r\n\t ]+", " ", text)
|
||||||
text = textwrap.fill(text)
|
text = textwrap.fill(text)
|
||||||
|
|
Loading…
Reference in a new issue