fix: in htmlization ignore html files that do not parse as html (#4850)

This commit is contained in:
Robert Sparks 2022-12-08 13:55:15 -06:00 committed by GitHub
parent efb9f135c3
commit 44c38abbbb
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23

View file

@@ -571,7 +571,10 @@ class DocumentInfo(models.Model):
return None
# get body
-body = etree.HTML(html).xpath("//body")[0]
+etree_html = etree.HTML(html)
+if etree_html is None:
+return None
+body = etree_html.xpath("//body")[0]
body.tag = "div"
if classes:
body.attrib["class"] = classes