1170 lines
35 KiB
Python
1170 lines
35 KiB
Python
import string, gettext
|
|
# Short alias for gettext.gettext, following the usual gettext convention.
# The message strings in the E table below are passed through it.
_ = gettext.gettext
|
|
|
|
# Probe for the builtin ``frozenset``.  Merely evaluating the name raises
# NameError on interpreters that lack it, in which case the equivalents from
# the ``sets`` module are bound to the builtin names instead.  This has to
# run before any ``frozenset(...)`` call further down the module.
try:
    frozenset
except NameError:
    # Import from the sets module for python 2.3
    from sets import Set as set
    from sets import ImmutableSet as frozenset
|
|
|
|
# Named end-of-file marker; its value is simply None.
# NOTE(review): presumably what the input stream hands back once input is
# exhausted -- the consumers are not visible in this file, confirm.
EOF = None
|
|
|
|
# Parse-error message table: maps an error code (str) to its human-readable,
# gettext-wrapped message.  Several messages contain %(name)s-style
# placeholders and are meant to be filled in with ``message % mapping``.
#
# Fixes relative to the previous revision:
#   * "XXX-undefined-error" is now passed through _() like every other entry.
#   * Spelling fixed in "invalid-character-in-attribute-name" ("chracter")
#     and "unexpected-char-in-frameset" ("Unepxected").
#   * Missing ")" restored in "unexpected-start-tag-in-select".
# The changed strings are gettext msgids, so any existing translation
# catalogs need the same three corrections.
E = {
    "null-character":
        _(u"Null character in input stream, replaced with U+FFFD."),
    "invalid-character":
        _(u"Invalid codepoint in stream."),
    "incorrectly-placed-solidus":
        _(u"Solidus (/) incorrectly placed in tag."),
    "incorrect-cr-newline-entity":
        _(u"Incorrect CR newline entity, replaced with LF."),
    "illegal-windows-1252-entity":
        _(u"Entity used with illegal number (windows-1252 reference)."),
    "cant-convert-numeric-entity":
        _(u"Numeric entity couldn't be converted to character "
          u"(codepoint U+%(charAsInt)08x)."),
    "illegal-codepoint-for-numeric-entity":
        _(u"Numeric entity represents an illegal codepoint: "
          u"U+%(charAsInt)08x."),
    "numeric-entity-without-semicolon":
        _(u"Numeric entity didn't end with ';'."),
    "expected-numeric-entity-but-got-eof":
        _(u"Numeric entity expected. Got end of file instead."),
    "expected-numeric-entity":
        _(u"Numeric entity expected but none found."),
    "named-entity-without-semicolon":
        _(u"Named entity didn't end with ';'."),
    "expected-named-entity":
        _(u"Named entity expected. Got none."),
    "attributes-in-end-tag":
        _(u"End tag contains unexpected attributes."),
    "expected-tag-name-but-got-right-bracket":
        _(u"Expected tag name. Got '>' instead."),
    "expected-tag-name-but-got-question-mark":
        _(u"Expected tag name. Got '?' instead. (HTML doesn't "
          u"support processing instructions.)"),
    "expected-tag-name":
        _(u"Expected tag name. Got something else instead"),
    "expected-closing-tag-but-got-right-bracket":
        _(u"Expected closing tag. Got '>' instead. Ignoring '</>'."),
    "expected-closing-tag-but-got-eof":
        _(u"Expected closing tag. Unexpected end of file."),
    "expected-closing-tag-but-got-char":
        _(u"Expected closing tag. Unexpected character '%(data)s' found."),
    "eof-in-tag-name":
        _(u"Unexpected end of file in the tag name."),
    "expected-attribute-name-but-got-eof":
        _(u"Unexpected end of file. Expected attribute name instead."),
    "eof-in-attribute-name":
        _(u"Unexpected end of file in attribute name."),
    "invalid-character-in-attribute-name":
        _(u"Invalid character in attribute name"),
    "duplicate-attribute":
        _(u"Dropped duplicate attribute on tag."),
    "expected-end-of-tag-name-but-got-eof":
        _(u"Unexpected end of file. Expected = or end of tag."),
    "expected-attribute-value-but-got-eof":
        _(u"Unexpected end of file. Expected attribute value."),
    "expected-attribute-value-but-got-right-bracket":
        _(u"Expected attribute value. Got '>' instead."),
    "eof-in-attribute-value-double-quote":
        _(u"Unexpected end of file in attribute value (\")."),
    "eof-in-attribute-value-single-quote":
        _(u"Unexpected end of file in attribute value (')."),
    "eof-in-attribute-value-no-quotes":
        _(u"Unexpected end of file in attribute value."),
    "unexpected-EOF-after-solidus-in-tag":
        _(u"Unexpected end of file in tag. Expected >"),
    # The key below misspells "solidus"; it is kept verbatim because code
    # elsewhere looks the message up by this exact string.
    "unexpected-character-after-soldius-in-tag":
        _(u"Unexpected character after / in tag. Expected >"),
    "expected-dashes-or-doctype":
        _(u"Expected '--' or 'DOCTYPE'. Not found."),
    "incorrect-comment":
        _(u"Incorrect comment."),
    "eof-in-comment":
        _(u"Unexpected end of file in comment."),
    "eof-in-comment-end-dash":
        _(u"Unexpected end of file in comment (-)"),
    "unexpected-dash-after-double-dash-in-comment":
        _(u"Unexpected '-' after '--' found in comment."),
    "eof-in-comment-double-dash":
        _(u"Unexpected end of file in comment (--)."),
    "unexpected-char-in-comment":
        _(u"Unexpected character in comment found."),
    "need-space-after-doctype":
        _(u"No space after literal string 'DOCTYPE'."),
    "expected-doctype-name-but-got-right-bracket":
        _(u"Unexpected > character. Expected DOCTYPE name."),
    "expected-doctype-name-but-got-eof":
        _(u"Unexpected end of file. Expected DOCTYPE name."),
    "eof-in-doctype-name":
        _(u"Unexpected end of file in DOCTYPE name."),
    "eof-in-doctype":
        _(u"Unexpected end of file in DOCTYPE."),
    "expected-space-or-right-bracket-in-doctype":
        _(u"Expected space or '>'. Got '%(data)s'"),
    "unexpected-end-of-doctype":
        _(u"Unexpected end of DOCTYPE."),
    "unexpected-char-in-doctype":
        _(u"Unexpected character in DOCTYPE."),
    "eof-in-innerhtml":
        _(u"XXX innerHTML EOF"),
    "unexpected-doctype":
        _(u"Unexpected DOCTYPE. Ignored."),
    "non-html-root":
        _(u"html needs to be the first start tag."),
    "expected-doctype-but-got-eof":
        _(u"Unexpected End of file. Expected DOCTYPE."),
    "unknown-doctype":
        _(u"Erroneous DOCTYPE."),
    "expected-doctype-but-got-chars":
        _(u"Unexpected non-space characters. Expected DOCTYPE."),
    "expected-doctype-but-got-start-tag":
        _(u"Unexpected start tag (%(name)s). Expected DOCTYPE."),
    "expected-doctype-but-got-end-tag":
        _(u"Unexpected end tag (%(name)s). Expected DOCTYPE."),
    "end-tag-after-implied-root":
        _(u"Unexpected end tag (%(name)s) after the (implied) root element."),
    "expected-named-closing-tag-but-got-eof":
        _(u"Unexpected end of file. Expected end tag (%(name)s)."),
    "two-heads-are-not-better-than-one":
        _(u"Unexpected start tag head in existing head. Ignored."),
    "unexpected-end-tag":
        _(u"Unexpected end tag (%(name)s). Ignored."),
    "unexpected-start-tag-out-of-my-head":
        _(u"Unexpected start tag (%(name)s) that can be in head. Moved."),
    "unexpected-start-tag":
        _(u"Unexpected start tag (%(name)s)."),
    "missing-end-tag":
        _(u"Missing end tag (%(name)s)."),
    "missing-end-tags":
        _(u"Missing end tags (%(name)s)."),
    "unexpected-start-tag-implies-end-tag":
        _(u"Unexpected start tag (%(startName)s) "
          u"implies end tag (%(endName)s)."),
    "unexpected-start-tag-treated-as":
        _(u"Unexpected start tag (%(originalName)s). Treated as %(newName)s."),
    "deprecated-tag":
        _(u"Unexpected start tag %(name)s. Don't use it!"),
    "unexpected-start-tag-ignored":
        _(u"Unexpected start tag %(name)s. Ignored."),
    "expected-one-end-tag-but-got-another":
        _(u"Unexpected end tag (%(gotName)s). "
          u"Missing end tag (%(expectedName)s)."),
    "end-tag-too-early":
        _(u"End tag (%(name)s) seen too early. Expected other end tag."),
    "end-tag-too-early-named":
        _(u"Unexpected end tag (%(gotName)s). "
          u"Expected end tag (%(expectedName)s)."),
    "end-tag-too-early-ignored":
        _(u"End tag (%(name)s) seen too early. Ignored."),
    "adoption-agency-1.1":
        _(u"End tag (%(name)s) violates step 1, "
          u"paragraph 1 of the adoption agency algorithm."),
    "adoption-agency-1.2":
        _(u"End tag (%(name)s) violates step 1, "
          u"paragraph 2 of the adoption agency algorithm."),
    "adoption-agency-1.3":
        _(u"End tag (%(name)s) violates step 1, "
          u"paragraph 3 of the adoption agency algorithm."),
    "unexpected-end-tag-treated-as":
        _(u"Unexpected end tag (%(originalName)s). Treated as %(newName)s."),
    "no-end-tag":
        _(u"This element (%(name)s) has no end tag."),
    "unexpected-implied-end-tag-in-table":
        _(u"Unexpected implied end tag (%(name)s) in the table phase."),
    "unexpected-implied-end-tag-in-table-body":
        _(u"Unexpected implied end tag (%(name)s) in the table body phase."),
    "unexpected-char-implies-table-voodoo":
        _(u"Unexpected non-space characters in "
          u"table context caused voodoo mode."),
    "unexpected-hidden-input-in-table":
        _(u"Unexpected input with type hidden in table context."),
    "unexpected-form-in-table":
        _(u"Unexpected form in table context."),
    "unexpected-start-tag-implies-table-voodoo":
        _(u"Unexpected start tag (%(name)s) in "
          u"table context caused voodoo mode."),
    "unexpected-end-tag-implies-table-voodoo":
        _(u"Unexpected end tag (%(name)s) in "
          u"table context caused voodoo mode."),
    "unexpected-cell-in-table-body":
        _(u"Unexpected table cell start tag (%(name)s) "
          u"in the table body phase."),
    "unexpected-cell-end-tag":
        _(u"Got table cell end tag (%(name)s) "
          u"while required end tags are missing."),
    "unexpected-end-tag-in-table-body":
        _(u"Unexpected end tag (%(name)s) in the table body phase. Ignored."),
    "unexpected-implied-end-tag-in-table-row":
        _(u"Unexpected implied end tag (%(name)s) in the table row phase."),
    "unexpected-end-tag-in-table-row":
        _(u"Unexpected end tag (%(name)s) in the table row phase. Ignored."),
    "unexpected-select-in-select":
        _(u"Unexpected select start tag in the select phase "
          u"treated as select end tag."),
    "unexpected-input-in-select":
        _(u"Unexpected input start tag in the select phase."),
    "unexpected-start-tag-in-select":
        _(u"Unexpected start tag token (%(name)s) in the select phase. "
          u"Ignored."),
    "unexpected-end-tag-in-select":
        _(u"Unexpected end tag (%(name)s) in the select phase. Ignored."),
    "unexpected-table-element-start-tag-in-select-in-table":
        _(u"Unexpected table element start tag (%(name)s) "
          u"in the select in table phase."),
    "unexpected-table-element-end-tag-in-select-in-table":
        _(u"Unexpected table element end tag (%(name)s) "
          u"in the select in table phase."),
    "unexpected-char-after-body":
        _(u"Unexpected non-space characters in the after body phase."),
    "unexpected-start-tag-after-body":
        _(u"Unexpected start tag token (%(name)s)"
          u" in the after body phase."),
    "unexpected-end-tag-after-body":
        _(u"Unexpected end tag token (%(name)s)"
          u" in the after body phase."),
    "unexpected-char-in-frameset":
        _(u"Unexpected characters in the frameset phase. "
          u"Characters ignored."),
    "unexpected-start-tag-in-frameset":
        _(u"Unexpected start tag token (%(name)s)"
          u" in the frameset phase. Ignored."),
    "unexpected-frameset-in-frameset-innerhtml":
        _(u"Unexpected end tag token (frameset) "
          u"in the frameset phase (innerHTML)."),
    "unexpected-end-tag-in-frameset":
        _(u"Unexpected end tag token (%(name)s)"
          u" in the frameset phase. Ignored."),
    "unexpected-char-after-frameset":
        _(u"Unexpected non-space characters in the "
          u"after frameset phase. Ignored."),
    "unexpected-start-tag-after-frameset":
        _(u"Unexpected start tag (%(name)s)"
          u" in the after frameset phase. Ignored."),
    "unexpected-end-tag-after-frameset":
        _(u"Unexpected end tag (%(name)s)"
          u" in the after frameset phase. Ignored."),
    "unexpected-end-tag-after-body-innerhtml":
        _(u"Unexpected end tag after body(innerHtml)"),
    "expected-eof-but-got-char":
        _(u"Unexpected non-space characters. Expected end of file."),
    "expected-eof-but-got-start-tag":
        _(u"Unexpected start tag (%(name)s)"
          u". Expected end of file."),
    "expected-eof-but-got-end-tag":
        _(u"Unexpected end tag (%(name)s)"
          u". Expected end of file."),
    "eof-in-table":
        _(u"Unexpected end of file. Expected table content."),
    "eof-in-select":
        _(u"Unexpected end of file. Expected select content."),
    "eof-in-frameset":
        _(u"Unexpected end of file. Expected frameset content."),
    "eof-in-script-in-script":
        _(u"Unexpected end of file. Expected script content."),
    "non-void-element-with-trailing-solidus":
        _(u"Trailing solidus not allowed on element %(name)s"),
    "unexpected-html-element-in-foreign-content":
        _(u"Element %(name)s not allowed in a non-html context"),
    "unexpected-end-tag-before-html":
        _(u"Unexpected end tag (%(name)s) before html."),
    "XXX-undefined-error":
        _(u"Undefined error (this sucks and should be fixed)"),
}
|
|
|
|
# Short prefix -> namespace URI, for the six namespaces this module names.
namespaces = dict([
    ("html", "http://www.w3.org/1999/xhtml"),
    ("mathml", "http://www.w3.org/1998/Math/MathML"),
    ("svg", "http://www.w3.org/2000/svg"),
    ("xlink", "http://www.w3.org/1999/xlink"),
    ("xml", "http://www.w3.org/XML/1998/namespace"),
    ("xmlns", "http://www.w3.org/2000/xmlns/"),
])
|
|
|
|
# Set of (namespace URI, local name) pairs: nine HTML elements plus SVG's
# foreignObject.  The HTML members are paired with the HTML namespace URI
# by the lambda; the single SVG member is unioned in afterwards.
scopingElements = (
    frozenset(map(lambda localName: (namespaces["html"], localName), (
        "applet", "button", "caption", "html", "marquee",
        "object", "table", "td", "th",
    )))
    | frozenset([(namespaces["svg"], "foreignObject")])
)
|
|
|
|
# Set of (namespace URI, local name) pairs for fourteen HTML elements, each
# paired with the HTML namespace URI.
formattingElements = frozenset(
    map(lambda localName: (namespaces["html"], localName), (
        "a", "b", "big", "code", "em", "font", "i",
        "nobr", "s", "small", "strike", "strong", "tt", "u",
    ))
)
|
|
|
|
# Set of (namespace URI, local name) pairs; every member lives in the HTML
# namespace, so the URI is attached to each local name by the lambda.
specialElements = frozenset(
    map(lambda localName: (namespaces["html"], localName), (
        "address", "area", "article", "aside",
        "base", "basefont", "bgsound", "blockquote", "body", "br",
        "center", "col", "colgroup", "command",
        "datagrid", "dd", "details", "dialog", "dir", "div", "dl", "dt",
        "embed", "event-source",
        "fieldset", "figure", "footer", "form", "frame", "frameset",
        "h1", "h2", "h3", "h4", "h5", "h6",
        "head", "header", "hr",
        "iframe",
        # Note that image is commented out in the spec as "this isn't an
        # element that can end up on the stack, so it doesn't matter,"
        "image",
        "img", "input", "isindex",
        "li", "link", "listing",
        "menu", "meta",
        "nav", "noembed", "noframes", "noscript",
        "ol", "optgroup", "option",
        "p", "param", "plaintext", "pre",
        "script", "section", "select", "spacer", "style",
        "tbody", "textarea", "tfoot", "thead", "title", "tr",
        "ul",
        "wbr",
    ))
)
|
|
|
|
# The five one-character strings tab, line feed, form feed (U+000C), space
# and carriage return.  Iterating the unicode literal yields exactly those
# single characters.
spaceCharacters = frozenset(u"\t\n\u000C \r")
|
|
|
|
# Five table-related tag names, stored as plain local names (no namespace).
tableInsertModeElements = frozenset(
    ["table", "tbody", "tfoot", "thead", "tr"]
)
|
|
|
|
# Character-class sets built from the ``string`` module constants; each
# member is a single character.
digits = frozenset(string.digits)
hexDigits = frozenset(string.hexdigits)
asciiLowercase = frozenset(string.ascii_lowercase)
asciiUppercase = frozenset(string.ascii_uppercase)
# string.ascii_letters is the concatenation of the two constants above, so
# the union of the two sets holds the same members.
asciiLetters = asciiLowercase | asciiUppercase
|
|
|
|
# Maps the code point of each ASCII uppercase letter to the code point of
# its lowercase form (ord("A") -> ord("a"), ...).  ascii_uppercase and
# ascii_lowercase are in matching order, so zipping them pairs each letter
# with its own lowercase.
asciiUpper2Lower = dict(zip(map(ord, string.ascii_uppercase),
                            map(ord, string.ascii_lowercase)))
|
|
|
|
# Heading elements need to be ordered, hence a tuple (h1 first, h6 last)
# rather than a frozenset.
headingElements = tuple("h1 h2 h3 h4 h5 h6".split())
|
|
|
|
# Fourteen tag names, stored as plain local names (no namespace).
voidElements = frozenset([
    "base", "command", "event-source", "link", "meta",
    "hr", "br", "img", "embed", "param",
    "area", "col", "input", "source",
])
|
|
|
|
# Two tag names: title and textarea.
# NOTE(review): the HTML spec treats these two as RCDATA rather than CDATA;
# the name is part of the module's public interface, so it is left as is.
cdataElements = frozenset(["title", "textarea"])
|
|
|
|
# Seven tag names.
# NOTE(review): the HTML spec treats these as raw-text (CDATA-like)
# elements rather than RCDATA; the name is part of the module's public
# interface, so it is left as is.
rcdataElements = frozenset([
    "style", "script", "xmp", "iframe",
    "noembed", "noframes", "noscript",
])
|
|
|
|
# Tag name -> frozenset of attribute names listed as boolean for that tag.
# The empty-string key holds a set that is not tied to any one tag name.
booleanAttributes = {
    "": frozenset(["irrelevant"]),
    "style": frozenset(["scoped"]),
    "img": frozenset(["ismap"]),
    "audio": frozenset(["autoplay", "controls"]),
    "video": frozenset(["autoplay", "controls"]),
    "script": frozenset(["defer", "async"]),
    "details": frozenset(["open"]),
    "datagrid": frozenset(["multiple", "disabled"]),
    "command": frozenset(["hidden", "disabled", "checked", "default"]),
    "menu": frozenset(["autosubmit"]),
    "fieldset": frozenset(["disabled", "readonly"]),
    "option": frozenset(["disabled", "readonly", "selected"]),
    "optgroup": frozenset(["disabled", "readonly"]),
    "button": frozenset(["disabled", "autofocus"]),
    "input": frozenset([
        "disabled", "readonly", "required",
        "autofocus", "checked", "ismap",
    ]),
    "select": frozenset(["disabled", "readonly", "autofocus", "multiple"]),
    "output": frozenset(["disabled", "readonly"]),
}
|
|
|
|
# entitiesWindows1252 has to be _ordered_ and needs to have an index. It
# therefore can't be a frozenset.
#
# Entry i is the code point for windows-1252 byte 0x80 + i (see the byte
# noted beside each entry); bytes marked UNDEFINED map to U+FFFD.
entitiesWindows1252 = (
    0x20AC,  # 0x80  EURO SIGN
    0xFFFD,  # 0x81  UNDEFINED
    0x201A,  # 0x82  SINGLE LOW-9 QUOTATION MARK
    0x0192,  # 0x83  LATIN SMALL LETTER F WITH HOOK
    0x201E,  # 0x84  DOUBLE LOW-9 QUOTATION MARK
    0x2026,  # 0x85  HORIZONTAL ELLIPSIS
    0x2020,  # 0x86  DAGGER
    0x2021,  # 0x87  DOUBLE DAGGER
    0x02C6,  # 0x88  MODIFIER LETTER CIRCUMFLEX ACCENT
    0x2030,  # 0x89  PER MILLE SIGN
    0x0160,  # 0x8A  LATIN CAPITAL LETTER S WITH CARON
    0x2039,  # 0x8B  SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    0x0152,  # 0x8C  LATIN CAPITAL LIGATURE OE
    0xFFFD,  # 0x8D  UNDEFINED
    0x017D,  # 0x8E  LATIN CAPITAL LETTER Z WITH CARON
    0xFFFD,  # 0x8F  UNDEFINED
    0xFFFD,  # 0x90  UNDEFINED
    0x2018,  # 0x91  LEFT SINGLE QUOTATION MARK
    0x2019,  # 0x92  RIGHT SINGLE QUOTATION MARK
    0x201C,  # 0x93  LEFT DOUBLE QUOTATION MARK
    0x201D,  # 0x94  RIGHT DOUBLE QUOTATION MARK
    0x2022,  # 0x95  BULLET
    0x2013,  # 0x96  EN DASH
    0x2014,  # 0x97  EM DASH
    0x02DC,  # 0x98  SMALL TILDE
    0x2122,  # 0x99  TRADE MARK SIGN
    0x0161,  # 0x9A  LATIN SMALL LETTER S WITH CARON
    0x203A,  # 0x9B  SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    0x0153,  # 0x9C  LATIN SMALL LIGATURE OE
    0xFFFD,  # 0x9D  UNDEFINED
    0x017E,  # 0x9E  LATIN SMALL LETTER Z WITH CARON
    0x0178,  # 0x9F  LATIN CAPITAL LETTER Y WITH DIAERESIS
)
|
|
|
|
entities = {
|
|
"AElig;": u"\u00C6",
|
|
"AElig": u"\u00C6",
|
|
"AMP;": u"\u0026",
|
|
"AMP": u"\u0026",
|
|
"Aacute;": u"\u00C1",
|
|
"Aacute": u"\u00C1",
|
|
"Acirc;": u"\u00C2",
|
|
"Acirc": u"\u00C2",
|
|
"Agrave;": u"\u00C0",
|
|
"Agrave": u"\u00C0",
|
|
"Alpha;": u"\u0391",
|
|
"Aring;": u"\u00C5",
|
|
"Aring": u"\u00C5",
|
|
"Atilde;": u"\u00C3",
|
|
"Atilde": u"\u00C3",
|
|
"Auml;": u"\u00C4",
|
|
"Auml": u"\u00C4",
|
|
"Beta;": u"\u0392",
|
|
"COPY;": u"\u00A9",
|
|
"COPY": u"\u00A9",
|
|
"Ccedil;": u"\u00C7",
|
|
"Ccedil": u"\u00C7",
|
|
"Chi;": u"\u03A7",
|
|
"Dagger;": u"\u2021",
|
|
"Delta;": u"\u0394",
|
|
"ETH;": u"\u00D0",
|
|
"ETH": u"\u00D0",
|
|
"Eacute;": u"\u00C9",
|
|
"Eacute": u"\u00C9",
|
|
"Ecirc;": u"\u00CA",
|
|
"Ecirc": u"\u00CA",
|
|
"Egrave;": u"\u00C8",
|
|
"Egrave": u"\u00C8",
|
|
"Epsilon;": u"\u0395",
|
|
"Eta;": u"\u0397",
|
|
"Euml;": u"\u00CB",
|
|
"Euml": u"\u00CB",
|
|
"GT;": u"\u003E",
|
|
"GT": u"\u003E",
|
|
"Gamma;": u"\u0393",
|
|
"Iacute;": u"\u00CD",
|
|
"Iacute": u"\u00CD",
|
|
"Icirc;": u"\u00CE",
|
|
"Icirc": u"\u00CE",
|
|
"Igrave;": u"\u00CC",
|
|
"Igrave": u"\u00CC",
|
|
"Iota;": u"\u0399",
|
|
"Iuml;": u"\u00CF",
|
|
"Iuml": u"\u00CF",
|
|
"Kappa;": u"\u039A",
|
|
"LT;": u"\u003C",
|
|
"LT": u"\u003C",
|
|
"Lambda;": u"\u039B",
|
|
"Mu;": u"\u039C",
|
|
"Ntilde;": u"\u00D1",
|
|
"Ntilde": u"\u00D1",
|
|
"Nu;": u"\u039D",
|
|
"OElig;": u"\u0152",
|
|
"Oacute;": u"\u00D3",
|
|
"Oacute": u"\u00D3",
|
|
"Ocirc;": u"\u00D4",
|
|
"Ocirc": u"\u00D4",
|
|
"Ograve;": u"\u00D2",
|
|
"Ograve": u"\u00D2",
|
|
"Omega;": u"\u03A9",
|
|
"Omicron;": u"\u039F",
|
|
"Oslash;": u"\u00D8",
|
|
"Oslash": u"\u00D8",
|
|
"Otilde;": u"\u00D5",
|
|
"Otilde": u"\u00D5",
|
|
"Ouml;": u"\u00D6",
|
|
"Ouml": u"\u00D6",
|
|
"Phi;": u"\u03A6",
|
|
"Pi;": u"\u03A0",
|
|
"Prime;": u"\u2033",
|
|
"Psi;": u"\u03A8",
|
|
"QUOT;": u"\u0022",
|
|
"QUOT": u"\u0022",
|
|
"REG;": u"\u00AE",
|
|
"REG": u"\u00AE",
|
|
"Rho;": u"\u03A1",
|
|
"Scaron;": u"\u0160",
|
|
"Sigma;": u"\u03A3",
|
|
"THORN;": u"\u00DE",
|
|
"THORN": u"\u00DE",
|
|
"TRADE;": u"\u2122",
|
|
"Tau;": u"\u03A4",
|
|
"Theta;": u"\u0398",
|
|
"Uacute;": u"\u00DA",
|
|
"Uacute": u"\u00DA",
|
|
"Ucirc;": u"\u00DB",
|
|
"Ucirc": u"\u00DB",
|
|
"Ugrave;": u"\u00D9",
|
|
"Ugrave": u"\u00D9",
|
|
"Upsilon;": u"\u03A5",
|
|
"Uuml;": u"\u00DC",
|
|
"Uuml": u"\u00DC",
|
|
"Xi;": u"\u039E",
|
|
"Yacute;": u"\u00DD",
|
|
"Yacute": u"\u00DD",
|
|
"Yuml;": u"\u0178",
|
|
"Zeta;": u"\u0396",
|
|
"aacute;": u"\u00E1",
|
|
"aacute": u"\u00E1",
|
|
"acirc;": u"\u00E2",
|
|
"acirc": u"\u00E2",
|
|
"acute;": u"\u00B4",
|
|
"acute": u"\u00B4",
|
|
"aelig;": u"\u00E6",
|
|
"aelig": u"\u00E6",
|
|
"agrave;": u"\u00E0",
|
|
"agrave": u"\u00E0",
|
|
"alefsym;": u"\u2135",
|
|
"alpha;": u"\u03B1",
|
|
"amp;": u"\u0026",
|
|
"amp": u"\u0026",
|
|
"and;": u"\u2227",
|
|
"ang;": u"\u2220",
|
|
"apos;": u"\u0027",
|
|
"aring;": u"\u00E5",
|
|
"aring": u"\u00E5",
|
|
"asymp;": u"\u2248",
|
|
"atilde;": u"\u00E3",
|
|
"atilde": u"\u00E3",
|
|
"auml;": u"\u00E4",
|
|
"auml": u"\u00E4",
|
|
"bdquo;": u"\u201E",
|
|
"beta;": u"\u03B2",
|
|
"brvbar;": u"\u00A6",
|
|
"brvbar": u"\u00A6",
|
|
"bull;": u"\u2022",
|
|
"cap;": u"\u2229",
|
|
"ccedil;": u"\u00E7",
|
|
"ccedil": u"\u00E7",
|
|
"cedil;": u"\u00B8",
|
|
"cedil": u"\u00B8",
|
|
"cent;": u"\u00A2",
|
|
"cent": u"\u00A2",
|
|
"chi;": u"\u03C7",
|
|
"circ;": u"\u02C6",
|
|
"clubs;": u"\u2663",
|
|
"cong;": u"\u2245",
|
|
"copy;": u"\u00A9",
|
|
"copy": u"\u00A9",
|
|
"crarr;": u"\u21B5",
|
|
"cup;": u"\u222A",
|
|
"curren;": u"\u00A4",
|
|
"curren": u"\u00A4",
|
|
"dArr;": u"\u21D3",
|
|
"dagger;": u"\u2020",
|
|
"darr;": u"\u2193",
|
|
"deg;": u"\u00B0",
|
|
"deg": u"\u00B0",
|
|
"delta;": u"\u03B4",
|
|
"diams;": u"\u2666",
|
|
"divide;": u"\u00F7",
|
|
"divide": u"\u00F7",
|
|
"eacute;": u"\u00E9",
|
|
"eacute": u"\u00E9",
|
|
"ecirc;": u"\u00EA",
|
|
"ecirc": u"\u00EA",
|
|
"egrave;": u"\u00E8",
|
|
"egrave": u"\u00E8",
|
|
"empty;": u"\u2205",
|
|
"emsp;": u"\u2003",
|
|
"ensp;": u"\u2002",
|
|
"epsilon;": u"\u03B5",
|
|
"equiv;": u"\u2261",
|
|
"eta;": u"\u03B7",
|
|
"eth;": u"\u00F0",
|
|
"eth": u"\u00F0",
|
|
"euml;": u"\u00EB",
|
|
"euml": u"\u00EB",
|
|
"euro;": u"\u20AC",
|
|
"exist;": u"\u2203",
|
|
"fnof;": u"\u0192",
|
|
"forall;": u"\u2200",
|
|
"frac12;": u"\u00BD",
|
|
"frac12": u"\u00BD",
|
|
"frac14;": u"\u00BC",
|
|
"frac14": u"\u00BC",
|
|
"frac34;": u"\u00BE",
|
|
"frac34": u"\u00BE",
|
|
"frasl;": u"\u2044",
|
|
"gamma;": u"\u03B3",
|
|
"ge;": u"\u2265",
|
|
"gt;": u"\u003E",
|
|
"gt": u"\u003E",
|
|
"hArr;": u"\u21D4",
|
|
"harr;": u"\u2194",
|
|
"hearts;": u"\u2665",
|
|
"hellip;": u"\u2026",
|
|
"iacute;": u"\u00ED",
|
|
"iacute": u"\u00ED",
|
|
"icirc;": u"\u00EE",
|
|
"icirc": u"\u00EE",
|
|
"iexcl;": u"\u00A1",
|
|
"iexcl": u"\u00A1",
|
|
"igrave;": u"\u00EC",
|
|
"igrave": u"\u00EC",
|
|
"image;": u"\u2111",
|
|
"infin;": u"\u221E",
|
|
"int;": u"\u222B",
|
|
"iota;": u"\u03B9",
|
|
"iquest;": u"\u00BF",
|
|
"iquest": u"\u00BF",
|
|
"isin;": u"\u2208",
|
|
"iuml;": u"\u00EF",
|
|
"iuml": u"\u00EF",
|
|
"kappa;": u"\u03BA",
|
|
"lArr;": u"\u21D0",
|
|
"lambda;": u"\u03BB",
|
|
"lang;": u"\u27E8",
|
|
"laquo;": u"\u00AB",
|
|
"laquo": u"\u00AB",
|
|
"larr;": u"\u2190",
|
|
"lceil;": u"\u2308",
|
|
"ldquo;": u"\u201C",
|
|
"le;": u"\u2264",
|
|
"lfloor;": u"\u230A",
|
|
"lowast;": u"\u2217",
|
|
"loz;": u"\u25CA",
|
|
"lrm;": u"\u200E",
|
|
"lsaquo;": u"\u2039",
|
|
"lsquo;": u"\u2018",
|
|
"lt;": u"\u003C",
|
|
"lt": u"\u003C",
|
|
"macr;": u"\u00AF",
|
|
"macr": u"\u00AF",
|
|
"mdash;": u"\u2014",
|
|
"micro;": u"\u00B5",
|
|
"micro": u"\u00B5",
|
|
"middot;": u"\u00B7",
|
|
"middot": u"\u00B7",
|
|
"minus;": u"\u2212",
|
|
"mu;": u"\u03BC",
|
|
"nabla;": u"\u2207",
|
|
"nbsp;": u"\u00A0",
|
|
"nbsp": u"\u00A0",
|
|
"ndash;": u"\u2013",
|
|
"ne;": u"\u2260",
|
|
"ni;": u"\u220B",
|
|
"not;": u"\u00AC",
|
|
"not": u"\u00AC",
|
|
"notin;": u"\u2209",
|
|
"nsub;": u"\u2284",
|
|
"ntilde;": u"\u00F1",
|
|
"ntilde": u"\u00F1",
|
|
"nu;": u"\u03BD",
|
|
"oacute;": u"\u00F3",
|
|
"oacute": u"\u00F3",
|
|
"ocirc;": u"\u00F4",
|
|
"ocirc": u"\u00F4",
|
|
"oelig;": u"\u0153",
|
|
"ograve;": u"\u00F2",
|
|
"ograve": u"\u00F2",
|
|
"oline;": u"\u203E",
|
|
"omega;": u"\u03C9",
|
|
"omicron;": u"\u03BF",
|
|
"oplus;": u"\u2295",
|
|
"or;": u"\u2228",
|
|
"ordf;": u"\u00AA",
|
|
"ordf": u"\u00AA",
|
|
"ordm;": u"\u00BA",
|
|
"ordm": u"\u00BA",
|
|
"oslash;": u"\u00F8",
|
|
"oslash": u"\u00F8",
|
|
"otilde;": u"\u00F5",
|
|
"otilde": u"\u00F5",
|
|
"otimes;": u"\u2297",
|
|
"ouml;": u"\u00F6",
|
|
"ouml": u"\u00F6",
|
|
"para;": u"\u00B6",
|
|
"para": u"\u00B6",
|
|
"part;": u"\u2202",
|
|
"permil;": u"\u2030",
|
|
"perp;": u"\u22A5",
|
|
"phi;": u"\u03C6",
|
|
"pi;": u"\u03C0",
|
|
"piv;": u"\u03D6",
|
|
"plusmn;": u"\u00B1",
|
|
"plusmn": u"\u00B1",
|
|
"pound;": u"\u00A3",
|
|
"pound": u"\u00A3",
|
|
"prime;": u"\u2032",
|
|
"prod;": u"\u220F",
|
|
"prop;": u"\u221D",
|
|
"psi;": u"\u03C8",
|
|
"quot;": u"\u0022",
|
|
"quot": u"\u0022",
|
|
"rArr;": u"\u21D2",
|
|
"radic;": u"\u221A",
|
|
"rang;": u"\u27E9",
|
|
"raquo;": u"\u00BB",
|
|
"raquo": u"\u00BB",
|
|
"rarr;": u"\u2192",
|
|
"rceil;": u"\u2309",
|
|
"rdquo;": u"\u201D",
|
|
"real;": u"\u211C",
|
|
"reg;": u"\u00AE",
|
|
"reg": u"\u00AE",
|
|
"rfloor;": u"\u230B",
|
|
"rho;": u"\u03C1",
|
|
"rlm;": u"\u200F",
|
|
"rsaquo;": u"\u203A",
|
|
"rsquo;": u"\u2019",
|
|
"sbquo;": u"\u201A",
|
|
"scaron;": u"\u0161",
|
|
"sdot;": u"\u22C5",
|
|
"sect;": u"\u00A7",
|
|
"sect": u"\u00A7",
|
|
"shy;": u"\u00AD",
|
|
"shy": u"\u00AD",
|
|
"sigma;": u"\u03C3",
|
|
"sigmaf;": u"\u03C2",
|
|
"sim;": u"\u223C",
|
|
"spades;": u"\u2660",
|
|
"sub;": u"\u2282",
|
|
"sube;": u"\u2286",
|
|
"sum;": u"\u2211",
|
|
"sup1;": u"\u00B9",
|
|
"sup1": u"\u00B9",
|
|
"sup2;": u"\u00B2",
|
|
"sup2": u"\u00B2",
|
|
"sup3;": u"\u00B3",
|
|
"sup3": u"\u00B3",
|
|
"sup;": u"\u2283",
|
|
"supe;": u"\u2287",
|
|
"szlig;": u"\u00DF",
|
|
"szlig": u"\u00DF",
|
|
"tau;": u"\u03C4",
|
|
"there4;": u"\u2234",
|
|
"theta;": u"\u03B8",
|
|
"thetasym;": u"\u03D1",
|
|
"thinsp;": u"\u2009",
|
|
"thorn;": u"\u00FE",
|
|
"thorn": u"\u00FE",
|
|
"tilde;": u"\u02DC",
|
|
"times;": u"\u00D7",
|
|
"times": u"\u00D7",
|
|
"trade;": u"\u2122",
|
|
"uArr;": u"\u21D1",
|
|
"uacute;": u"\u00FA",
|
|
"uacute": u"\u00FA",
|
|
"uarr;": u"\u2191",
|
|
"ucirc;": u"\u00FB",
|
|
"ucirc": u"\u00FB",
|
|
"ugrave;": u"\u00F9",
|
|
"ugrave": u"\u00F9",
|
|
"uml;": u"\u00A8",
|
|
"uml": u"\u00A8",
|
|
"upsih;": u"\u03D2",
|
|
"upsilon;": u"\u03C5",
|
|
"uuml;": u"\u00FC",
|
|
"uuml": u"\u00FC",
|
|
"weierp;": u"\u2118",
|
|
"xi;": u"\u03BE",
|
|
"yacute;": u"\u00FD",
|
|
"yacute": u"\u00FD",
|
|
"yen;": u"\u00A5",
|
|
"yen": u"\u00A5",
|
|
"yuml;": u"\u00FF",
|
|
"yuml": u"\u00FF",
|
|
"zeta;": u"\u03B6",
|
|
"zwj;": u"\u200D",
|
|
"zwnj;": u"\u200C"
|
|
}
|
|
|
|
# Code point (int) -> replacement character.
#   0x00       -> U+FFFD
#   0x0D       -> U+000A
#   0x80..0x9F -> the listed character; 0x81, 0x8D, 0x8F, 0x90 and 0x9D map
#                 to the control character with the same code point.
# The literal used to list the 0x81 entry twice (same value both times);
# the redundant duplicate has been dropped, the resulting dict is unchanged.
replacementCharacters = {
    0x0: u"\uFFFD",
    0x0d: u"\u000A",
    0x80: u"\u20AC",
    0x81: u"\u0081",
    0x82: u"\u201A",
    0x83: u"\u0192",
    0x84: u"\u201E",
    0x85: u"\u2026",
    0x86: u"\u2020",
    0x87: u"\u2021",
    0x88: u"\u02C6",
    0x89: u"\u2030",
    0x8A: u"\u0160",
    0x8B: u"\u2039",
    0x8C: u"\u0152",
    0x8D: u"\u008D",
    0x8E: u"\u017D",
    0x8F: u"\u008F",
    0x90: u"\u0090",
    0x91: u"\u2018",
    0x92: u"\u2019",
    0x93: u"\u201C",
    0x94: u"\u201D",
    0x95: u"\u2022",
    0x96: u"\u2013",
    0x97: u"\u2014",
    0x98: u"\u02DC",
    0x99: u"\u2122",
    0x9A: u"\u0161",
    0x9B: u"\u203A",
    0x9C: u"\u0153",
    0x9D: u"\u009D",
    0x9E: u"\u017E",
    0x9F: u"\u0178",
}
|
|
|
|
# Encoding label -> codec name.  Keys are lower-case with punctuation
# removed, except the final "x-x-big5" entry.
# NOTE(review): presumably the lookup code normalises labels the same way
# before consulting this table, in which case "x-x-big5" could never match
# -- the lookup is not in this file, confirm before changing the key.
encodings = {
    "437": "cp437",
    "850": "cp850",
    "852": "cp852",
    "855": "cp855",
    "857": "cp857",
    "860": "cp860",
    "861": "cp861",
    "862": "cp862",
    "863": "cp863",
    "865": "cp865",
    "866": "cp866",
    "869": "cp869",
    "ansix341968": "ascii",
    "ansix341986": "ascii",
    "arabic": "iso8859-6",
    "ascii": "ascii",
    "asmo708": "iso8859-6",
    "big5": "big5",
    "big5hkscs": "big5hkscs",
    "chinese": "gbk",
    "cp037": "cp037",
    "cp1026": "cp1026",
    "cp154": "ptcp154",
    "cp367": "ascii",
    "cp424": "cp424",
    "cp437": "cp437",
    "cp500": "cp500",
    "cp775": "cp775",
    "cp819": "windows-1252",
    "cp850": "cp850",
    "cp852": "cp852",
    "cp855": "cp855",
    "cp857": "cp857",
    "cp860": "cp860",
    "cp861": "cp861",
    "cp862": "cp862",
    "cp863": "cp863",
    "cp864": "cp864",
    "cp865": "cp865",
    "cp866": "cp866",
    "cp869": "cp869",
    "cp936": "gbk",
    "cpgr": "cp869",
    "cpis": "cp861",
    "csascii": "ascii",
    "csbig5": "big5",
    "cseuckr": "cp949",
    "cseucpkdfmtjapanese": "euc_jp",
    "csgb2312": "gbk",
    "cshproman8": "hp-roman8",
    "csibm037": "cp037",
    "csibm1026": "cp1026",
    "csibm424": "cp424",
    "csibm500": "cp500",
    "csibm855": "cp855",
    "csibm857": "cp857",
    "csibm860": "cp860",
    "csibm861": "cp861",
    "csibm863": "cp863",
    "csibm864": "cp864",
    "csibm865": "cp865",
    "csibm866": "cp866",
    "csibm869": "cp869",
    "csiso2022jp": "iso2022_jp",
    "csiso2022jp2": "iso2022_jp_2",
    "csiso2022kr": "iso2022_kr",
    "csiso58gb231280": "gbk",
    "csisolatin1": "windows-1252",
    "csisolatin2": "iso8859-2",
    "csisolatin3": "iso8859-3",
    "csisolatin4": "iso8859-4",
    "csisolatin5": "windows-1254",
    "csisolatin6": "iso8859-10",
    "csisolatinarabic": "iso8859-6",
    "csisolatincyrillic": "iso8859-5",
    "csisolatingreek": "iso8859-7",
    "csisolatinhebrew": "iso8859-8",
    "cskoi8r": "koi8-r",
    "csksc56011987": "cp949",
    "cspc775baltic": "cp775",
    "cspc850multilingual": "cp850",
    "cspc862latinhebrew": "cp862",
    "cspc8codepage437": "cp437",
    "cspcp852": "cp852",
    "csptcp154": "ptcp154",
    "csshiftjis": "shift_jis",
    "csunicode11utf7": "utf-7",
    "cyrillic": "iso8859-5",
    "cyrillicasian": "ptcp154",
    "ebcdiccpbe": "cp500",
    "ebcdiccpca": "cp037",
    "ebcdiccpch": "cp500",
    "ebcdiccphe": "cp424",
    "ebcdiccpnl": "cp037",
    "ebcdiccpus": "cp037",
    "ebcdiccpwt": "cp037",
    "ecma114": "iso8859-6",
    "ecma118": "iso8859-7",
    "elot928": "iso8859-7",
    "eucjp": "euc_jp",
    "euckr": "cp949",
    "extendedunixcodepackedformatforjapanese": "euc_jp",
    "gb18030": "gb18030",
    "gb2312": "gbk",
    "gb231280": "gbk",
    "gbk": "gbk",
    "greek": "iso8859-7",
    "greek8": "iso8859-7",
    "hebrew": "iso8859-8",
    "hproman8": "hp-roman8",
    "hzgb2312": "hz",
    "ibm037": "cp037",
    "ibm1026": "cp1026",
    "ibm367": "ascii",
    "ibm424": "cp424",
    "ibm437": "cp437",
    "ibm500": "cp500",
    "ibm775": "cp775",
    "ibm819": "windows-1252",
    "ibm850": "cp850",
    "ibm852": "cp852",
    "ibm855": "cp855",
    "ibm857": "cp857",
    "ibm860": "cp860",
    "ibm861": "cp861",
    "ibm862": "cp862",
    "ibm863": "cp863",
    "ibm864": "cp864",
    "ibm865": "cp865",
    "ibm866": "cp866",
    "ibm869": "cp869",
    "iso2022jp": "iso2022_jp",
    "iso2022jp2": "iso2022_jp_2",
    "iso2022kr": "iso2022_kr",
    "iso646irv1991": "ascii",
    "iso646us": "ascii",
    "iso88591": "windows-1252",
    "iso885910": "iso8859-10",
    "iso8859101992": "iso8859-10",
    "iso885911987": "windows-1252",
    "iso885913": "iso8859-13",
    "iso885914": "iso8859-14",
    "iso8859141998": "iso8859-14",
    "iso885915": "iso8859-15",
    "iso885916": "iso8859-16",
    "iso8859162001": "iso8859-16",
    "iso88592": "iso8859-2",
    "iso885921987": "iso8859-2",
    "iso88593": "iso8859-3",
    "iso885931988": "iso8859-3",
    "iso88594": "iso8859-4",
    "iso885941988": "iso8859-4",
    "iso88595": "iso8859-5",
    "iso885951988": "iso8859-5",
    "iso88596": "iso8859-6",
    "iso885961987": "iso8859-6",
    "iso88597": "iso8859-7",
    "iso885971987": "iso8859-7",
    "iso88598": "iso8859-8",
    "iso885981988": "iso8859-8",
    "iso88599": "windows-1254",
    "iso885991989": "windows-1254",
    "isoceltic": "iso8859-14",
    "isoir100": "windows-1252",
    "isoir101": "iso8859-2",
    "isoir109": "iso8859-3",
    "isoir110": "iso8859-4",
    "isoir126": "iso8859-7",
    "isoir127": "iso8859-6",
    "isoir138": "iso8859-8",
    "isoir144": "iso8859-5",
    "isoir148": "windows-1254",
    "isoir149": "cp949",
    "isoir157": "iso8859-10",
    "isoir199": "iso8859-14",
    "isoir226": "iso8859-16",
    "isoir58": "gbk",
    "isoir6": "ascii",
    "koi8r": "koi8-r",
    "koi8u": "koi8-u",
    "korean": "cp949",
    "ksc5601": "cp949",
    "ksc56011987": "cp949",
    "ksc56011989": "cp949",
    "l1": "windows-1252",
    "l10": "iso8859-16",
    "l2": "iso8859-2",
    "l3": "iso8859-3",
    "l4": "iso8859-4",
    "l5": "windows-1254",
    "l6": "iso8859-10",
    "l8": "iso8859-14",
    "latin1": "windows-1252",
    "latin10": "iso8859-16",
    "latin2": "iso8859-2",
    "latin3": "iso8859-3",
    "latin4": "iso8859-4",
    "latin5": "windows-1254",
    "latin6": "iso8859-10",
    "latin8": "iso8859-14",
    "latin9": "iso8859-15",
    "ms936": "gbk",
    "mskanji": "shift_jis",
    "pt154": "ptcp154",
    "ptcp154": "ptcp154",
    "r8": "hp-roman8",
    "roman8": "hp-roman8",
    "shiftjis": "shift_jis",
    "tis620": "cp874",
    "unicode11utf7": "utf-7",
    "us": "ascii",
    "usascii": "ascii",
    "utf16": "utf-16",
    "utf16be": "utf-16-be",
    "utf16le": "utf-16-le",
    "utf8": "utf-8",
    "windows1250": "cp1250",
    "windows1251": "cp1251",
    "windows1252": "cp1252",
    "windows1253": "cp1253",
    "windows1254": "cp1254",
    "windows1255": "cp1255",
    "windows1256": "cp1256",
    "windows1257": "cp1257",
    "windows1258": "cp1258",
    "windows936": "gbk",
    "x-x-big5": "big5",
}
|
|
|
|
# Token type name -> small integer code (0 through 7).
tokenTypes = dict(
    Doctype=0,
    Characters=1,
    SpaceCharacters=2,
    StartTag=3,
    EndTag=4,
    EmptyTag=5,
    Comment=6,
    ParseError=7,
)
|
|
|
|
# The integer codes of the three tag-like token types, looked up by name in
# tokenTypes.
tagTokenTypes = frozenset(map(tokenTypes.__getitem__,
                              ("StartTag", "EndTag", "EmptyTag")))
|
|
|
|
|
|
# Reverse of ``namespaces``: namespace URI -> prefix.
#
# Built from values()/keys() rather than iteritems(): iteritems() only
# exists on Python 2 dicts, while values() and keys() exist on both major
# versions and are guaranteed to iterate in matching order as long as the
# dict is not modified in between.
prefixes = dict(zip(namespaces.values(), namespaces.keys()))
# The MathML URI is deliberately re-pointed at "math", replacing the
# "mathml" prefix produced by the inversion above.
prefixes["http://www.w3.org/1998/Math/MathML"] = "math"
|
|
|
|
class DataLossWarning(UserWarning):
    """Warning category derived from UserWarning; adds no behaviour.

    NOTE(review): presumably issued when data cannot be carried over
    without loss -- the code that emits it is not in this module; confirm.
    """
|
|
|
|
class ReparseException(Exception):
    """Exception type derived from Exception; adds no behaviour.

    NOTE(review): presumably raised to request that parsing start over --
    the code that raises and catches it is not in this module; confirm.
    """
|