# Constant tables for an HTML tokenizer / tree builder: parse-error messages,
# namespace URIs, element classification sets, character-reference tables,
# encoding-label aliases and token type codes.

import string
import gettext

_ = gettext.gettext

try:
    frozenset
except NameError:
    # Import from the sets module for python 2.3
    from sets import Set as set
    from sets import ImmutableSet as frozenset

# Sentinel value used for end-of-file.
EOF = None

# Parse-error code -> translatable, human-readable message.  Messages that
# contain %(key)s placeholders are meant to be %-formatted with a mapping.
E = {
    "null-character":
        _(u"Null character in input stream, replaced with U+FFFD."),
    "invalid-character":
        _(u"Invalid codepoint in stream."),
    "incorrectly-placed-solidus":
        _(u"Solidus (/) incorrectly placed in tag."),
    "incorrect-cr-newline-entity":
        _(u"Incorrect CR newline entity, replaced with LF."),
    "illegal-windows-1252-entity":
        _(u"Entity used with illegal number (windows-1252 reference)."),
    "cant-convert-numeric-entity":
        _(u"Numeric entity couldn't be converted to character "
          u"(codepoint U+%(charAsInt)08x)."),
    "illegal-codepoint-for-numeric-entity":
        _(u"Numeric entity represents an illegal codepoint: "
          u"U+%(charAsInt)08x."),
    "numeric-entity-without-semicolon":
        _(u"Numeric entity didn't end with ';'."),
    "expected-numeric-entity-but-got-eof":
        _(u"Numeric entity expected. Got end of file instead."),
    "expected-numeric-entity":
        _(u"Numeric entity expected but none found."),
    "named-entity-without-semicolon":
        _(u"Named entity didn't end with ';'."),
    "expected-named-entity":
        _(u"Named entity expected. Got none."),
    "attributes-in-end-tag":
        _(u"End tag contains unexpected attributes."),
    "expected-tag-name-but-got-right-bracket":
        _(u"Expected tag name. Got '>' instead."),
    "expected-tag-name-but-got-question-mark":
        _(u"Expected tag name. Got '?' instead. (HTML doesn't "
          u"support processing instructions.)"),
    "expected-tag-name":
        _(u"Expected tag name. Got something else instead"),
    "expected-closing-tag-but-got-right-bracket":
        _(u"Expected closing tag. Got '>' instead. Ignoring ''."),
    "expected-closing-tag-but-got-eof":
        _(u"Expected closing tag. Unexpected end of file."),
    "expected-closing-tag-but-got-char":
        _(u"Expected closing tag. Unexpected character '%(data)s' found."),
    "eof-in-tag-name":
        _(u"Unexpected end of file in the tag name."),
    "expected-attribute-name-but-got-eof":
        _(u"Unexpected end of file. Expected attribute name instead."),
    "eof-in-attribute-name":
        _(u"Unexpected end of file in attribute name."),
    "invalid-character-in-attribute-name":
        _(u"Invalid character in attribute name"),
    "duplicate-attribute":
        _(u"Dropped duplicate attribute on tag."),
    "expected-end-of-tag-name-but-got-eof":
        _(u"Unexpected end of file. Expected = or end of tag."),
    "expected-attribute-value-but-got-eof":
        _(u"Unexpected end of file. Expected attribute value."),
    "expected-attribute-value-but-got-right-bracket":
        _(u"Expected attribute value. Got '>' instead."),
    "eof-in-attribute-value-double-quote":
        _(u"Unexpected end of file in attribute value (\")."),
    "eof-in-attribute-value-single-quote":
        _(u"Unexpected end of file in attribute value (')."),
    "eof-in-attribute-value-no-quotes":
        _(u"Unexpected end of file in attribute value."),
    "unexpected-EOF-after-solidus-in-tag":
        _(u"Unexpected end of file in tag. Expected >"),
    # NOTE: the key below is misspelled ("soldius"); it is kept as-is because
    # callers look messages up by this exact code.
    "unexpected-character-after-soldius-in-tag":
        _(u"Unexpected character after / in tag. Expected >"),
    "expected-dashes-or-doctype":
        _(u"Expected '--' or 'DOCTYPE'. Not found."),
    "incorrect-comment":
        _(u"Incorrect comment."),
    "eof-in-comment":
        _(u"Unexpected end of file in comment."),
    "eof-in-comment-end-dash":
        _(u"Unexpected end of file in comment (-)"),
    "unexpected-dash-after-double-dash-in-comment":
        _(u"Unexpected '-' after '--' found in comment."),
    "eof-in-comment-double-dash":
        _(u"Unexpected end of file in comment (--)."),
    "unexpected-char-in-comment":
        _(u"Unexpected character in comment found."),
    "need-space-after-doctype":
        _(u"No space after literal string 'DOCTYPE'."),
    "expected-doctype-name-but-got-right-bracket":
        _(u"Unexpected > character. Expected DOCTYPE name."),
    "expected-doctype-name-but-got-eof":
        _(u"Unexpected end of file. Expected DOCTYPE name."),
    "eof-in-doctype-name":
        _(u"Unexpected end of file in DOCTYPE name."),
    "eof-in-doctype":
        _(u"Unexpected end of file in DOCTYPE."),
    "expected-space-or-right-bracket-in-doctype":
        _(u"Expected space or '>'. Got '%(data)s'"),
    "unexpected-end-of-doctype":
        _(u"Unexpected end of DOCTYPE."),
    "unexpected-char-in-doctype":
        _(u"Unexpected character in DOCTYPE."),
    "eof-in-innerhtml":
        _(u"XXX innerHTML EOF"),
    "unexpected-doctype":
        _(u"Unexpected DOCTYPE. Ignored."),
    "non-html-root":
        _(u"html needs to be the first start tag."),
    "expected-doctype-but-got-eof":
        _(u"Unexpected End of file. Expected DOCTYPE."),
    "unknown-doctype":
        _(u"Erroneous DOCTYPE."),
    "expected-doctype-but-got-chars":
        _(u"Unexpected non-space characters. Expected DOCTYPE."),
    "expected-doctype-but-got-start-tag":
        _(u"Unexpected start tag (%(name)s). Expected DOCTYPE."),
    "expected-doctype-but-got-end-tag":
        _(u"Unexpected end tag (%(name)s). Expected DOCTYPE."),
    "end-tag-after-implied-root":
        _(u"Unexpected end tag (%(name)s) after the (implied) root element."),
    "expected-named-closing-tag-but-got-eof":
        _(u"Unexpected end of file. Expected end tag (%(name)s)."),
    "two-heads-are-not-better-than-one":
        _(u"Unexpected start tag head in existing head. Ignored."),
    "unexpected-end-tag":
        _(u"Unexpected end tag (%(name)s). Ignored."),
    "unexpected-start-tag-out-of-my-head":
        _(u"Unexpected start tag (%(name)s) that can be in head. Moved."),
    "unexpected-start-tag":
        _(u"Unexpected start tag (%(name)s)."),
    "missing-end-tag":
        _(u"Missing end tag (%(name)s)."),
    "missing-end-tags":
        _(u"Missing end tags (%(name)s)."),
    "unexpected-start-tag-implies-end-tag":
        _(u"Unexpected start tag (%(startName)s) "
          u"implies end tag (%(endName)s)."),
    "unexpected-start-tag-treated-as":
        _(u"Unexpected start tag (%(originalName)s). Treated as %(newName)s."),
    "deprecated-tag":
        _(u"Unexpected start tag %(name)s. Don't use it!"),
    "unexpected-start-tag-ignored":
        _(u"Unexpected start tag %(name)s. Ignored."),
    "expected-one-end-tag-but-got-another":
        _(u"Unexpected end tag (%(gotName)s). "
          u"Missing end tag (%(expectedName)s)."),
    "end-tag-too-early":
        _(u"End tag (%(name)s) seen too early. Expected other end tag."),
    "end-tag-too-early-named":
        _(u"Unexpected end tag (%(gotName)s). Expected end tag (%(expectedName)s)."),
    "end-tag-too-early-ignored":
        _(u"End tag (%(name)s) seen too early. Ignored."),
    "adoption-agency-1.1":
        _(u"End tag (%(name)s) violates step 1, "
          u"paragraph 1 of the adoption agency algorithm."),
    "adoption-agency-1.2":
        _(u"End tag (%(name)s) violates step 1, "
          u"paragraph 2 of the adoption agency algorithm."),
    "adoption-agency-1.3":
        _(u"End tag (%(name)s) violates step 1, "
          u"paragraph 3 of the adoption agency algorithm."),
    "unexpected-end-tag-treated-as":
        _(u"Unexpected end tag (%(originalName)s). Treated as %(newName)s."),
    "no-end-tag":
        _(u"This element (%(name)s) has no end tag."),
    "unexpected-implied-end-tag-in-table":
        _(u"Unexpected implied end tag (%(name)s) in the table phase."),
    "unexpected-implied-end-tag-in-table-body":
        _(u"Unexpected implied end tag (%(name)s) in the table body phase."),
    "unexpected-char-implies-table-voodoo":
        _(u"Unexpected non-space characters in "
          u"table context caused voodoo mode."),
    "unexpected-hidden-input-in-table":
        _(u"Unexpected input with type hidden in table context."),
    "unexpected-form-in-table":
        _(u"Unexpected form in table context."),
    "unexpected-start-tag-implies-table-voodoo":
        _(u"Unexpected start tag (%(name)s) in "
          u"table context caused voodoo mode."),
    "unexpected-end-tag-implies-table-voodoo":
        _(u"Unexpected end tag (%(name)s) in "
          u"table context caused voodoo mode."),
    "unexpected-cell-in-table-body":
        _(u"Unexpected table cell start tag (%(name)s) "
          u"in the table body phase."),
    "unexpected-cell-end-tag":
        _(u"Got table cell end tag (%(name)s) "
          u"while required end tags are missing."),
    "unexpected-end-tag-in-table-body":
        _(u"Unexpected end tag (%(name)s) in the table body phase. Ignored."),
    "unexpected-implied-end-tag-in-table-row":
        _(u"Unexpected implied end tag (%(name)s) in the table row phase."),
    "unexpected-end-tag-in-table-row":
        _(u"Unexpected end tag (%(name)s) in the table row phase. Ignored."),
    "unexpected-select-in-select":
        _(u"Unexpected select start tag in the select phase "
          u"treated as select end tag."),
    "unexpected-input-in-select":
        _(u"Unexpected input start tag in the select phase."),
    "unexpected-start-tag-in-select":
        _(u"Unexpected start tag token (%(name)s) in the select phase. "
          u"Ignored."),
    "unexpected-end-tag-in-select":
        _(u"Unexpected end tag (%(name)s) in the select phase. Ignored."),
    "unexpected-table-element-start-tag-in-select-in-table":
        _(u"Unexpected table element start tag (%(name)s) in the select in table phase."),
    "unexpected-table-element-end-tag-in-select-in-table":
        _(u"Unexpected table element end tag (%(name)s) in the select in table phase."),
    "unexpected-char-after-body":
        _(u"Unexpected non-space characters in the after body phase."),
    "unexpected-start-tag-after-body":
        _(u"Unexpected start tag token (%(name)s)"
          u" in the after body phase."),
    "unexpected-end-tag-after-body":
        _(u"Unexpected end tag token (%(name)s)"
          u" in the after body phase."),
    "unexpected-char-in-frameset":
        _(u"Unexpected characters in the frameset phase. Characters ignored."),
    "unexpected-start-tag-in-frameset":
        _(u"Unexpected start tag token (%(name)s)"
          u" in the frameset phase. Ignored."),
    "unexpected-frameset-in-frameset-innerhtml":
        _(u"Unexpected end tag token (frameset) "
          u"in the frameset phase (innerHTML)."),
    "unexpected-end-tag-in-frameset":
        _(u"Unexpected end tag token (%(name)s)"
          u" in the frameset phase. Ignored."),
    "unexpected-char-after-frameset":
        _(u"Unexpected non-space characters in the "
          u"after frameset phase. Ignored."),
    "unexpected-start-tag-after-frameset":
        _(u"Unexpected start tag (%(name)s)"
          u" in the after frameset phase. Ignored."),
    "unexpected-end-tag-after-frameset":
        _(u"Unexpected end tag (%(name)s)"
          u" in the after frameset phase. Ignored."),
    "unexpected-end-tag-after-body-innerhtml":
        _(u"Unexpected end tag after body(innerHtml)"),
    "expected-eof-but-got-char":
        _(u"Unexpected non-space characters. Expected end of file."),
    "expected-eof-but-got-start-tag":
        _(u"Unexpected start tag (%(name)s)"
          u". Expected end of file."),
    "expected-eof-but-got-end-tag":
        _(u"Unexpected end tag (%(name)s)"
          u". Expected end of file."),
    "eof-in-table":
        _(u"Unexpected end of file. Expected table content."),
    "eof-in-select":
        _(u"Unexpected end of file. Expected select content."),
    "eof-in-frameset":
        _(u"Unexpected end of file. Expected frameset content."),
    "eof-in-script-in-script":
        _(u"Unexpected end of file. Expected script content."),
    "non-void-element-with-trailing-solidus":
        _(u"Trailing solidus not allowed on element %(name)s"),
    "unexpected-html-element-in-foreign-content":
        _(u"Element %(name)s not allowed in a non-html context"),
    "unexpected-end-tag-before-html":
        _(u"Unexpected end tag (%(name)s) before html."),
    # Routed through _() like every other message in this table.
    "XXX-undefined-error":
        _(u"Undefined error (this sucks and should be fixed)"),
}

# Namespace prefix -> namespace URI.
namespaces = {
    "html": "http://www.w3.org/1999/xhtml",
    "mathml": "http://www.w3.org/1998/Math/MathML",
    "svg": "http://www.w3.org/2000/svg",
    "xlink": "http://www.w3.org/1999/xlink",
    "xml": "http://www.w3.org/XML/1998/namespace",
    "xmlns": "http://www.w3.org/2000/xmlns/"
}

# The three element sets below hold (namespace URI, local name) pairs.
scopingElements = frozenset((
    (namespaces["html"], "applet"),
    (namespaces["html"], "button"),
    (namespaces["html"], "caption"),
    (namespaces["html"], "html"),
    (namespaces["html"], "marquee"),
    (namespaces["html"], "object"),
    (namespaces["html"], "table"),
    (namespaces["html"], "td"),
    (namespaces["html"], "th"),
    (namespaces["svg"], "foreignObject")
))

formattingElements = frozenset((
    (namespaces["html"], "a"),
    (namespaces["html"], "b"),
    (namespaces["html"], "big"),
    (namespaces["html"], "code"),
    (namespaces["html"], "em"),
    (namespaces["html"], "font"),
    (namespaces["html"], "i"),
    (namespaces["html"], "nobr"),
    (namespaces["html"], "s"),
    (namespaces["html"], "small"),
    (namespaces["html"], "strike"),
    (namespaces["html"], "strong"),
    (namespaces["html"], "tt"),
    (namespaces["html"], "u")
))

specialElements = frozenset((
    (namespaces["html"], "address"),
    (namespaces["html"], "area"),
    (namespaces["html"], "article"),
    (namespaces["html"], "aside"),
    (namespaces["html"], "base"),
    (namespaces["html"], "basefont"),
    (namespaces["html"], "bgsound"),
    (namespaces["html"], "blockquote"),
    (namespaces["html"], "body"),
    (namespaces["html"], "br"),
    (namespaces["html"], "center"),
    (namespaces["html"], "col"),
    (namespaces["html"], "colgroup"),
    (namespaces["html"], "command"),
    (namespaces["html"], "datagrid"),
    (namespaces["html"], "dd"),
    (namespaces["html"], "details"),
    (namespaces["html"], "dialog"),
    (namespaces["html"], "dir"),
    (namespaces["html"], "div"),
    (namespaces["html"], "dl"),
    (namespaces["html"], "dt"),
    (namespaces["html"], "embed"),
    (namespaces["html"], "event-source"),
    (namespaces["html"], "fieldset"),
    (namespaces["html"], "figure"),
    (namespaces["html"], "footer"),
    (namespaces["html"], "form"),
    (namespaces["html"], "frame"),
    (namespaces["html"], "frameset"),
    (namespaces["html"], "h1"),
    (namespaces["html"], "h2"),
    (namespaces["html"], "h3"),
    (namespaces["html"], "h4"),
    (namespaces["html"], "h5"),
    (namespaces["html"], "h6"),
    (namespaces["html"], "head"),
    (namespaces["html"], "header"),
    (namespaces["html"], "hr"),
    (namespaces["html"], "iframe"),
    # Note that image is commented out in the spec as "this isn't an
    # element that can end up on the stack, so it doesn't matter,"
    (namespaces["html"], "image"),
    (namespaces["html"], "img"),
    (namespaces["html"], "input"),
    (namespaces["html"], "isindex"),
    (namespaces["html"], "li"),
    (namespaces["html"], "link"),
    (namespaces["html"], "listing"),
    (namespaces["html"], "menu"),
    (namespaces["html"], "meta"),
    (namespaces["html"], "nav"),
    (namespaces["html"], "noembed"),
    (namespaces["html"], "noframes"),
    (namespaces["html"], "noscript"),
    (namespaces["html"], "ol"),
    (namespaces["html"], "optgroup"),
    (namespaces["html"], "option"),
    (namespaces["html"], "p"),
    (namespaces["html"], "param"),
    (namespaces["html"], "plaintext"),
    (namespaces["html"], "pre"),
    (namespaces["html"], "script"),
    (namespaces["html"], "section"),
    (namespaces["html"], "select"),
    (namespaces["html"], "spacer"),
    (namespaces["html"], "style"),
    (namespaces["html"], "tbody"),
    (namespaces["html"], "textarea"),
    (namespaces["html"], "tfoot"),
    (namespaces["html"], "thead"),
    (namespaces["html"], "title"),
    (namespaces["html"], "tr"),
    (namespaces["html"], "ul"),
    (namespaces["html"], "wbr")
))

# Tab, line feed, form feed, space and carriage return.
spaceCharacters = frozenset((
    u"\t",
    u"\n",
    u"\u000C",
    u" ",
    u"\r"
))

tableInsertModeElements = frozenset((
    "table",
    "tbody",
    "tfoot",
    "thead",
    "tr"
))

# Character classes as sets for fast membership tests.
asciiLowercase = frozenset(string.ascii_lowercase)
asciiUppercase = frozenset(string.ascii_uppercase)
asciiLetters = frozenset(string.ascii_letters)
digits = frozenset(string.digits)
hexDigits = frozenset(string.hexdigits)

# Code point of each ASCII uppercase letter -> code point of its lowercase
# form (the shape accepted by unicode.translate / str.translate).
asciiUpper2Lower = dict([(ord(c), ord(c.lower()))
                         for c in string.ascii_uppercase])

# Heading elements need to be ordered
headingElements = (
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6"
)

voidElements = frozenset((
    "base",
    "command",
    "event-source",
    "link",
    "meta",
    "hr",
    "br",
    "img",
    "embed",
    "param",
    "area",
    "col",
    "input",
    "source"
))

cdataElements = frozenset(('title', 'textarea'))

rcdataElements = frozenset((
    'style',
    'script',
    'xmp',
    'iframe',
    'noembed',
    'noframes',
    'noscript'
))

# Element name -> set of attribute names; the "" key is not tied to a
# particular element name.
booleanAttributes = {
    "": frozenset(("irrelevant",)),
    "style": frozenset(("scoped",)),
    "img": frozenset(("ismap",)),
    "audio": frozenset(("autoplay", "controls")),
    "video": frozenset(("autoplay", "controls")),
    "script": frozenset(("defer", "async")),
    "details": frozenset(("open",)),
    "datagrid": frozenset(("multiple", "disabled")),
    "command": frozenset(("hidden", "disabled", "checked", "default")),
    "menu": frozenset(("autosubmit",)),
    "fieldset": frozenset(("disabled", "readonly")),
    "option": frozenset(("disabled", "readonly", "selected")),
    "optgroup": frozenset(("disabled", "readonly")),
    "button": frozenset(("disabled", "autofocus")),
    "input": frozenset(("disabled", "readonly", "required", "autofocus",
                        "checked", "ismap")),
    "select": frozenset(("disabled", "readonly", "autofocus", "multiple")),
    "output": frozenset(("disabled", "readonly")),
}

# entitiesWindows1252 has to be _ordered_ and needs to have an index. It
# therefore can't be a frozenset.
entitiesWindows1252 = (
    8364,   # 0x80  0x20AC  EURO SIGN
    65533,  # 0x81          UNDEFINED
    8218,   # 0x82  0x201A  SINGLE LOW-9 QUOTATION MARK
    402,    # 0x83  0x0192  LATIN SMALL LETTER F WITH HOOK
    8222,   # 0x84  0x201E  DOUBLE LOW-9 QUOTATION MARK
    8230,   # 0x85  0x2026  HORIZONTAL ELLIPSIS
    8224,   # 0x86  0x2020  DAGGER
    8225,   # 0x87  0x2021  DOUBLE DAGGER
    710,    # 0x88  0x02C6  MODIFIER LETTER CIRCUMFLEX ACCENT
    8240,   # 0x89  0x2030  PER MILLE SIGN
    352,    # 0x8A  0x0160  LATIN CAPITAL LETTER S WITH CARON
    8249,   # 0x8B  0x2039  SINGLE LEFT-POINTING ANGLE QUOTATION MARK
    338,    # 0x8C  0x0152  LATIN CAPITAL LIGATURE OE
    65533,  # 0x8D          UNDEFINED
    381,    # 0x8E  0x017D  LATIN CAPITAL LETTER Z WITH CARON
    65533,  # 0x8F          UNDEFINED
    65533,  # 0x90          UNDEFINED
    8216,   # 0x91  0x2018  LEFT SINGLE QUOTATION MARK
    8217,   # 0x92  0x2019  RIGHT SINGLE QUOTATION MARK
    8220,   # 0x93  0x201C  LEFT DOUBLE QUOTATION MARK
    8221,   # 0x94  0x201D  RIGHT DOUBLE QUOTATION MARK
    8226,   # 0x95  0x2022  BULLET
    8211,   # 0x96  0x2013  EN DASH
    8212,   # 0x97  0x2014  EM DASH
    732,    # 0x98  0x02DC  SMALL TILDE
    8482,   # 0x99  0x2122  TRADE MARK SIGN
    353,    # 0x9A  0x0161  LATIN SMALL LETTER S WITH CARON
    8250,   # 0x9B  0x203A  SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
    339,    # 0x9C  0x0153  LATIN SMALL LIGATURE OE
    65533,  # 0x9D          UNDEFINED
    382,    # 0x9E  0x017E  LATIN SMALL LETTER Z WITH CARON
    376     # 0x9F  0x0178  LATIN CAPITAL LETTER Y WITH DIAERESIS
)

# Named character reference (without the leading "&") -> character.  Some
# names appear both with and without the trailing ";".
entities = {
    "AElig;": u"\u00C6",
    "AElig": u"\u00C6",
    "AMP;": u"\u0026",
    "AMP": u"\u0026",
    "Aacute;": u"\u00C1",
    "Aacute": u"\u00C1",
    "Acirc;": u"\u00C2",
    "Acirc": u"\u00C2",
    "Agrave;": u"\u00C0",
    "Agrave": u"\u00C0",
    "Alpha;": u"\u0391",
    "Aring;": u"\u00C5",
    "Aring": u"\u00C5",
    "Atilde;": u"\u00C3",
    "Atilde": u"\u00C3",
    "Auml;": u"\u00C4",
    "Auml": u"\u00C4",
    "Beta;": u"\u0392",
    "COPY;": u"\u00A9",
    "COPY": u"\u00A9",
    "Ccedil;": u"\u00C7",
    "Ccedil": u"\u00C7",
    "Chi;": u"\u03A7",
    "Dagger;": u"\u2021",
    "Delta;": u"\u0394",
    "ETH;": u"\u00D0",
    "ETH": u"\u00D0",
    "Eacute;": u"\u00C9",
    "Eacute": u"\u00C9",
    "Ecirc;": u"\u00CA",
    "Ecirc": u"\u00CA",
    "Egrave;": u"\u00C8",
    "Egrave": u"\u00C8",
    "Epsilon;": u"\u0395",
    "Eta;": u"\u0397",
    "Euml;": u"\u00CB",
    "Euml": u"\u00CB",
    "GT;": u"\u003E",
    "GT": u"\u003E",
    "Gamma;": u"\u0393",
    "Iacute;": u"\u00CD",
    "Iacute": u"\u00CD",
    "Icirc;": u"\u00CE",
    "Icirc": u"\u00CE",
    "Igrave;": u"\u00CC",
    "Igrave": u"\u00CC",
    "Iota;": u"\u0399",
    "Iuml;": u"\u00CF",
    "Iuml": u"\u00CF",
    "Kappa;": u"\u039A",
    "LT;": u"\u003C",
    "LT": u"\u003C",
    "Lambda;": u"\u039B",
    "Mu;": u"\u039C",
    "Ntilde;": u"\u00D1",
    "Ntilde": u"\u00D1",
    "Nu;": u"\u039D",
    "OElig;": u"\u0152",
    "Oacute;": u"\u00D3",
    "Oacute": u"\u00D3",
    "Ocirc;": u"\u00D4",
    "Ocirc": u"\u00D4",
    "Ograve;": u"\u00D2",
    "Ograve": u"\u00D2",
    "Omega;": u"\u03A9",
    "Omicron;": u"\u039F",
    "Oslash;": u"\u00D8",
    "Oslash": u"\u00D8",
    "Otilde;": u"\u00D5",
    "Otilde": u"\u00D5",
    "Ouml;": u"\u00D6",
    "Ouml": u"\u00D6",
    "Phi;": u"\u03A6",
    "Pi;": u"\u03A0",
    "Prime;": u"\u2033",
    "Psi;": u"\u03A8",
    "QUOT;": u"\u0022",
    "QUOT": u"\u0022",
    "REG;": u"\u00AE",
    "REG": u"\u00AE",
    "Rho;": u"\u03A1",
    "Scaron;": u"\u0160",
    "Sigma;": u"\u03A3",
    "THORN;": u"\u00DE",
    "THORN": u"\u00DE",
    "TRADE;": u"\u2122",
    "Tau;": u"\u03A4",
    "Theta;": u"\u0398",
    "Uacute;": u"\u00DA",
    "Uacute": u"\u00DA",
    "Ucirc;": u"\u00DB",
    "Ucirc": u"\u00DB",
    "Ugrave;": u"\u00D9",
    "Ugrave": u"\u00D9",
    "Upsilon;": u"\u03A5",
    "Uuml;": u"\u00DC",
    "Uuml": u"\u00DC",
    "Xi;": u"\u039E",
    "Yacute;": u"\u00DD",
    "Yacute": u"\u00DD",
    "Yuml;": u"\u0178",
    "Zeta;": u"\u0396",
    "aacute;": u"\u00E1",
    "aacute": u"\u00E1",
    "acirc;": u"\u00E2",
    "acirc": u"\u00E2",
    "acute;": u"\u00B4",
    "acute": u"\u00B4",
    "aelig;": u"\u00E6",
    "aelig": u"\u00E6",
    "agrave;": u"\u00E0",
    "agrave": u"\u00E0",
    "alefsym;": u"\u2135",
    "alpha;": u"\u03B1",
    "amp;": u"\u0026",
    "amp": u"\u0026",
    "and;": u"\u2227",
    "ang;": u"\u2220",
    "apos;": u"\u0027",
    "aring;": u"\u00E5",
    "aring": u"\u00E5",
    "asymp;": u"\u2248",
    "atilde;": u"\u00E3",
    "atilde": u"\u00E3",
    "auml;": u"\u00E4",
    "auml": u"\u00E4",
    "bdquo;": u"\u201E",
    "beta;": u"\u03B2",
    "brvbar;": u"\u00A6",
    "brvbar": u"\u00A6",
    "bull;": u"\u2022",
    "cap;": u"\u2229",
    "ccedil;": u"\u00E7",
    "ccedil": u"\u00E7",
    "cedil;": u"\u00B8",
    "cedil": u"\u00B8",
    "cent;": u"\u00A2",
    "cent": u"\u00A2",
    "chi;": u"\u03C7",
    "circ;": u"\u02C6",
    "clubs;": u"\u2663",
    "cong;": u"\u2245",
    "copy;": u"\u00A9",
    "copy": u"\u00A9",
    "crarr;": u"\u21B5",
    "cup;": u"\u222A",
    "curren;": u"\u00A4",
    "curren": u"\u00A4",
    "dArr;": u"\u21D3",
    "dagger;": u"\u2020",
    "darr;": u"\u2193",
    "deg;": u"\u00B0",
    "deg": u"\u00B0",
    "delta;": u"\u03B4",
    "diams;": u"\u2666",
    "divide;": u"\u00F7",
    "divide": u"\u00F7",
    "eacute;": u"\u00E9",
    "eacute": u"\u00E9",
    "ecirc;": u"\u00EA",
    "ecirc": u"\u00EA",
    "egrave;": u"\u00E8",
    "egrave": u"\u00E8",
    "empty;": u"\u2205",
    "emsp;": u"\u2003",
    "ensp;": u"\u2002",
    "epsilon;": u"\u03B5",
    "equiv;": u"\u2261",
    "eta;": u"\u03B7",
    "eth;": u"\u00F0",
    "eth": u"\u00F0",
    "euml;": u"\u00EB",
    "euml": u"\u00EB",
    "euro;": u"\u20AC",
    "exist;": u"\u2203",
    "fnof;": u"\u0192",
    "forall;": u"\u2200",
    "frac12;": u"\u00BD",
    "frac12": u"\u00BD",
    "frac14;": u"\u00BC",
    "frac14": u"\u00BC",
    "frac34;": u"\u00BE",
    "frac34": u"\u00BE",
    "frasl;": u"\u2044",
    "gamma;": u"\u03B3",
    "ge;": u"\u2265",
    "gt;": u"\u003E",
    "gt": u"\u003E",
    "hArr;": u"\u21D4",
    "harr;": u"\u2194",
    "hearts;": u"\u2665",
    "hellip;": u"\u2026",
    "iacute;": u"\u00ED",
    "iacute": u"\u00ED",
    "icirc;": u"\u00EE",
    "icirc": u"\u00EE",
    "iexcl;": u"\u00A1",
    "iexcl": u"\u00A1",
    "igrave;": u"\u00EC",
    "igrave": u"\u00EC",
    "image;": u"\u2111",
    "infin;": u"\u221E",
    "int;": u"\u222B",
    "iota;": u"\u03B9",
    "iquest;": u"\u00BF",
    "iquest": u"\u00BF",
    "isin;": u"\u2208",
    "iuml;": u"\u00EF",
    "iuml": u"\u00EF",
    "kappa;": u"\u03BA",
    "lArr;": u"\u21D0",
    "lambda;": u"\u03BB",
    "lang;": u"\u27E8",
    "laquo;": u"\u00AB",
    "laquo": u"\u00AB",
    "larr;": u"\u2190",
    "lceil;": u"\u2308",
    "ldquo;": u"\u201C",
    "le;": u"\u2264",
    "lfloor;": u"\u230A",
    "lowast;": u"\u2217",
    "loz;": u"\u25CA",
    "lrm;": u"\u200E",
    "lsaquo;": u"\u2039",
    "lsquo;": u"\u2018",
    "lt;": u"\u003C",
    "lt": u"\u003C",
    "macr;": u"\u00AF",
    "macr": u"\u00AF",
    "mdash;": u"\u2014",
    "micro;": u"\u00B5",
    "micro": u"\u00B5",
    "middot;": u"\u00B7",
    "middot": u"\u00B7",
    "minus;": u"\u2212",
    "mu;": u"\u03BC",
    "nabla;": u"\u2207",
    "nbsp;": u"\u00A0",
    "nbsp": u"\u00A0",
    "ndash;": u"\u2013",
    "ne;": u"\u2260",
    "ni;": u"\u220B",
    "not;": u"\u00AC",
    "not": u"\u00AC",
    "notin;": u"\u2209",
    "nsub;": u"\u2284",
    "ntilde;": u"\u00F1",
    "ntilde": u"\u00F1",
    "nu;": u"\u03BD",
    "oacute;": u"\u00F3",
    "oacute": u"\u00F3",
    "ocirc;": u"\u00F4",
    "ocirc": u"\u00F4",
    "oelig;": u"\u0153",
    "ograve;": u"\u00F2",
    "ograve": u"\u00F2",
    "oline;": u"\u203E",
    "omega;": u"\u03C9",
    "omicron;": u"\u03BF",
    "oplus;": u"\u2295",
    "or;": u"\u2228",
    "ordf;": u"\u00AA",
    "ordf": u"\u00AA",
    "ordm;": u"\u00BA",
    "ordm": u"\u00BA",
    "oslash;": u"\u00F8",
    "oslash": u"\u00F8",
    "otilde;": u"\u00F5",
    "otilde": u"\u00F5",
    "otimes;": u"\u2297",
    "ouml;": u"\u00F6",
    "ouml": u"\u00F6",
    "para;": u"\u00B6",
    "para": u"\u00B6",
    "part;": u"\u2202",
    "permil;": u"\u2030",
    "perp;": u"\u22A5",
    "phi;": u"\u03C6",
    "pi;": u"\u03C0",
    "piv;": u"\u03D6",
    "plusmn;": u"\u00B1",
    "plusmn": u"\u00B1",
    "pound;": u"\u00A3",
    "pound": u"\u00A3",
    "prime;": u"\u2032",
    "prod;": u"\u220F",
    "prop;": u"\u221D",
    "psi;": u"\u03C8",
    "quot;": u"\u0022",
    "quot": u"\u0022",
    "rArr;": u"\u21D2",
    "radic;": u"\u221A",
    "rang;": u"\u27E9",
    "raquo;": u"\u00BB",
    "raquo": u"\u00BB",
    "rarr;": u"\u2192",
    "rceil;": u"\u2309",
    "rdquo;": u"\u201D",
    "real;": u"\u211C",
    "reg;": u"\u00AE",
    "reg": u"\u00AE",
    "rfloor;": u"\u230B",
    "rho;": u"\u03C1",
    "rlm;": u"\u200F",
    "rsaquo;": u"\u203A",
    "rsquo;": u"\u2019",
    "sbquo;": u"\u201A",
    "scaron;": u"\u0161",
    "sdot;": u"\u22C5",
    "sect;": u"\u00A7",
    "sect": u"\u00A7",
    "shy;": u"\u00AD",
    "shy": u"\u00AD",
    "sigma;": u"\u03C3",
    "sigmaf;": u"\u03C2",
    "sim;": u"\u223C",
    "spades;": u"\u2660",
    "sub;": u"\u2282",
    "sube;": u"\u2286",
    "sum;": u"\u2211",
    "sup1;": u"\u00B9",
    "sup1": u"\u00B9",
    "sup2;": u"\u00B2",
    "sup2": u"\u00B2",
    "sup3;": u"\u00B3",
    "sup3": u"\u00B3",
    "sup;": u"\u2283",
    "supe;": u"\u2287",
    "szlig;": u"\u00DF",
    "szlig": u"\u00DF",
    "tau;": u"\u03C4",
    "there4;": u"\u2234",
    "theta;": u"\u03B8",
    "thetasym;": u"\u03D1",
    "thinsp;": u"\u2009",
    "thorn;": u"\u00FE",
    "thorn": u"\u00FE",
    "tilde;": u"\u02DC",
    "times;": u"\u00D7",
    "times": u"\u00D7",
    "trade;": u"\u2122",
    "uArr;": u"\u21D1",
    "uacute;": u"\u00FA",
    "uacute": u"\u00FA",
    "u​arr;": u"\u2191",
    "ucirc;": u"\u00FB",
    "ucirc": u"\u00FB",
    "ugrave;": u"\u00F9",
    "ugrave": u"\u00F9",
    "uml;": u"\u00A8",
    "uml": u"\u00A8",
    "upsih;": u"\u03D2",
    "upsilon;": u"\u03C5",
    "uuml;": u"\u00FC",
    "uuml": u"\u00FC",
    "weierp;": u"\u2118",
    "xi;": u"\u03BE",
    "yacute;": u"\u00FD",
    "yacute": u"\u00FD",
    "yen;": u"\u00A5",
    "yen": u"\u00A5",
    "yuml;": u"\u00FF",
    "yuml": u"\u00FF",
    "zeta;": u"\u03B6",
    "zwj;": u"\u200D",
    "zwnj;": u"\u200C"
}

# Code point -> replacement character.  Each code point is listed exactly
# once; a repeated 0x81 entry carrying the same value has been removed.
replacementCharacters = {
    0x0: u"\uFFFD",
    0x0d: u"\u000A",
    0x80: u"\u20AC",
    0x81: u"\u0081",
    0x82: u"\u201A",
    0x83: u"\u0192",
    0x84: u"\u201E",
    0x85: u"\u2026",
    0x86: u"\u2020",
    0x87: u"\u2021",
    0x88: u"\u02C6",
    0x89: u"\u2030",
    0x8A: u"\u0160",
    0x8B: u"\u2039",
    0x8C: u"\u0152",
    0x8D: u"\u008D",
    0x8E: u"\u017D",
    0x8F: u"\u008F",
    0x90: u"\u0090",
    0x91: u"\u2018",
    0x92: u"\u2019",
    0x93: u"\u201C",
    0x94: u"\u201D",
    0x95: u"\u2022",
    0x96: u"\u2013",
    0x97: u"\u2014",
    0x98: u"\u02DC",
    0x99: u"\u2122",
    0x9A: u"\u0161",
    0x9B: u"\u203A",
    0x9C: u"\u0153",
    0x9D: u"\u009D",
    0x9E: u"\u017E",
    0x9F: u"\u0178",
}

# Encoding label (lowercase, mostly with punctuation stripped) -> codec name.
encodings = {
    '437': 'cp437',
    '850': 'cp850',
    '852': 'cp852',
    '855': 'cp855',
    '857': 'cp857',
    '860': 'cp860',
    '861': 'cp861',
    '862': 'cp862',
    '863': 'cp863',
    '865': 'cp865',
    '866': 'cp866',
    '869': 'cp869',
    'ansix341968': 'ascii',
    'ansix341986': 'ascii',
    'arabic': 'iso8859-6',
    'ascii': 'ascii',
    'asmo708': 'iso8859-6',
    'big5': 'big5',
    'big5hkscs': 'big5hkscs',
    'chinese': 'gbk',
    'cp037': 'cp037',
    'cp1026': 'cp1026',
    'cp154': 'ptcp154',
    'cp367': 'ascii',
    'cp424': 'cp424',
    'cp437': 'cp437',
    'cp500': 'cp500',
    'cp775': 'cp775',
    'cp819': 'windows-1252',
    'cp850': 'cp850',
    'cp852': 'cp852',
    'cp855': 'cp855',
    'cp857': 'cp857',
    'cp860': 'cp860',
    'cp861': 'cp861',
    'cp862': 'cp862',
    'cp863': 'cp863',
    'cp864': 'cp864',
    'cp865': 'cp865',
    'cp866': 'cp866',
    'cp869': 'cp869',
    'cp936': 'gbk',
    'cpgr': 'cp869',
    'cpis': 'cp861',
    'csascii': 'ascii',
    'csbig5': 'big5',
    'cseuckr': 'cp949',
    'cseucpkdfmtjapanese': 'euc_jp',
    'csgb2312': 'gbk',
    'cshproman8': 'hp-roman8',
    'csibm037': 'cp037',
    'csibm1026': 'cp1026',
    'csibm424': 'cp424',
    'csibm500': 'cp500',
    'csibm855': 'cp855',
    'csibm857': 'cp857',
    'csibm860': 'cp860',
    'csibm861': 'cp861',
    'csibm863': 'cp863',
    'csibm864': 'cp864',
    'csibm865': 'cp865',
    'csibm866': 'cp866',
    'csibm869': 'cp869',
    'csiso2022jp': 'iso2022_jp',
    'csiso2022jp2': 'iso2022_jp_2',
    'csiso2022kr': 'iso2022_kr',
    'csiso58gb231280': 'gbk',
    'csisolatin1': 'windows-1252',
    'csisolatin2': 'iso8859-2',
    'csisolatin3': 'iso8859-3',
    'csisolatin4': 'iso8859-4',
    'csisolatin5': 'windows-1254',
    'csisolatin6': 'iso8859-10',
    'csisolatinarabic': 'iso8859-6',
    'csisolatincyrillic': 'iso8859-5',
    'csisolatingreek': 'iso8859-7',
    'csisolatinhebrew': 'iso8859-8',
    'cskoi8r': 'koi8-r',
    'csksc56011987': 'cp949',
    'cspc775baltic': 'cp775',
    'cspc850multilingual': 'cp850',
    'cspc862latinhebrew': 'cp862',
    'cspc8codepage437': 'cp437',
    'cspcp852': 'cp852',
    'csptcp154': 'ptcp154',
    'csshiftjis': 'shift_jis',
    'csunicode11utf7': 'utf-7',
    'cyrillic': 'iso8859-5',
    'cyrillicasian': 'ptcp154',
    'ebcdiccpbe': 'cp500',
    'ebcdiccpca': 'cp037',
    'ebcdiccpch': 'cp500',
    'ebcdiccphe': 'cp424',
    'ebcdiccpnl': 'cp037',
    'ebcdiccpus': 'cp037',
    'ebcdiccpwt': 'cp037',
    'ecma114': 'iso8859-6',
    'ecma118': 'iso8859-7',
    'elot928': 'iso8859-7',
    'eucjp': 'euc_jp',
    'euckr': 'cp949',
    'extendedunixcodepackedformatforjapanese': 'euc_jp',
    'gb18030': 'gb18030',
    'gb2312': 'gbk',
    'gb231280': 'gbk',
    'gbk': 'gbk',
    'greek': 'iso8859-7',
    'greek8': 'iso8859-7',
    'hebrew': 'iso8859-8',
    'hproman8': 'hp-roman8',
    'hzgb2312': 'hz',
    'ibm037': 'cp037',
    'ibm1026': 'cp1026',
    'ibm367': 'ascii',
    'ibm424': 'cp424',
    'ibm437': 'cp437',
    'ibm500': 'cp500',
    'ibm775': 'cp775',
    'ibm819': 'windows-1252',
    'ibm850': 'cp850',
    'ibm852': 'cp852',
    'ibm855': 'cp855',
    'ibm857': 'cp857',
    'ibm860': 'cp860',
    'ibm861': 'cp861',
    'ibm862': 'cp862',
    'ibm863': 'cp863',
    'ibm864': 'cp864',
    'ibm865': 'cp865',
    'ibm866': 'cp866',
    'ibm869': 'cp869',
    'iso2022jp': 'iso2022_jp',
    'iso2022jp2': 'iso2022_jp_2',
    'iso2022kr': 'iso2022_kr',
    'iso646irv1991': 'ascii',
    'iso646us': 'ascii',
    'iso88591': 'windows-1252',
    'iso885910': 'iso8859-10',
    'iso8859101992': 'iso8859-10',
    'iso885911987': 'windows-1252',
    'iso885913': 'iso8859-13',
    'iso885914': 'iso8859-14',
    'iso8859141998': 'iso8859-14',
    'iso885915': 'iso8859-15',
    'iso885916': 'iso8859-16',
    'iso8859162001': 'iso8859-16',
    'iso88592': 'iso8859-2',
    'iso885921987': 'iso8859-2',
    'iso88593': 'iso8859-3',
    'iso885931988': 'iso8859-3',
    'iso88594': 'iso8859-4',
    'iso885941988': 'iso8859-4',
    'iso88595': 'iso8859-5',
    'iso885951988': 'iso8859-5',
    'iso88596': 'iso8859-6',
    'iso885961987': 'iso8859-6',
    'iso88597': 'iso8859-7',
    'iso885971987': 'iso8859-7',
    'iso88598': 'iso8859-8',
    'iso885981988': 'iso8859-8',
    'iso88599': 'windows-1254',
    'iso885991989': 'windows-1254',
    'isoceltic': 'iso8859-14',
    'isoir100': 'windows-1252',
    'isoir101': 'iso8859-2',
    'isoir109': 'iso8859-3',
    'isoir110': 'iso8859-4',
    'isoir126': 'iso8859-7',
    'isoir127': 'iso8859-6',
    'isoir138': 'iso8859-8',
    'isoir144': 'iso8859-5',
    'isoir148': 'windows-1254',
    'isoir149': 'cp949',
    'isoir157': 'iso8859-10',
    'isoir199': 'iso8859-14',
    'isoir226': 'iso8859-16',
    'isoir58': 'gbk',
    'isoir6': 'ascii',
    'koi8r': 'koi8-r',
    'koi8u': 'koi8-u',
    'korean': 'cp949',
    'ksc5601': 'cp949',
    'ksc56011987': 'cp949',
    'ksc56011989': 'cp949',
    'l1': 'windows-1252',
    'l10': 'iso8859-16',
    'l2': 'iso8859-2',
    'l3': 'iso8859-3',
    'l4': 'iso8859-4',
    'l5': 'windows-1254',
    'l6': 'iso8859-10',
    'l8': 'iso8859-14',
    'latin1': 'windows-1252',
    'latin10': 'iso8859-16',
    'latin2': 'iso8859-2',
    'latin3': 'iso8859-3',
    'latin4': 'iso8859-4',
    'latin5': 'windows-1254',
    'latin6': 'iso8859-10',
    'latin8': 'iso8859-14',
    'latin9': 'iso8859-15',
    'ms936': 'gbk',
    'mskanji': 'shift_jis',
    'pt154': 'ptcp154',
    'ptcp154': 'ptcp154',
    'r8': 'hp-roman8',
    'roman8': 'hp-roman8',
    'shiftjis': 'shift_jis',
    'tis620': 'cp874',
    'unicode11utf7': 'utf-7',
    'us': 'ascii',
    'usascii': 'ascii',
    'utf16': 'utf-16',
    'utf16be': 'utf-16-be',
    'utf16le': 'utf-16-le',
    'utf8': 'utf-8',
    'windows1250': 'cp1250',
    'windows1251': 'cp1251',
    'windows1252': 'cp1252',
    'windows1253': 'cp1253',
    'windows1254': 'cp1254',
    'windows1255': 'cp1255',
    'windows1256': 'cp1256',
    'windows1257': 'cp1257',
    'windows1258': 'cp1258',
    'windows936': 'gbk',
    'x-x-big5': 'big5'}

# Token type name -> integer code.
tokenTypes = {
    "Doctype": 0,
    "Characters": 1,
    "SpaceCharacters": 2,
    "StartTag": 3,
    "EndTag": 4,
    "EmptyTag": 5,
    "Comment": 6,
    "ParseError": 7
}

# The token type codes that represent tags.
tagTokenTypes = frozenset((tokenTypes["StartTag"], tokenTypes["EndTag"],
                           tokenTypes["EmptyTag"]))

# Reverse of `namespaces`: namespace URI -> prefix.  dict.items() is used
# instead of the Python 2-only dict.iteritems() so the module imports on both
# Python 2 and Python 3.
prefixes = dict([(v, k) for k, v in namespaces.items()])
# The MathML URI is deliberately mapped to "math" rather than "mathml".
prefixes["http://www.w3.org/1998/Math/MathML"] = "math"


# Warning category; presumably issued when data cannot be represented
# losslessly (usage is outside this file -- confirm against callers).
class DataLossWarning(UserWarning):
    pass


# Exception type; presumably raised to request that parsing be restarted
# (usage is outside this file -- confirm against callers).
class ReparseException(Exception):
    pass