001 /* 002 * Copyright (c) 2005, 2006 Henri Sivonen 003 * 004 * Permission is hereby granted, free of charge, to any person obtaining a 005 * copy of this software and associated documentation files (the "Software"), 006 * to deal in the Software without restriction, including without limitation 007 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 008 * and/or sell copies of the Software, and to permit persons to whom the 009 * Software is furnished to do so, subject to the following conditions: 010 * 011 * The above copyright notice and this permission notice shall be included in 012 * all copies or substantial portions of the Software. 013 * 014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 020 * DEALINGS IN THE SOFTWARE. 
021 */ 022 023 package fi.iki.hsivonen.htmlparser; 024 025 import java.io.BufferedInputStream; 026 import java.io.BufferedReader; 027 import java.io.IOException; 028 import java.io.InputStream; 029 import java.io.InputStreamReader; 030 import java.io.Reader; 031 import java.nio.charset.CharacterCodingException; 032 import java.nio.charset.Charset; 033 import java.nio.charset.CharsetDecoder; 034 import java.nio.charset.CodingErrorAction; 035 import java.nio.charset.IllegalCharsetNameException; 036 import java.nio.charset.UnsupportedCharsetException; 037 038 import org.xml.sax.ContentHandler; 039 import org.xml.sax.DTDHandler; 040 import org.xml.sax.EntityResolver; 041 import org.xml.sax.ErrorHandler; 042 import org.xml.sax.InputSource; 043 import org.xml.sax.Locator; 044 import org.xml.sax.SAXException; 045 import org.xml.sax.SAXNotRecognizedException; 046 import org.xml.sax.SAXNotSupportedException; 047 import org.xml.sax.SAXParseException; 048 import org.xml.sax.XMLReader; 049 import org.xml.sax.helpers.DefaultHandler; 050 051 import fi.iki.hsivonen.io.EncodingInfo; 052 import fi.iki.hsivonen.io.NonBufferingAsciiInputStreamReader; 053 import fi.iki.hsivonen.xml.AttributesImpl; 054 import fi.iki.hsivonen.xml.ContentHandlerFilter; 055 import fi.iki.hsivonen.xml.EmptyAttributes; 056 import fi.iki.hsivonen.xml.SilentDraconianErrorHandler; 057 import fi.iki.hsivonen.xml.XhtmlSaxEmitter; 058 import fi.iki.hsivonen.xml.checker.NormalizationChecker; 059 060 /** 061 * WARNING: This parser is incomplete. It does not perform tag inference, yet. It does not yet perform 062 * case folding for attribute value like method="POST". 
*
 * @version $Id: HtmlParser.java,v 1.20 2006/11/21 10:13:24 hsivonen Exp $
 * @author hsivonen
 */
public final class HtmlParser implements XMLReader, Locator {

    /** Bit that is OR-ed into an ASCII upper case letter to obtain its lower case form. */
    private static final int CASE_MASK = (1 << 5);

    /** Added to (code point >> 10) to obtain the high (lead) surrogate of an astral character. */
    private static final int LEAD_OFFSET = 0xD800 - (0x10000 >> 10);

    /** Added to (high << 10) + low to recover the code point of a surrogate pair. */
    private static final int SURROGATE_OFFSET = 0x10000 - (0xD800 << 10) - 0xDC00;

    /** Reusable one-character array holding a less-than sign. */
    private static final char[] LT = { '<' };

    /** Reusable expansion of the apos entity. */
    private static final char[] APOS = { '\'' };

    /** Lower case remainder of "doctype" once the initial letter has been consumed. */
    private static final char[] OCTYPE = "octype".toCharArray();

    /** Lower case remainder of "html" once the initial letter has been consumed. */
    private static final char[] TML = "tml".toCharArray();

    /** Lower case remainder of "public" once the initial letter has been consumed. */
    private static final char[] UBLIC = "ublic".toCharArray();

    /** Value of cdataState while ordinary markup is recognized. */
    private static final int PCDATA = 0;

    /** Value of cdataState while inside a script element. */
    private static final int SCRIPT = 1;

    /** Value of cdataState while inside a style element. */
    private static final int STYLE = 2;

    /** Public id reported through the Locator interface. */
    private String publicId;

    /** System id reported through the Locator interface. */
    private String systemId;

    /** Whether non-white-space character data is accepted (and buffered text flushed) at this point. */
    private boolean nonWhiteSpaceAllowed;

    /** One of PCDATA, SCRIPT or STYLE. */
    private int cdataState;

    /** Receiver of warnings, errors and fatal errors. */
    private ErrorHandler eh;

    /** First handler of the internal filter pipeline; receives flushed character data. */
    private ContentHandler ch;

    /** Notified about the detected doctype when doctypeMode is DoctypeHandler.ANY_DOCTYPE; may be null. */
    private DoctypeHandler doctypeHandler;

    /** Emits start tags, end tags and character reference expansions into the pipeline. */
    private XhtmlSaxEmitter emitter;

    /** Source of decoded characters. */
    private Reader reader;

    /** Index in buf of the character most recently returned. */
    private int pos;

    /** Index in buf where pending, not yet flushed character data starts, or -1 if there is none. */
    private int cstart;

    /** Read buffer filled from reader. */
    private char[] buf = new char[2048];

    /** Number of valid characters in buf; -1 once the reader has reported end of stream. */
    private int bufLen;

    /** Current line number reported through the Locator interface. */
    private int line;

    /** Current column number reported through the Locator interface. */
    private int col;

    /** Whether a doctype has been seen or its absence has already been reported. */
    private boolean doctypeSeen;

    /** Expected doctype; one of the DoctypeHandler constants. */
    private int doctypeMode;

    /** Whether the HTML5 doctype was seen; selects the entity table used for entity references. */
    private boolean html5;

    /** Previously read character as tracked for CR/LF and surrogate pair handling. */
    private char prev;

    /** In script/style content: whether the previous character was a less-than sign. */
    private boolean wasLt;

    /** Accumulates element names, attribute names, entity names and character reference digits. */
    private char[] strBuf = new char[64];

    /** Number of characters used in strBuf. */
    private int strBufLen = 0;

    /** Accumulates attribute values and quoted doctype strings. */
    private char[] attrBuf = new char[1024];

    /** Number of characters used in attrBuf. */
    private int attrBufLen = 0;

    /** Attribute holder that is cleared and refilled for each start tag with attributes. */
    private AttributesImpl attrs = new AttributesImpl();

    /** Reused return value for numeric character references in the Basic Multilingual Plane. */
    private char[] bmpChar = { '\u0000' };

    /** Reused return value (surrogate pair) for astral numeric character references. */
    private char[] astralChar = { '\u0000', '\u0000' };

    /** DTD handler; only stored and returned. */
    private DTDHandler dtdHandler;

    /** First filter of the pipeline. */
    private EmptyElementFilter eef;

    /** Second filter of the pipeline. */
    private TagInferenceFilter tif;

    private
CharacterEncodingDeclarationFilter cedf; 152 153 private ContentHandlerFilter pipelineLast; 154 155 private EntityResolver entityResolver = null; 156 157 private String encoding = null; 158 159 private InputStream stream; 160 161 private boolean foldedAttributeValue; 162 163 private boolean alreadyWarnedAboutPrivateUseCharacters; 164 165 private NormalizationChecker normalizationChecker = null; 166 167 public HtmlParser() { 168 eef = new EmptyElementFilter(); 169 tif = new TagInferenceFilter(this); 170 cedf = new CharacterEncodingDeclarationFilter(this); 171 ch = eef; 172 emitter = new XhtmlSaxEmitter(ch); 173 eef.setContentHandler(tif); 174 tif.setContentHandler(cedf); 175 pipelineLast = cedf; 176 setErrorHandler(new SilentDraconianErrorHandler()); 177 setContentHandler(new DefaultHandler()); 178 } 179 180 private void clearStrBuf() { 181 strBufLen = 0; 182 } 183 184 private void appendStrBufAsciiLowerCase(char c) throws SAXException, 185 IOException { 186 if (c >= 'A' && c <= 'Z') { 187 appendStrBuf((char) (c | CASE_MASK)); 188 } else { 189 appendStrBuf(c); 190 } 191 } 192 193 private void appendStrBuf(char c) throws SAXException, IOException { 194 if (strBufLen == strBuf.length) { 195 fatal("Identifier too long."); 196 } else { 197 strBuf[strBufLen] = c; 198 strBufLen++; 199 } 200 } 201 202 private String strBufToString() { 203 return new String(strBuf, 0, strBufLen); 204 } 205 206 private void clearAttrBuf() { 207 attrBufLen = 0; 208 } 209 210 private void appendAttrBuf(char c) throws SAXException, IOException { 211 if (attrBufLen == attrBuf.length) { 212 fatal("Attribute value or other quoted string too long."); 213 } else { 214 attrBuf[attrBufLen] = c; 215 attrBufLen++; 216 } 217 } 218 219 private void appendAttrBufAsciiLowerCase(char c) throws SAXException, 220 IOException { 221 if (c >= 'A' && c <= 'Z') { 222 appendAttrBuf((char) (c | CASE_MASK)); 223 } else if (c <= '\u007F') { 224 appendAttrBuf(c); 225 } else { 226 fatal("Non-ASCII character in an 
attribute value that is subject to case folding."); 227 } 228 } 229 230 /** 231 * @param cs 232 * @throws SAXException 233 */ 234 private void appendAttrBuf(char[] cs) throws SAXException, IOException { 235 for (int i = 0; i < cs.length; i++) { 236 appendAttrBuf(cs[i]); 237 } 238 } 239 240 /** 241 * @param cs 242 * @throws SAXException 243 */ 244 private void appendAttrBufAsciiLowerCase(char[] cs) throws SAXException, 245 IOException { 246 for (int i = 0; i < cs.length; i++) { 247 appendAttrBufAsciiLowerCase(cs[i]); 248 } 249 } 250 251 private String attrBufToString() { 252 return new String(attrBuf, 0, attrBufLen); 253 } 254 255 private void parse() throws SAXException, IOException { 256 pos = -1; 257 cstart = -1; 258 line = 1; 259 col = 0; 260 doctypeSeen = false; 261 prev = '\u0000'; 262 cdataState = PCDATA; 263 nonWhiteSpaceAllowed = false; 264 wasLt = false; 265 bufLen = 0; 266 html5 = false; 267 char c; 268 for (;;) { 269 if (cdataState == PCDATA) { 270 c = nextMayEnd(); 271 if (c == '\u0000') { 272 flushChars(); 273 if (!doctypeSeen) { 274 err("The document did not have a doctype."); 275 } 276 return; 277 } else if (c == '<') { 278 flushChars(); 279 consumeMarkup(); 280 } else if (c == '&') { 281 flushChars(); 282 emitter.characters(consumeCharRef()); 283 } else if (isWhiteSpace(c)) { 284 if (nonWhiteSpaceAllowed) { 285 if (cstart == -1) { 286 cstart = pos; 287 } 288 } 289 } else { 290 doctypeNotOk(); 291 if (nonWhiteSpaceAllowed) { 292 if (cstart == -1) { 293 cstart = pos; 294 } 295 } else { 296 fatal("Character data not allowed at this point."); 297 } 298 } 299 } else { 300 c = next(); 301 if (c == '<') { 302 wasLt = true; 303 flushChars(); 304 } else if (c == '/') { 305 if (wasLt) { 306 consumeEndTag(); 307 } else if (cstart == -1) { 308 cstart = pos; 309 } 310 wasLt = false; 311 } else { 312 if (wasLt) { 313 emitter.characters(LT); 314 } 315 if (cstart == -1) { 316 cstart = pos; 317 } 318 wasLt = false; 319 } 320 } 321 } 322 } 323 324 /** 325 * @throws 
SAXException 326 * 327 */ 328 private void doctypeNotOk() throws SAXException, IOException { 329 if (!doctypeSeen) { 330 // there was no doctype 331 err("There was no doctype."); 332 doctypeSeen = true; 333 } 334 } 335 336 /** 337 * @param c 338 * @return 339 */ 340 private boolean isWhiteSpace(char c) { 341 return (c == ' ') || (c == '\t') || (c == '\n'); 342 } 343 344 /** 345 * @throws SAXException 346 * 347 */ 348 private char[] consumeCharRef() throws SAXException, IOException { 349 char c = next(); 350 if (c == '#') { 351 return consumeNCR(); 352 } else if (isNameStart(c)) { 353 return consumeEntityRef(c); 354 } else { 355 // XXX should we err or continue here 356 fatal("& not followed by # or name start."); 357 } 358 throw new RuntimeException("Unreachable"); 359 } 360 361 /** 362 * @param c 363 * @throws SAXException 364 */ 365 private char[] consumeEntityRef(char c) throws SAXException, IOException { 366 clearStrBuf(); 367 appendStrBuf(c); 368 for (;;) { 369 c = next(); 370 if (isNameChar(c)) { 371 appendStrBuf(c); 372 } else if (c == ';') { 373 String name = strBufToString(); 374 char[] rv = html5 ? Entities.resolve5(name) : Entities.resolve(name); 375 if (rv == null) { 376 if ("apos".equals(name)) { 377 if (html5) { 378 warn("' is not supported by IE6."); 379 } else { 380 err("Even though there is a predefined entity called \u201Capos\u201D is XML, there is no such thing in HTML 4.01. Continuing parsing pretending that such an entity exists."); 381 } 382 return APOS; 383 } else { 384 fatal("Unknown entity \u201C" + name + "\u201D."); 385 } 386 } 387 return rv; 388 } else { 389 // XXX should we allow implicit close as in HTML4? 
390 fatal("Found a non-name character in entity reference / unterminated entity reference."); 391 } 392 } 393 } 394 395 /** 396 * @throws SAXException 397 * 398 */ 399 private char[] consumeNCR() throws SAXException, IOException { 400 clearStrBuf(); 401 int intVal = 0; 402 char c = next(); 403 if (c == 'x' || c == 'X') { 404 for (int i = 0;; i++) { 405 if (i == 6) { 406 fatal("Hexadecimal character reference too long."); 407 } 408 c = next(); 409 if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') 410 || (c >= 'A' && c <= 'F')) { 411 appendStrBuf(c); 412 } else if (c == ';') { 413 if (i == 0) { 414 fatal("No digits in hexadecimal character reference."); 415 } 416 intVal = Integer.parseInt(strBufToString(), 16); 417 break; 418 } else { 419 fatal("Bad character in hexadecimal character reference."); 420 } 421 } 422 } else if (c >= '0' && c <= '9') { 423 appendStrBuf(c); 424 for (int i = 0;; i++) { 425 if (i == 6) { 426 fatal("Decimal character reference too long."); 427 } 428 c = next(); 429 if (c >= '0' && c <= '9') { 430 appendStrBuf(c); 431 } else if (c == ';') { 432 intVal = Integer.parseInt(strBufToString()); 433 break; 434 } else { 435 fatal("Bad character in decimal character reference."); 436 } 437 } 438 } else { 439 fatal("Bad character in numeric character reference."); 440 } 441 if ((intVal & 0xF800) == 0xD800) { 442 fatal("Character reference expands to a surrogate."); 443 } else if (intVal <= 0xFFFF) { 444 c = (char) intVal; 445 if (isForbidden(c)) { 446 fatal("Character reference expands to a forbidden character."); 447 } 448 if (isPrivateUse(c)) { 449 warnAboutPrivateUseChar(); 450 } 451 bmpChar[0] = c; 452 return bmpChar; 453 } else if (intVal <= 0x10FFFF) { 454 // XXX astral non-characters are not banned 455 if (isNonCharacter(intVal)) { 456 warn("Character reference expands to an astral non-character."); 457 } 458 if (isAstralPrivateUse(intVal)) { 459 warnAboutPrivateUseChar(); 460 } 461 astralChar[0] = (char) (LEAD_OFFSET + (intVal >> 10)); 462 
astralChar[1] = (char) (0xDC00 + (intVal & 0x3FF)); 463 return astralChar; 464 } else { 465 fatal("Character reference outside the permissible Unicode range."); 466 } 467 throw new RuntimeException("Unreachable"); 468 } 469 470 /** 471 * @throws SAXException 472 * 473 */ 474 private void consumeMarkup() throws SAXException, IOException { 475 char c = next(); 476 if (c == '!') { 477 consumeMarkupDecl(); 478 } else if (c == '?') { 479 consumePI(); 480 } else if (c == '/') { 481 consumeEndTag(); 482 } else if (isNameStart(c)) { 483 consumeStartTag(c); 484 } else { 485 fatal("Found illegal character after <."); 486 } 487 } 488 489 /** 490 * @return 491 * @throws SAXException 492 */ 493 private char next() throws SAXException, IOException { 494 char c = nextMayEnd(); 495 if (c == '\u0000') { 496 fatal("Unexpected end of file."); 497 } 498 return c; 499 } 500 501 /** 502 * @param c 503 * @throws SAXException 504 */ 505 private void consumeStartTag(char c) throws SAXException, IOException { 506 doctypeNotOk(); 507 clearStrBuf(); 508 appendStrBufAsciiLowerCase(c); 509 for (;;) { 510 c = next(); 511 if (c == '>') { 512 String gi = strBufToString(); 513 maybeBeginCdata(gi); 514 emitter.startElement(gi, 515 EmptyAttributes.EMPTY_ATTRIBUTES); 516 return; 517 } else if (c == '/') { 518 c = next(); 519 if (c == '>') { 520 err("XML-style empty element syntax (<foo/>) is not legal in HTML. 
Skipping the slash."); 521 String gi = strBufToString(); 522 maybeBeginCdata(gi); 523 emitter.startElement(gi, 524 EmptyAttributes.EMPTY_ATTRIBUTES); 525 return; 526 } else { 527 fatal("Stray slash in start tag."); 528 } 529 } else if (isNameChar(c)) { 530 appendStrBufAsciiLowerCase(c); 531 } else if (isWhiteSpace(c)) { 532 attrs.clear(); 533 String gi = strBufToString(); 534 maybeBeginCdata(gi); 535 c = nextAfterZeroOrMoreWhiteSpace(); 536 for (;;) { 537 if (isNameStart(c)) { 538 c = consumeAttribute(c); 539 } else if (c == '/') { 540 c = next(); 541 if (c == '>') { 542 err("XML-style empty element syntax (<foo />) is not legal in HTML. Skipping the slash."); 543 emitter.startElement(gi, attrs); 544 return; 545 } else { 546 fatal("Stray slash in start tag."); 547 } 548 } else if (c == '>') { 549 emitter.startElement(gi, attrs); 550 return; 551 } else { 552 fatal("Garbage in start tag."); 553 } 554 } 555 } else { 556 fatal("Illegal character in element name."); 557 } 558 } 559 } 560 561 /** 562 * @param gi 563 */ 564 private void maybeBeginCdata(String gi) { 565 if ("style".equals(gi)) { 566 cdataState = STYLE; 567 wasLt = false; 568 } else if ("script".equals(gi)) { 569 cdataState = SCRIPT; 570 wasLt = false; 571 } 572 } 573 574 /** 575 * @param c 576 * @return 577 * @throws SAXException 578 */ 579 private char consumeAttribute(char c) throws SAXException, IOException { 580 clearStrBuf(); 581 appendStrBufAsciiLowerCase(c); 582 for (;;) { 583 c = next(); 584 if (isNameChar(c)) { 585 appendStrBufAsciiLowerCase(c); 586 } else { 587 break; 588 } 589 } 590 String name = strBufToString(); 591 if (name.startsWith("xml")) { 592 fatal("Attribute name must not start with \u201Cxml\u201D."); 593 } 594 if (isWhiteSpace(c)) { 595 c = nextAfterZeroOrMoreWhiteSpace(); 596 } 597 if (c == '=') { 598 foldedAttributeValue = AttributeInfo.isCaseFolded(name); 599 c = nextAfterZeroOrMoreWhiteSpace(); 600 clearAttrBuf(); 601 if (c == '\"') { 602 consumeQuotedAttributeValue('\"'); 603 c 
= next(); 604 } else if (c == '\'') { 605 consumeQuotedAttributeValue('\''); 606 c = next(); 607 } else if (isUnquotedAttributeChar(c)) { 608 // XXX should the real definition of Name Start and Name Char be 609 // used here? 610 if (foldedAttributeValue) { 611 appendAttrBufAsciiLowerCase(c); 612 } else { 613 appendAttrBuf(c); 614 } 615 for (;;) { 616 c = next(); 617 if (isUnquotedAttributeChar(c)) { 618 if (foldedAttributeValue) { 619 appendAttrBufAsciiLowerCase(c); 620 } else { 621 appendAttrBuf(c); 622 } 623 } else { 624 break; 625 } 626 } 627 } else { 628 fatal("Garbage in place of attribute value. Possibly quotes missing."); 629 } 630 if (isWhiteSpace(c)) { 631 c = nextAfterZeroOrMoreWhiteSpace(); 632 } 633 if ("lang".equals(name)) { 634 fatalIfAttributeExists("xml:lang"); 635 attrs.addAttribute("http://www.w3.org/XML/1998/namespace", 636 "lang", "xml:lang", "CDATA", attrBufToString()); 637 } else { 638 fatalIfAttributeExists(name); 639 attrs.addAttribute(name, attrBufToString()); 640 } 641 return c; 642 } else { 643 if (!AttributeInfo.isBoolean(name)) { 644 fatal("Cannot minimize non-boolean attributes."); 645 } 646 fatalIfAttributeExists(name); 647 attrs.addAttribute(name, name); 648 return c; 649 } 650 } 651 652 /** 653 * @param c 654 * @return 655 */ 656 private boolean isUnquotedAttributeChar(char c) { 657 return (c == '.' 
|| c == '-' || c == '_' || (c >= '0' && c <= ':') 658 || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')); 659 } 660 661 /** 662 * @param c 663 * @return 664 * @throws SAXException 665 * @throws IOException 666 */ 667 private void consumeQuotedAttributeValue(char delim) throws SAXException, 668 IOException { 669 char c; 670 for (;;) { 671 c = next(); 672 if (c == delim) { 673 return; 674 } else if (c == '&') { 675 if (foldedAttributeValue) { 676 appendAttrBufAsciiLowerCase(consumeCharRef()); 677 } else { 678 appendAttrBuf(consumeCharRef()); 679 } 680 } else if (isWhiteSpace(c)) { 681 appendAttrBuf(' '); 682 } else { 683 if (foldedAttributeValue) { 684 appendAttrBufAsciiLowerCase(c); 685 } else { 686 appendAttrBuf(c); 687 } 688 } 689 // XXX is a warning called for when there is < or >? 690 } 691 } 692 693 /** 694 * @param name 695 * @throws SAXException 696 */ 697 private void fatalIfAttributeExists(String name) throws SAXException, 698 IOException { 699 if (attrs.getIndex(name) != -1) { 700 fatal("Duplicate attribute."); 701 } 702 } 703 704 /** 705 * @param c 706 * @return 707 */ 708 private boolean isNameStart(char c) { 709 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); 710 } 711 712 /** 713 * @param c 714 * @return 715 */ 716 private boolean isNameChar(char c) { 717 return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') 718 || (c >= '0' && c <= '9') || (c == '-'); 719 } 720 721 /** 722 * @throws SAXException 723 * 724 */ 725 private void consumeEndTag() throws SAXException, IOException { 726 doctypeNotOk(); 727 clearStrBuf(); 728 char c = next(); 729 if (isNameStart(c)) { 730 appendStrBufAsciiLowerCase(c); 731 } else { 732 fatal("Element name in end tag did not start with a legal name character."); 733 } 734 for (;;) { 735 c = next(); 736 if (isNameChar(c)) { 737 appendStrBufAsciiLowerCase(c); 738 } else if (c == '>') { 739 String gi = strBufToString(); 740 cdataStateEnd(gi); 741 emitter.endElement(gi); 742 return; 743 } else if (isWhiteSpace(c)) { 
if (nextAfterZeroOrMoreWhiteSpace() == '>') {
                    String gi = strBufToString();
                    cdataStateEnd(gi);
                    emitter.endElement(gi);
                    return;
                } else {
                    fatal("Garbage in end tag.");
                }
            } else {
                fatal("Element name in end tag contained an illegal character.");
            }
        }
    }

    /**
     * Leaves the script or style state when the matching end tag is seen.
     * Any other end tag inside such an element is a fatal error.
     *
     * @param gi the lower case element name of the end tag
     * @throws SAXException if the end tag does not match the open
     *         script/style element
     */
    private void cdataStateEnd(String gi) throws SAXException, IOException {
        switch (cdataState) {
            case STYLE:
                if ("style".equals(gi)) {
                    cdataState = PCDATA;
                } else {
                    fatal("\u201C</\u201D seen inside the style element, but the string did not constitute the start of the end tag of the element.");
                }
                break;
            case SCRIPT:
                if ("script".equals(gi)) {
                    cdataState = PCDATA;
                } else {
                    fatal("\u201C</\u201D seen inside the script element, but the string did not constitute the start of the end tag of the element.");
                }
                break;
            default:
                // ordinary content: nothing to leave
                break;
        }
    }

    /**
     * Skips white space.
     *
     * @return the first character that is not white space
     * @throws SAXException if the input ends
     */
    private char nextAfterZeroOrMoreWhiteSpace() throws SAXException,
            IOException {
        char candidate = next();
        while (isWhiteSpace(candidate)) {
            candidate = next();
        }
        return candidate;
    }

    /**
     * Rejects processing instructions.
     *
     * @throws SAXException always
     */
    private void consumePI() throws SAXException, IOException {
        // XXX should PIs be allowed?
        fatal("Processing instructions are not allowed.");
    }

    /**
     * Consumes a markup declaration after <code>&lt;!</code> has been read:
     * one or more comments, a doctype, or the empty declaration.
     *
     * @throws SAXException if the declaration is malformed or is a marked
     *         section
     */
    private void consumeMarkupDecl() throws SAXException, IOException {
        char c = next();
        switch (c) {
            case '-':
                // A declaration may hold several comments; each
                // consumeComment() call returns the character after one.
                do {
                    c = consumeComment();
                    if (c != '>' && c != '-') {
                        fatal("Garbage after comment.");
                    }
                } while (c != '>');
                return;
            case 'd':
            case 'D':
                consumeDoctype();
                return;
            case '>':
                return;
            case '[':
                fatal("Marked sections not allowed.");
                return;
            default:
                fatal("Bad character in markup declaration.");
                return;
        }
    }

    /**
     * Consumes a doctype declaration after <code>&lt;!d</code> (in either
     * case) has been read and checks it against the supported doctypes.
     *
     * @throws SAXException if the doctype is malformed or appears after the
     *         document has started
     */
    private void consumeDoctype() throws SAXException, IOException {
        if (doctypeSeen) {
            fatal("Doctype not allowed at this point.");
        }
        doctypeSeen = true;
        if (!consumeCaseInsensitiveAsciiLetterString(OCTYPE)) {
            fatal("Expected string \u201CDOCTYPE\u201D.");
        }
        char c = next();
        if (!isWhiteSpace(c)) {
            fatal("Expected white space after \u201CDOCTYPE\u201D.");
        }
        c = nextAfterZeroOrMoreWhiteSpace();
        if (c != 'h' && c != 'H') {
            fatal("Expected string \u201Chtml\u201D.");
        }
        if (!consumeCaseInsensitiveAsciiLetterString(TML)) {
            fatal("Expected string \u201Chtml\u201D.");
        }
        c = next();
        if (c == '>') {
            sawHtml5Doctype();
            return;
        }
        if (!isWhiteSpace(c)) {
            fatal("Garbage in doctype");
        }
        c = nextAfterZeroOrMoreWhiteSpace();
        switch (c) {
            case '>':
                sawHtml5Doctype();
                return;
            case 's':
            case 'S':
                fatal("Doctype with possibly a SYSTEM id only.");
                break;
            case '[':
                fatal("Doctype with internal subset.");
                break;
            case 'p':
            case 'P':
                // start of the PUBLIC keyword
                break;
            default:
                fatal("Expected string \u201CPUBLIC\u201D.");
                break;
        }
        if (!consumeCaseInsensitiveAsciiLetterString(UBLIC)) {
            fatal("Expected string \u201CPUBLIC\u201D.");
        }
        c = next();
        if (!isWhiteSpace(c)) {
fatal("Expected white space after \u201CPUBLIC\u201D."); 874 } 875 c = nextAfterZeroOrMoreWhiteSpace(); 876 String publicId = null; 877 String systemId = null; 878 if (c == '\"') { 879 publicId = unescapedStringUntil('\"'); 880 } else if (c == '\'') { 881 publicId = unescapedStringUntil('\''); 882 } else { 883 fatal("Garbage in doctype. Expected a quoted string."); 884 } 885 c = next(); 886 if (c == '>') { 887 checkPublicAndSystemIds(publicId, systemId); 888 return; 889 } else if (!isWhiteSpace(c)) { 890 fatal("Expected white space or \u201C>\u201D after the public id."); 891 } 892 c = nextAfterZeroOrMoreWhiteSpace(); 893 if (c == '>') { 894 checkPublicAndSystemIds(publicId, systemId); 895 return; 896 } else if (c == '\"') { 897 systemId = unescapedStringUntil('\"'); 898 } else if (c == '\'') { 899 systemId = unescapedStringUntil('\''); 900 } else { 901 fatal("Garbage in doctype. Expected a quoted string or \u201C>\u201D."); 902 } 903 checkPublicAndSystemIds(publicId, systemId); 904 c = next(); 905 if (c == '>') { 906 return; 907 } else if (!isWhiteSpace(c)) { 908 fatal("Expected white space or \u201C>\u201D after the system id."); 909 } 910 c = nextAfterZeroOrMoreWhiteSpace(); 911 if (c == '>') { 912 return; 913 } else { 914 fatal("Garbage in doctype. 
Expected \u201C>\u201D."); 915 } 916 } 917 918 /** 919 * @throws SAXException 920 * 921 */ 922 private void sawHtml5Doctype() throws SAXException { 923 html5 = true; 924 switch (doctypeMode) { 925 case DoctypeHandler.ANY_DOCTYPE: 926 if (doctypeHandler != null) { 927 doctypeHandler.doctype(DoctypeHandler.DOCTYPE_HTML5); 928 } 929 break; 930 case DoctypeHandler.DOCTYPE_HTML5: 931 return; 932 case DoctypeHandler.DOCTYPE_HTML401_STRICT: 933 err("Expected an HTML 4.01 Strict document but saw the HTML5 doctype."); 934 break; 935 case DoctypeHandler.DOCTYPE_HTML401_TRANSITIONAL: 936 err("Expected an HTML 4.01 Transitional document but saw the HTML5 doctype."); 937 break; 938 default: 939 throw new RuntimeException("Bug in HtmlParser: doctypeMode out of range."); 940 } 941 } 942 943 /** 944 * @param publicId 945 * @param systemId 946 * @throws SAXException 947 */ 948 private void checkPublicAndSystemIds(String publicId, String systemId) 949 throws SAXException, IOException { 950 if ("-//W3C//DTD HTML 4.01//EN".equals(publicId)) { 951 switch (doctypeMode) { 952 case DoctypeHandler.ANY_DOCTYPE: 953 if (doctypeHandler != null) { 954 doctypeHandler.doctype(DoctypeHandler.DOCTYPE_HTML401_STRICT); 955 } 956 break; 957 case DoctypeHandler.DOCTYPE_HTML401_STRICT: 958 return; 959 case DoctypeHandler.DOCTYPE_HTML5: 960 err("Expected an HTML5 document but saw an HTML 4.01 Strict doctype."); 961 break; 962 case DoctypeHandler.DOCTYPE_HTML401_TRANSITIONAL: 963 err("Expected HTML 4.01 Transitional document but saw an HTML 4.01 Strict doctype."); 964 break; 965 default: 966 throw new RuntimeException("Bug in HtmlParser: doctypeMode out of range."); 967 } 968 if (systemId == null) { 969 // XXX err, because HTML 4.01 says "must"? 970 warn("The Strict doctype lacks the system id (URI). This kind of Strict doctype is considered quirky by Mac IE 5. 
The preferred non-quirky form (also required by the HTML 4.01 specification) is \u201C<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">\u201D."); 971 } else if (!"http://www.w3.org/TR/html4/strict.dtd".equals(systemId)) { 972 // XXX err, because HTML 4.01 says "must"? 973 warn("The Strict doctype has a non-canonical system id (URI). The form required by the HTML 4.01 specification is \u201C<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">\u201D."); 974 } 975 } else if ("-//W3C//DTD HTML 4.01 Transitional//EN".equals(publicId)) { 976 switch (doctypeMode) { 977 case DoctypeHandler.ANY_DOCTYPE: 978 if (doctypeHandler != null) { 979 doctypeHandler.doctype(DoctypeHandler.DOCTYPE_HTML401_TRANSITIONAL); 980 } 981 break; 982 case DoctypeHandler.DOCTYPE_HTML401_TRANSITIONAL: 983 return; 984 case DoctypeHandler.DOCTYPE_HTML401_STRICT: 985 err("Expected an HTML 4.01 Strict document but saw an HTML 4.01 Transitional doctype."); 986 break; 987 case DoctypeHandler.DOCTYPE_HTML5: 988 err("Expected an HTML5 document but saw an HTML 4.01 Transitional doctype."); 989 break; 990 default: 991 throw new RuntimeException("Bug in HtmlParser: doctypeMode out of range."); 992 } 993 if (systemId == null) { 994 err("The Transitional doctype lacks the system id (URI). This kind of Transitional doctype is considered quirky by browsers. The preferred non-quirky form (also required by the HTML 4.01 specification) is \u201C<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">\u201D."); 995 } else if (!"http://www.w3.org/TR/html4/loose.dtd".equals(systemId)) { 996 // XXX err, because HTML 4.01 says "must"? 997 warn("The Transitional doctype has a non-canonical system id (URI). This kind of Transitional doctype may be considered quirky by some legacy browsers. 
The preferred non-quirky form (also required by the HTML 4.01 specification) is \u201C<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">\u201D."); 998 } 999 } else if (publicId.startsWith("-//W3C//DTD XHTML ")) { 1000 fatal("XHTML public id seen. XHTML documents are not conforming HTML5 or HTML 4.01 documents."); 1001 } else { 1002 err("Legacy doctype or inappropriate doctype. This parser is designed for HTML5 and also supports the HTML5-like subset of HTML 4.01."); 1003 } 1004 } 1005 1006 /** 1007 * @param c 1008 * @return 1009 * @throws SAXException 1010 */ 1011 private String unescapedStringUntil(char delim) throws SAXException, 1012 IOException { 1013 clearAttrBuf(); 1014 for (;;) { 1015 char c = next(); 1016 if (c == delim) { 1017 return attrBufToString(); 1018 } else { 1019 appendAttrBuf(c); 1020 } 1021 } 1022 } 1023 1024 private boolean consumeCaseInsensitiveAsciiLetterString(char[] str) 1025 throws SAXException, IOException { 1026 for (int i = 0; i < str.length; i++) { 1027 if (!((next() | CASE_MASK) == str[i])) { 1028 return false; 1029 } 1030 } 1031 return true; 1032 } 1033 1034 /** 1035 * @throws SAXException 1036 * 1037 */ 1038 private char consumeComment() throws SAXException, IOException { 1039 doctypeNotOk(); 1040 char c = next(); 1041 if (c != '-') { 1042 fatal("Malformed comment."); 1043 } 1044 boolean prevWasHyphen = false; 1045 for (;;) { 1046 c = next(); 1047 if (c == '-') { 1048 if (prevWasHyphen) { 1049 return nextAfterZeroOrMoreWhiteSpace(); 1050 } else { 1051 prevWasHyphen = true; 1052 } 1053 } else { 1054 prevWasHyphen = false; 1055 } 1056 } 1057 } 1058 1059 private char nextMayEnd() throws SAXException, IOException { 1060 pos++; 1061 col++; 1062 if (pos == bufLen) { 1063 boolean charDataContinuation = false; 1064 if (cstart > -1) { 1065 flushChars(); 1066 charDataContinuation = false; 1067 } 1068 try { 1069 bufLen = reader.read(buf); 1070 } catch (CharacterCodingException cce) { 1071 
fatal("Input data does not conform to the input encoding."); 1072 } 1073 if (bufLen == -1) { 1074 return '\u0000'; 1075 } else if (normalizationChecker != null) { 1076 normalizationChecker.characters(buf, 0, bufLen); 1077 } 1078 if (charDataContinuation) { 1079 cstart = 0; 1080 } 1081 pos = 0; 1082 } 1083 char c = buf[pos]; 1084 if ((c & 0xFC00) == 0xDC00) { 1085 // Got a low surrogate. See if prev was high surrogate 1086 if ((prev & 0xFC00) == 0xD800) { 1087 int intVal = (prev << 10) + c + SURROGATE_OFFSET; 1088 if (isNonCharacter(intVal)) { 1089 warn("Astral non-character."); 1090 } 1091 if (isAstralPrivateUse(intVal)) { 1092 warnAboutPrivateUseChar(); 1093 } 1094 } else { 1095 fatal("Unmatched low surrogate."); 1096 } 1097 prev = c; 1098 } else { 1099 // see if there was a lone high surrogate 1100 if ((prev & 0xFC00) == 0xD800) { 1101 fatal("Unmatched high surrogate."); 1102 } 1103 if (isForbidden(c)) { 1104 fatal("Forbidden character."); 1105 } else if (c == '\r') { 1106 prev = '\r'; 1107 c = buf[pos] = '\n'; 1108 line++; 1109 col = 0; 1110 } else if (c == '\n') { 1111 if (prev != '\r') { 1112 prev = c; 1113 line++; 1114 col = 0; 1115 } else { 1116 prev = c; 1117 // swallow the LF 1118 col = 0; 1119 int tmpCstart = cstart; 1120 flushChars(); 1121 if (tmpCstart != -1) { 1122 cstart = pos + 1; 1123 } 1124 return nextMayEnd(); 1125 } 1126 } else if (isPrivateUse(c)) { 1127 warnAboutPrivateUseChar(); 1128 } 1129 } 1130 return c; 1131 } 1132 1133 private void warnAboutPrivateUseChar() throws SAXException { 1134 if (!alreadyWarnedAboutPrivateUseCharacters) { 1135 warn("Document uses the Unicode Private Use Area(s), which should not be used in publicly exchanged documents. 
(Charmod C073)"); 1136 alreadyWarnedAboutPrivateUseCharacters = true; 1137 } 1138 } 1139 1140 private boolean isPrivateUse(char c) { 1141 return c >= '\uE000' && c <= '\uF8FF'; 1142 } 1143 1144 private boolean isAstralPrivateUse(int c) { 1145 return (c >= 0xF0000 && c <= 0xFFFFD) || (c >= 0x100000 && c <= 0x10FFFD); 1146 } 1147 1148 /** 1149 * @param intVal 1150 * @return 1151 */ 1152 private boolean isNonCharacter(int c) { 1153 return (c & 0xFFFE) == 0xFFFE; 1154 } 1155 1156 /** 1157 * @param c 1158 * @return 1159 */ 1160 private boolean isForbidden(char c) { 1161 return !(c == '\t' || c == '\n' || c == '\r' 1162 || (c >= '\u0020' && c < '\u007F') 1163 || (c >= '\u00A0' && c < '\uFDD0') || (c > '\uFDDF' && c <= '\uFFFD')); 1164 } 1165 1166 /** 1167 * @throws SAXException 1168 * 1169 */ 1170 private void flushChars() throws SAXException, IOException { 1171 if (nonWhiteSpaceAllowed) { 1172 if (cstart > -1) { 1173 if (pos > cstart) { 1174 ch.characters(buf, cstart, pos - cstart); 1175 } 1176 } 1177 cstart = -1; 1178 } 1179 } 1180 1181 /** 1182 * @throws SAXException 1183 * @throws SAXParseException 1184 */ 1185 private void fatal(String message) throws SAXException { 1186 SAXParseException spe = new SAXParseException(message, this); 1187 eh.fatalError(spe); 1188 throw spe; 1189 } 1190 1191 /** 1192 * @param string 1193 * @throws SAXException 1194 */ 1195 private void err(String message) throws SAXException { 1196 SAXParseException spe = new SAXParseException(message, this); 1197 eh.error(spe); 1198 } 1199 1200 /** 1201 * @param string 1202 * @throws SAXException 1203 */ 1204 private void warn(String message) throws SAXException { 1205 SAXParseException spe = new SAXParseException(message, this); 1206 eh.warning(spe); 1207 } 1208 1209 /** 1210 * @see org.xml.sax.Locator#getPublicId() 1211 */ 1212 public String getPublicId() { 1213 return publicId; 1214 } 1215 1216 /** 1217 * @see org.xml.sax.Locator#getSystemId() 1218 */ 1219 public String getSystemId() { 1220 return 
systemId; 1221 } 1222 1223 /** 1224 * @see org.xml.sax.Locator#getLineNumber() 1225 */ 1226 public int getLineNumber() { 1227 return line; 1228 } 1229 1230 /** 1231 * @see org.xml.sax.Locator#getColumnNumber() 1232 */ 1233 public int getColumnNumber() { 1234 return col; 1235 } 1236 1237 /** 1238 * @see org.xml.sax.XMLReader#getFeature(java.lang.String) 1239 */ 1240 public boolean getFeature(String key) throws SAXNotRecognizedException, 1241 SAXNotSupportedException { 1242 if ("http://xml.org/sax/features/namespaces".equals(key)) { 1243 return true; 1244 } else if ("http://xml.org/sax/features/namespace-prefixes".equals(key)) { 1245 return false; 1246 } else if ("http://hsivonen.iki.fi/checkers/nfc/".equals(key)) { 1247 return normalizationChecker != null; 1248 } else { 1249 throw new SAXNotRecognizedException(key); 1250 } 1251 } 1252 1253 /** 1254 * @see org.xml.sax.XMLReader#setFeature(java.lang.String, boolean) 1255 */ 1256 public void setFeature(String key, boolean value) 1257 throws SAXNotRecognizedException, SAXNotSupportedException { 1258 if ("http://xml.org/sax/features/namespaces".equals(key)) { 1259 if (!value) { 1260 throw new SAXNotSupportedException( 1261 "Cannot turn off namespace support."); 1262 } 1263 } else if ("http://xml.org/sax/features/namespace-prefixes".equals(key)) { 1264 if (value) { 1265 throw new SAXNotSupportedException("Cannot turn on prefixing."); 1266 } 1267 } else if ("http://hsivonen.iki.fi/checkers/nfc/".equals(key)) { 1268 if (value) { 1269 if (normalizationChecker == null) { 1270 normalizationChecker = new NormalizationChecker(true); 1271 normalizationChecker.setDocumentLocator(this); 1272 normalizationChecker.setErrorHandler(getErrorHandler()); 1273 } 1274 } else { 1275 normalizationChecker = null; 1276 } 1277 } else { 1278 throw new SAXNotRecognizedException(key); 1279 } 1280 } 1281 1282 /** 1283 * @see org.xml.sax.XMLReader#getProperty(java.lang.String) 1284 */ 1285 public Object getProperty(String key) throws 
SAXNotRecognizedException, 1286 SAXNotSupportedException { 1287 throw new SAXNotRecognizedException(key); 1288 } 1289 1290 /** 1291 * @see org.xml.sax.XMLReader#setProperty(java.lang.String, 1292 * java.lang.Object) 1293 */ 1294 public void setProperty(String key, Object value) 1295 throws SAXNotRecognizedException, SAXNotSupportedException { 1296 throw new SAXNotRecognizedException(key); 1297 } 1298 1299 /** 1300 * @see org.xml.sax.XMLReader#setEntityResolver(org.xml.sax.EntityResolver) 1301 */ 1302 public void setEntityResolver(EntityResolver entityResolver) { 1303 this.entityResolver = entityResolver; 1304 } 1305 1306 /** 1307 * @see org.xml.sax.XMLReader#getEntityResolver() 1308 */ 1309 public EntityResolver getEntityResolver() { 1310 return entityResolver; 1311 } 1312 1313 /** 1314 * @see org.xml.sax.XMLReader#setDTDHandler(org.xml.sax.DTDHandler) 1315 */ 1316 public void setDTDHandler(DTDHandler handler) { 1317 dtdHandler = handler; 1318 } 1319 1320 /** 1321 * @see org.xml.sax.XMLReader#getDTDHandler() 1322 */ 1323 public DTDHandler getDTDHandler() { 1324 return dtdHandler; 1325 } 1326 1327 /** 1328 * @see org.xml.sax.XMLReader#setContentHandler(org.xml.sax.ContentHandler) 1329 */ 1330 public void setContentHandler(ContentHandler ch) { 1331 pipelineLast.setContentHandler(ch); 1332 } 1333 1334 /** 1335 * @see org.xml.sax.XMLReader#getContentHandler() 1336 */ 1337 public ContentHandler getContentHandler() { 1338 return pipelineLast.getContentHandler(); 1339 } 1340 1341 /** 1342 * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler) 1343 */ 1344 public void setErrorHandler(ErrorHandler eh) { 1345 this.eh = eh; 1346 eef.setErrorHandler(eh); 1347 tif.setErrorHandler(eh); 1348 cedf.setErrorHandler(eh); 1349 } 1350 1351 /** 1352 * @see org.xml.sax.XMLReader#getErrorHandler() 1353 */ 1354 public ErrorHandler getErrorHandler() { 1355 return eh; 1356 } 1357 1358 /** 1359 * @see org.xml.sax.XMLReader#parse(org.xml.sax.InputSource) 1360 */ 1361 public 
void parse(InputSource is) throws IOException, SAXException { 1362 alreadyWarnedAboutPrivateUseCharacters = false; 1363 reader = null; 1364 stream = null; 1365 systemId = null; 1366 publicId = null; 1367 encoding = null; 1368 reader = is.getCharacterStream(); 1369 systemId = is.getSystemId(); 1370 publicId = is.getPublicId(); 1371 encoding = is.getEncoding(); 1372 try { 1373 streamSetup(is); 1374 ch.setDocumentLocator(this); 1375 try { 1376 ch.startDocument(); 1377 parse(); 1378 tif.flushStack(); 1379 } finally { 1380 ch.endDocument(); 1381 } 1382 } finally { 1383 if (stream != null) { 1384 stream.close(); 1385 } 1386 if (reader != null) { 1387 reader.close(); 1388 } 1389 reader = null; 1390 stream = null; 1391 } 1392 } 1393 1394 /** 1395 * @param is 1396 * @param swallowBom 1397 * @throws IOException 1398 * @throws SAXException 1399 */ 1400 private void streamSetup(InputSource is) throws SAXException, IOException { 1401 boolean swallowBom = true; 1402 if (reader == null) { 1403 stream = is.getByteStream(); 1404 if (stream == null) { 1405 throw new IllegalArgumentException( 1406 "InputSource had neither a character stream nor a byt stream."); 1407 } 1408 if (encoding == null) { 1409 // sniff BOM 1410 if (!stream.markSupported()) { 1411 stream = new BufferedInputStream(stream); 1412 } 1413 stream.mark(1); 1414 int b = stream.read(); 1415 if (b == -1) { 1416 throw new IOException("Premature end of file."); 1417 } else if (b == 0xFE) { 1418 // first byte big endian 1419 b = stream.read(); 1420 if (b == -1) { 1421 throw new IOException("Premature end of file."); 1422 } else if (b == 0xFF) { 1423 swallowBom = false; 1424 encoding = "UTF-16"; 1425 reader = draconianInputStreamReader("UTF-16BE", stream, false); 1426 } else { 1427 cannotDetermineEncoding(); 1428 } 1429 } else if (b == 0xFF) { 1430 // first byte little endian 1431 b = stream.read(); 1432 if (b == -1) { 1433 throw new IOException("Premature end of file."); 1434 } else if (b == 0xFE) { 1435 swallowBom = 
false; 1436 encoding = "UTF-16"; 1437 reader = draconianInputStreamReader("UTF-16LE", stream, false); 1438 } else { 1439 cannotDetermineEncoding(); 1440 } 1441 } else if (b == 0xEF) { 1442 // first byte UTF-8 1443 b = stream.read(); 1444 if (b == -1) { 1445 throw new IOException("Premature end of file."); 1446 } else if (b == 0xBB) { 1447 b = stream.read(); 1448 if (b == -1) { 1449 throw new IOException("Premature end of file."); 1450 } else if (b == 0xBF) { 1451 swallowBom = false; 1452 encoding = "UTF-8"; 1453 reader = draconianInputStreamReader("UTF-8", stream, false); 1454 } else { 1455 cannotDetermineEncoding(); 1456 } 1457 } else { 1458 cannotDetermineEncoding(); 1459 } 1460 } else if (b < 0x80) { 1461 // no BOM 1462 swallowBom = false; 1463 stream.reset(); 1464 reader = new NonBufferingAsciiInputStreamReader(stream); 1465 } else { 1466 cannotDetermineEncoding(); 1467 } 1468 } else { 1469 reader = draconianInputStreamReader(encoding, stream, false); 1470 if ("UTF-16BE".equalsIgnoreCase(encoding) 1471 || "UTF-16LE".equalsIgnoreCase(encoding) 1472 || "UTF-32BE".equalsIgnoreCase(encoding) 1473 || "UTF-32LE".equalsIgnoreCase(encoding)) { 1474 swallowBom = false; 1475 } 1476 } 1477 } 1478 if (swallowBom) { 1479 // won't happen if charecter encoding not determined yet 1480 if (!reader.markSupported()) { 1481 reader = new BufferedReader(reader); 1482 } 1483 reader.mark(1); 1484 int c = reader.read(); 1485 if (c != 0xFEFF) { 1486 reader.reset(); 1487 } 1488 } 1489 } 1490 1491 /** 1492 * @throws IOException 1493 * 1494 */ 1495 private void cannotDetermineEncoding() throws IOException { 1496 throw new IOException( 1497 "Unable to determine the character encoding of the document. 
No external encoding information was provided and the first byte was not an ASCII byte but did not constitute a part of the Byte Order Mark."); 1498 } 1499 1500 /** 1501 * 1502 */ 1503 private Reader draconianInputStreamReader(String encoding, 1504 InputStream stream, boolean requireAsciiSuperset) throws SAXException { 1505 encoding = encoding.toUpperCase(); 1506 try { 1507 Charset cs = Charset.forName(encoding); 1508 String canonName = cs.name(); 1509 if (requireAsciiSuperset) { 1510 if (!EncodingInfo.isAsciiSuperset(canonName)) { 1511 fatal("The encoding \u201C" 1512 + encoding 1513 + "\u201D is not an ASCII superset and, therefore, cannot be used in an internal encoding declaration."); 1514 } 1515 } 1516 if (canonName.startsWith("X-") || canonName.startsWith("x-") 1517 || canonName.startsWith("Mac")) { 1518 if (encoding.startsWith("X-")) { 1519 err(encoding 1520 + " is not an IANA-registered encoding. (Charmod C022)"); 1521 } else { 1522 err(encoding 1523 + "is not an IANA-registered encoding and did\u2019t start with \u201CX-\u201D. (Charmod C023)"); 1524 } 1525 } else if (!canonName.equalsIgnoreCase(encoding)) { 1526 err(encoding 1527 + " is not the preferred name of the character encoding in use. The preferred name is " 1528 + canonName + ". (Charmod C024)"); 1529 } 1530 if (EncodingInfo.isObscure(canonName)) { 1531 warn("Character encoding " + encoding + " is not widely supported. 
Better interoperability may be achieved by using UTF-8."); 1532 } 1533 CharsetDecoder decoder = cs.newDecoder(); 1534 decoder.onMalformedInput(CodingErrorAction.REPORT); 1535 decoder.onUnmappableCharacter(CodingErrorAction.REPORT); 1536 return new InputStreamReader(stream, decoder); 1537 } catch (IllegalCharsetNameException e) { 1538 fatal("Illegal character encoding name: " + encoding); 1539 } catch (UnsupportedCharsetException e) { 1540 fatal("Unsupported character encoding: " + encoding); 1541 } 1542 return null; // keep the compiler happy 1543 } 1544 1545 /** 1546 * @see org.xml.sax.XMLReader#parse(java.lang.String) 1547 */ 1548 public void parse(String url) throws IOException, SAXException { 1549 // FIXME b0rked if no resolver 1550 parse(entityResolver.resolveEntity(url, null)); 1551 } 1552 1553 /** 1554 * @param string 1555 * @throws 1556 * @throws SAXException 1557 */ 1558 void setEncoding(String enc) throws SAXException { 1559 if (enc == null) { 1560 if (encoding == null) { 1561 if (stream != null) { 1562 // XXX should the parser default to US-ASCII instead? 1563 fatal("Character encoding information not available."); 1564 } 1565 } 1566 } else { 1567 if (encoding == null) { 1568 encoding = enc; 1569 if (stream != null) { 1570 reader = draconianInputStreamReader(encoding, stream, true); 1571 } 1572 } else { 1573 if (!encoding.equalsIgnoreCase(enc)) { 1574 err("Internal character encoding information is inconsistent with external information or the BOM."); 1575 } 1576 } 1577 } 1578 } 1579 1580 void setNonWhiteSpaceAllowed(boolean allow) { 1581 nonWhiteSpaceAllowed = allow; 1582 } 1583 1584 /** 1585 * Returns the doctypeMode. 1586 * 1587 * @return the doctypeMode 1588 */ 1589 public int getDoctypeMode() { 1590 return doctypeMode; 1591 } 1592 1593 /** 1594 * Sets the doctypeMode. 
*
     * @param doctypeMode the doctypeMode to set
     */
    public void setDoctypeMode(int doctypeMode) {
        this.doctypeMode = doctypeMode;
    }

    /**
     * Gives access to the handler currently stored in the
     * <code>doctypeHandler</code> field.
     *
     * @return the doctypeHandler, exactly as last stored
     */
    public DoctypeHandler getDoctypeHandler() {
        return doctypeHandler;
    }

    /**
     * Stores the given handler in the <code>doctypeHandler</code> field,
     * replacing whatever was there before.
     *
     * @param doctypeHandler the doctypeHandler to set
     */
    public void setDoctypeHandler(DoctypeHandler doctypeHandler) {
        this.doctypeHandler = doctypeHandler;
    }

    /**
     * Repeats the start-of-document sequence on the content handler: first
     * this parser is handed over as the document locator, then
     * <code>startDocument()</code> is fired.
     *
     * @throws SAXException if the content handler throws
     */
    public void refireStart() throws SAXException {
        ch.setDocumentLocator(this);
        ch.startDocument();
    }
}