001 /* XmlParser.java -- 002 Copyright (C) 1999,2000,2001 Free Software Foundation, Inc. 003 Portions Copyright 2006 Henri Sivonen. 004 005 This file is part of GNU JAXP. 006 007 GNU JAXP is free software; you can redistribute it and/or modify 008 it under the terms of the GNU General Public License as published by 009 the Free Software Foundation; either version 2, or (at your option) 010 any later version. 011 012 GNU JAXP is distributed in the hope that it will be useful, but 013 WITHOUT ANY WARRANTY; without even the implied warranty of 014 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 015 General Public License for more details. 016 017 You should have received a copy of the GNU General Public License 018 along with GNU JAXP; see the file COPYING. If not, write to the 019 Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 020 02111-1307 USA. 021 022 Linking this library statically or dynamically with other modules is 023 making a combined work based on this library. Thus, the terms and 024 conditions of the GNU General Public License cover the whole 025 combination. 026 027 As a special exception, the copyright holders of this library give you 028 permission to link this library with independent modules to produce an 029 executable, regardless of the license terms of these independent 030 modules, and to copy and distribute the resulting executable under 031 terms of your choice, provided that you also meet, for each linked 032 independent module, the terms and conditions of the license of that 033 module. An independent module is a module which is not derived from 034 or based on this library. If you modify this library, you may extend 035 this exception to your version of the library, but you are not 036 obligated to do so. If you do not wish to do so, delete this 037 exception statement from your version. 
038 039 Partly derived from code which carried the following notice: 040 041 Copyright (c) 1997, 1998 by Microstar Software Ltd. 042 043 AElfred is free for both commercial and non-commercial use and 044 redistribution, provided that Microstar's copyright and disclaimer are 045 retained intact. You are free to modify AElfred for your own use and 046 to redistribute AElfred with your modifications, provided that the 047 modifications are clearly documented. 048 049 This program is distributed in the hope that it will be useful, but 050 WITHOUT ANY WARRANTY; without even the implied warranty of 051 merchantability or fitness for a particular purpose. Please use it AT 052 YOUR OWN RISK. 053 */ 054 055 package fi.iki.hsivonen.gnu.xml.aelfred2; 056 057 import java.io.BufferedInputStream; 058 import java.io.EOFException; 059 import java.io.IOException; 060 import java.io.InputStream; 061 import java.io.InputStreamReader; 062 import java.io.Reader; 063 import java.nio.charset.CharacterCodingException; 064 import java.nio.charset.Charset; 065 import java.nio.charset.CharsetDecoder; 066 import java.nio.charset.CodingErrorAction; 067 import java.nio.charset.IllegalCharsetNameException; 068 import java.nio.charset.UnsupportedCharsetException; 069 import java.util.HashMap; 070 import java.util.Iterator; 071 import java.util.LinkedList; 072 073 import org.xml.sax.InputSource; 074 import org.xml.sax.SAXException; 075 076 import fi.iki.hsivonen.io.EncodingInfo; 077 import fi.iki.hsivonen.xml.checker.NormalizationChecker; 078 079 // Organized imports -- 2005-08-20 hsivonen 080 081 /** 082 * Parse XML documents and return parse events through call-backs. 083 * Use the <code>SAXDriver</code> class as your entry point, as all 084 * internal parser interfaces are subject to change. 
*
 * @author Written by David Megginson &lt;dmeggins@microstar.com&gt;
 *  (version 1.2a with bugfixes)
 * @author Updated by David Brownell &lt;dbrownell@users.sourceforge.net&gt;
 * @see SAXDriver
 */
final class XmlParser {

    // avoid slow per-character readCh()
    private final static boolean USE_CHEATS = true;

    ////////////////////////////////////////////////////////////////////////
    // Constants.
    ////////////////////////////////////////////////////////////////////////

    // Precomputed constant built from the UTF-16 surrogate range starts
    // (0xD800 high, 0xDC00 low) and the first supplementary code point.
    // NOTE(review): presumably added to (high << 10) + low to obtain the
    // code point; the use site is outside this part of the file.
    private static final int SURROGATE_OFFSET = 0x10000 - (0xD800 << 10) - 0xDC00;

    //
    // Constants for element content type.
    //

    /**
     * Constant: an element has not been declared.
     * @see #getElementContentType
     */
    public final static int CONTENT_UNDECLARED = 0;

    /**
     * Constant: the element has a content model of ANY.
     * @see #getElementContentType
     */
    public final static int CONTENT_ANY = 1;

    /**
     * Constant: the element has declared content of EMPTY.
     * @see #getElementContentType
     */
    public final static int CONTENT_EMPTY = 2;

    /**
     * Constant: the element has mixed content.
     * @see #getElementContentType
     */
    public final static int CONTENT_MIXED = 3;

    /**
     * Constant: the element has element content.
     * @see #getElementContentType
     */
    public final static int CONTENT_ELEMENTS = 4;


    //
    // Constants for the entity type.
    //

    /**
     * Constant: the entity has not been declared.
     * @see #getEntityType
     */
    public final static int ENTITY_UNDECLARED = 0;

    /**
     * Constant: the entity is internal.
     * @see #getEntityType
     */
    public final static int ENTITY_INTERNAL = 1;

    /**
     * Constant: the entity is external, non-parsable data.
     * @see #getEntityType
     */
    public final static int ENTITY_NDATA = 2;

    /**
     * Constant: the entity is external XML data.
* @see #getEntityType
     */
    public final static int ENTITY_TEXT = 3;

    //
    // Attribute type constants are interned literal strings.
    //

    //
    // Constants for attribute default value.
    //

    /**
     * Constant: the attribute is not declared.
     * @see #getAttributeDefaultValueType
     */
    public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30;

    /**
     * Constant: the attribute has a literal default value specified.
     * @see #getAttributeDefaultValueType
     * @see #getAttributeDefaultValue
     */
    public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31;

    /**
     * Constant: the attribute was declared #IMPLIED.
     * @see #getAttributeDefaultValueType
     */
    public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32;

    /**
     * Constant: the attribute was declared #REQUIRED.
     * @see #getAttributeDefaultValueType
     */
    public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33;

    /**
     * Constant: the attribute was declared #FIXED.
     * @see #getAttributeDefaultValueType
     * @see #getAttributeDefaultValue
     */
    public final static int ATTRIBUTE_DEFAULT_FIXED = 34;

    //
    // Constants for input (values stored in sourceType).
    //
    private final static int INPUT_NONE = 0;
    private final static int INPUT_INTERNAL = 1;
    private final static int INPUT_READER = 5;

    //
    // Flags for reading literals.
//
    // These are distinct bits and are OR-ed together by callers, e.g.
    // LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF.
    //
    // expand general entity refs (attribute values in dtd and content)
    private final static int LIT_ENTITY_REF = 2;
    // normalize this value (space chars) (attributes, public ids)
    private final static int LIT_NORMALIZE = 4;
    // literal is an attribute value
    private final static int LIT_ATTRIBUTE = 8;
    // don't expand parameter entities
    private final static int LIT_DISABLE_PE = 16;
    // don't expand [or parse] character refs
    private final static int LIT_DISABLE_CREF = 32;
    // don't parse general entity refs
    private final static int LIT_DISABLE_EREF = 64;
    // literal is a public ID value
    private final static int LIT_PUBID = 256;

    //
    // Flags affecting PE handling in DTDs (if expandPE is true).
    // PEs expand with space padding, except inside literals.
    //
    private final static int CONTEXT_NORMAL = 0;
    private final static int CONTEXT_LITERAL = 1;

    // Emit warnings for relative URIs with no base URI.
    // Set once, at class-initialization time, from the system property
    // "gnu.xml.aelfred2.XmlParser.uriWarnings"; stays false when the
    // property cannot be read because of a SecurityException.
    static boolean uriWarnings;
    static {
        String key = "gnu.xml.aelfred2.XmlParser.uriWarnings";
        try {
            uriWarnings = "true".equals(System.getProperty(key));
        } catch (SecurityException e) {
            uriWarnings = false;
        }
    }

    //
    // The current XML handler interface.
    //
    private SAXDriver handler;

    //
    // I/O information.
    //
    private Reader reader; // current reader
    private InputStream is; // current input stream
    private int line; // current line number
    private int column; // current column number
    private int sourceType; // type of input source
    private LinkedList<Input> inputStack; // stack of input sources
    private String characterEncoding; // current character encoding
    private int currentByteCount; // bytes read from current source
    private InputSource scratch; // temporary

    //
    // Buffers for decoded but unparsed character input.
//
    private char[] readBuffer;
    private int readBufferPos;
    private int readBufferLength;
    private int readBufferOverflow; // overflow from last data chunk.

    //
    // Buffer for undecoded raw byte input.
    //
    private final static int READ_BUFFER_MAX = 16384;
    private byte[] rawReadBuffer;


    //
    // Buffer for attribute values, char refs, DTD stuff.
    //
    // NOTE(review): DATA_BUFFER_INITIAL is not declared final although it is
    // named like a constant; confirm nothing assigns it before making it so.
    private static int DATA_BUFFER_INITIAL = 4096;
    private char[] dataBuffer;
    private int dataBufferPos;

    //
    // Buffer for parsed names.
    //
    // NOTE(review): NAME_BUFFER_INITIAL is likewise non-final.
    private static int NAME_BUFFER_INITIAL = 1024;
    private char[] nameBuffer;
    private int nameBufferPos;

    //
    // Save any standalone flag
    //
    private boolean docIsStandalone;

    //
    // Hashtables for DTD information on elements, entities, and notations.
    // Populated until we start ignoring decls (because of skipping a PE)
    //
    private HashMap<String, ElementDecl> elementInfo;
    private HashMap<String, EntityInfo> entityInfo;
    private HashMap<String, String> notationInfo;
    private boolean skippedPE;

    //
    // Element type currently in force.
    //
    private String currentElement;
    private int currentElementContent;

    //
    // Stack of entity names, to detect recursion.
    //
    private LinkedList<String> entityStack;

    //
    // PE expansion is enabled in most chunks of the DTD, not all.
    // When it's enabled, literals are treated differently.
    //
    private boolean inLiteral;
    private boolean expandPE;
    private boolean peIsError;

    //
    // can't report entity expansion inside two constructs:
    // - attribute expansions (internal entities only)
    // - markup declarations (parameter entities only)
    //
    private boolean doReport;

    //
    // Symbol table, for caching interned names.
//
    // These show up wherever XML names or nmtokens are used: naming elements,
    // attributes, PIs, notations, entities, and enumerated attribute values.
    //
    // NOTE: This hashtable doesn't grow. The default size is intended to be
    // rather large for most documents. Example: one snapshot of the DocBook
    // XML 4.1 DTD used only about 350 such names. As a rule, only pathological
    // documents (ones that don't reuse names) should ever see much collision.
    //
    // Be sure that SYMBOL_TABLE_LENGTH always stays prime, for best hashing.
    // "2039" keeps the hash table size at about two memory pages on typical
    // 32 bit hardware.
    //
    private final static int SYMBOL_TABLE_LENGTH = 2039;

    private Object[][] symbolTable;

    //
    // Hash table of attributes found in current start tag.
    //
    private String[] tagAttributes;
    private int tagAttributePos;

    //
    // Utility flag: have we noticed a CR while reading the last
    // data chunk? If so, we will have to go back and normalise
    // CR or CR/LF line ends.
    //
    private boolean sawCR;

    //
    // Utility flag: are we in CDATA? If so, whitespace isn't ignorable.
    //
    private boolean inCDATA;

    //
    // Xml version.
    //
    private static final int XML_10 = 0;
    private static final int XML_11 = 1;
    private int xmlVersion = XML_10;

    //
    // Normalization checking
    //

    // May be null; parseDocument() only flushes it when it is non-null.
    private NormalizationChecker normalizationChecker;

    //////////////////////////////////////////////////////////////////////
    // Constructors.
    //////////////////////////////////////////////////////////////////////

    /**
     * Construct a new parser with no associated handler.
     * @see #setHandler
     * @see #parse
     */
    // package private
    XmlParser() {
    }

    /**
     * Set the handler that will receive parsing events.
     * @param handler The handler to receive callback events.
407 * @see #parse 408 */ 409 // package private 410 void setHandler(SAXDriver handler) 411 { 412 this.handler = handler; 413 } 414 415 /** 416 * Parse an XML document from the character stream, byte stream, or URI 417 * that you provide (in that order of preference). Any URI that you 418 * supply will become the base URI for resolving relative URI, and may 419 * be used to acquire a reader or byte stream. 420 * 421 * <p> Only one thread at a time may use this parser; since it is 422 * private to this package, post-parse cleanup is done by the caller, 423 * which MUST NOT REUSE the parser (just null it). 424 * 425 * @param systemId Absolute URI of the document; should never be null, 426 * but may be so iff a reader <em>or</em> a stream is provided. 427 * @param publicId The public identifier of the document, or null. 428 * @param reader A character stream; must be null if stream isn't. 429 * @param stream A byte input stream; must be null if reader isn't. 430 * @param characterEncoding The suggested encoding, or null if unknown. 431 * @exception java.lang.Exception Basically SAXException or IOException 432 */ 433 // package private 434 void doParse(String systemId, String publicId, Reader reader, 435 InputStream stream, String encoding) 436 throws Exception 437 { 438 if (handler == null) 439 { 440 throw new IllegalStateException("no callback handler"); 441 } 442 443 alreadyWarnedAboutPrivateUseCharacters = false; 444 initializeVariables(); 445 446 // predeclare the built-in entities here (replacement texts) 447 // we don't need to intern(), since we're guaranteed literals 448 // are always (globally) interned. 449 setInternalEntity("amp", "&"); 450 setInternalEntity("lt", "<"); 451 setInternalEntity("gt", ">"); 452 setInternalEntity("apos", "'"); 453 setInternalEntity("quot", """); 454 455 try 456 { 457 // pushURL first to ensure locator is correct in startDocument 458 // ... it might report an IO or encoding exception. 
459 handler.startDocument(); 460 pushURL(false, "[document]", 461 // default baseURI: null 462 new ExternalIdentifiers(publicId, systemId, null), 463 reader, stream, encoding, false); 464 465 parseDocument(); 466 } 467 catch (EOFException e) 468 { 469 //empty input 470 fatal("empty document, with no root element."); 471 } 472 finally 473 { 474 if (reader != null) 475 { 476 try 477 { 478 reader.close(); 479 } 480 catch (IOException e) 481 { 482 /* ignore */ 483 } 484 } 485 if (stream != null) 486 { 487 try 488 { 489 stream.close(); 490 } 491 catch (IOException e) 492 { 493 /* ignore */ 494 } 495 } 496 if (is != null) 497 { 498 try 499 { 500 is.close(); 501 } 502 catch (IOException e) 503 { 504 /* ignore */ 505 } 506 } 507 } 508 } 509 510 ////////////////////////////////////////////////////////////////////// 511 // Error reporting. 512 ////////////////////////////////////////////////////////////////////// 513 514 /** 515 * Report an error. 516 * @param message The error message. 517 * @param textFound The text that caused the error (or null). 518 * @see SAXDriver#error 519 * @see #line 520 */ 521 private void fatal(String message, String textFound, String textExpected) 522 throws SAXException 523 { 524 // smart quotes -- 2005-08-20 hsivonen 525 if (textFound != null) 526 { 527 message = message + " (found \u201C" + textFound + "\u201D)"; 528 } 529 if (textExpected != null) 530 { 531 message = message + " (expected \u201C" + textExpected + "\u201D)"; 532 } 533 handler.fatal(message); 534 535 // "can't happen" 536 throw new SAXException(message); 537 } 538 539 /** 540 * Report a serious error. 541 * @param message The error message. 542 * @param textFound The text that caused the error (or null). 543 */ 544 private void fatal(String message, char textFound, String textExpected) 545 throws SAXException 546 { 547 fatal(message, new Character(textFound).toString(), textExpected); 548 } 549 550 /** 551 * Report typical case fatal errors. 
552 */ 553 private void fatal(String message) 554 throws SAXException 555 { 556 handler.fatal(message); 557 } 558 559 /** 560 * Report non-fatal errors. 561 */ 562 private void err(String message) 563 throws SAXException 564 { 565 handler.verror(message); 566 } 567 568 ////////////////////////////////////////////////////////////////////// 569 // Major syntactic productions. 570 ////////////////////////////////////////////////////////////////////// 571 572 /** 573 * Parse an XML document. 574 * <pre> 575 * [1] document ::= prolog element Misc* 576 * </pre> 577 * <p>This is the top-level parsing function for a single XML 578 * document. As a minimum, a well-formed document must have 579 * a document element, and a valid document must have a prolog 580 * (one with doctype) as well. 581 */ 582 private void parseDocument() 583 throws Exception 584 { 585 try 586 { // added by MHK 587 boolean sawDTD = parseProlog(); 588 require('<'); 589 parseElement(!sawDTD); 590 } 591 catch (EOFException ee) 592 { // added by MHK 593 fatal("premature end of file", "[EOF]", null); 594 } 595 596 try 597 { 598 parseMisc(); //skip all white, PIs, and comments 599 char c = readCh(); //if this doesn't throw an exception... 600 fatal("unexpected characters after document end", c, null); 601 } 602 catch (EOFException e) 603 { 604 if (normalizationChecker != null) { 605 normalizationChecker.flush(); 606 } 607 return; 608 } 609 } 610 611 static final char[] startDelimComment = { '<', '!', '-', '-' }; 612 static final char[] endDelimComment = { '-', '-' }; 613 614 /** 615 * Skip a comment. 616 * <pre> 617 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* "-->" 618 * </pre> 619 * <p> (The <code><!--</code> has already been read.) 
620 */ 621 private void parseComment() 622 throws Exception 623 { 624 boolean saved = expandPE; 625 626 expandPE = false; 627 parseUntil(endDelimComment); 628 require('>'); 629 expandPE = saved; 630 handler.comment(dataBuffer, 0, dataBufferPos); 631 dataBufferPos = 0; 632 } 633 634 static final char[] startDelimPI = { '<', '?' }; 635 static final char[] endDelimPI = { '?', '>' }; 636 637 /** 638 * Parse a processing instruction and do a call-back. 639 * <pre> 640 * [16] PI ::= '<?' PITarget 641 * (S (Char* - (Char* '?>' Char*)))? 642 * '?>' 643 * [17] PITarget ::= Name - ( ('X'|'x') ('M'|m') ('L'|l') ) 644 * </pre> 645 * <p> (The <code><?</code> has already been read.) 646 */ 647 private void parsePI() 648 throws SAXException, IOException 649 { 650 String name; 651 boolean saved = expandPE; 652 653 expandPE = false; 654 name = readNmtoken(true); 655 //NE08 656 if (name.indexOf(':') >= 0) 657 { 658 fatal("Illegal character(':') in processing instruction name ", 659 name, null); 660 } 661 if ("xml".equalsIgnoreCase(name)) 662 { 663 fatal("Illegal processing instruction target", name, null); 664 } 665 if (!tryRead(endDelimPI)) 666 { 667 requireWhitespace(); 668 parseUntil(endDelimPI); 669 } 670 expandPE = saved; 671 handler.processingInstruction(name, dataBufferToString()); 672 } 673 674 static final char[] endDelimCDATA = { ']', ']', '>' }; 675 676 private boolean isDirtyCurrentElement; 677 678 private boolean alreadyWarnedAboutPrivateUseCharacters; 679 680 private char prev; 681 682 /** 683 * Parse a CDATA section. 684 * <pre> 685 * [18] CDSect ::= CDStart CData CDEnd 686 * [19] CDStart ::= '<![CDATA[' 687 * [20] CData ::= (Char* - (Char* ']]>' Char*)) 688 * [21] CDEnd ::= ']]>' 689 * </pre> 690 * <p> (The '<![CDATA[' has already been read.) 691 */ 692 private void parseCDSect() 693 throws Exception 694 { 695 parseUntil(endDelimCDATA); 696 dataBufferFlush(); 697 } 698 699 /** 700 * Parse the prolog of an XML document. 701 * <pre> 702 * [22] prolog ::= XMLDecl? 
Misc* (Doctypedecl Misc*)? 703 * </pre> 704 * <p>We do not look for the XML declaration here, because it was 705 * handled by pushURL (). 706 * @see pushURL 707 * @return true if a DTD was read. 708 */ 709 private boolean parseProlog() 710 throws Exception 711 { 712 parseMisc(); 713 714 if (tryRead("<!DOCTYPE")) 715 { 716 parseDoctypedecl(); 717 parseMisc(); 718 return true; 719 } 720 return false; 721 } 722 723 private void checkLegalVersion(String version) 724 throws SAXException 725 { 726 int len = version.length(); 727 for (int i = 0; i < len; i++) 728 { 729 char c = version.charAt(i); 730 if ('0' <= c && c <= '9') 731 { 732 continue; 733 } 734 if (c == '_' || c == '.' || c == ':' || c == '-') 735 { 736 continue; 737 } 738 if ('a' <= c && c <= 'z') 739 { 740 continue; 741 } 742 if ('A' <= c && c <= 'Z') 743 { 744 continue; 745 } 746 fatal ("illegal character in version", version, "1.0"); 747 } 748 } 749 750 /** 751 * Parse the XML declaration. 752 * <pre> 753 * [23] XMLDecl ::= '<?xml' VersionInfo EncodingDecl? SDDecl? S? '?>' 754 * [24] VersionInfo ::= S 'version' Eq 755 * ("'" VersionNum "'" | '"' VersionNum '"' ) 756 * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')* 757 * [32] SDDecl ::= S 'standalone' Eq 758 * ( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' ) 759 * [80] EncodingDecl ::= S 'encoding' Eq 760 * ( "'" EncName "'" | "'" EncName "'" ) 761 * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* 762 * </pre> 763 * <p> (The <code><?xml</code> and whitespace have already been read.) 764 * @return the encoding in the declaration, uppercased; or null 765 * @see #parseTextDecl 766 * @see #setupDecoding 767 */ 768 private String parseXMLDecl(String encoding) 769 throws SAXException, IOException 770 { 771 String version; 772 String encodingName = null; 773 String standalone = null; 774 int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; 775 776 // Read the version. 
777 require("version"); 778 parseEq(); 779 checkLegalVersion(version = readLiteral(flags)); 780 if (!version.equals("1.0")) 781 { 782 if (version.equals("1.1")) 783 { 784 fatal("XML 1.1 not supported."); // 2006-04-24 hsivonen 785 } 786 else 787 { 788 fatal("illegal XML version", version, "1.0"); // removed 1.1 -- 2006-04-24 hsivonen 789 } 790 } 791 else 792 { 793 xmlVersion = XML_10; 794 } 795 // Try reading an encoding declaration. 796 boolean white = tryWhitespace(); 797 798 if (tryRead("encoding")) 799 { 800 if (!white) 801 { 802 fatal("whitespace required before 'encoding='"); 803 } 804 parseEq(); 805 encodingName = readLiteral(flags); 806 checkEncodingLiteral(encodingName); // 2006-04-28 hsivonen 807 if (reader == null) 808 { 809 draconianInputStreamReader(encodingName, is, true); 810 } 811 else 812 { 813 checkEncodingMatch(encoding, encodingName); 814 } 815 } 816 817 // Try reading a standalone declaration 818 if (encodingName != null) 819 { 820 white = tryWhitespace(); 821 } 822 else 823 { 824 if (encoding == null) 825 { 826 draconianInputStreamReader("UTF-8", is, false); // 2006-04-24 hsivonen 827 } 828 warnAboutLackOfEncodingDecl(encoding); 829 } 830 if (tryRead("standalone")) 831 { 832 if (!white) 833 { 834 fatal("whitespace required before 'standalone='"); 835 } 836 parseEq(); 837 standalone = readLiteral(flags); 838 if ("yes".equals(standalone)) 839 { 840 docIsStandalone = true; 841 } 842 else if (!"no".equals(standalone)) 843 { 844 fatal("standalone flag must be 'yes' or 'no'"); 845 } 846 } 847 848 skipWhitespace(); 849 require("?>"); 850 851 return encodingName; 852 } 853 854 // hsivonen 2006-04-28 855 private void checkEncodingLiteral(String encodingName) 856 throws SAXException 857 { 858 if (encodingName == null) 859 { 860 return; 861 } 862 if (encodingName.length() == 0) 863 { 864 fatal("The empty string does not a legal encoding name."); 865 } 866 char c = encodingName.charAt(0); 867 if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) 868 { 
869 fatal("The encoding name must start with an ASCII letter."); 870 } 871 for (int i = 1; i < encodingName.length(); i++) 872 { 873 c = encodingName.charAt(i); 874 if (!((c >= 'a' && c <= 'z') 875 || (c >= 'A' && c <= 'Z') 876 || (c >= '0' && c <= '9') 877 || (c == '.') 878 || (c == '_') 879 || (c == '-'))) 880 { 881 fatal("Illegal character in encoding name: U+" + Integer.toHexString(c) + "."); 882 } 883 } 884 } 885 886 /** 887 * Parse a text declaration. 888 * <pre> 889 * [79] TextDecl ::= '<?xml' VersionInfo? EncodingDecl S? '?>' 890 * [80] EncodingDecl ::= S 'encoding' Eq 891 * ( '"' EncName '"' | "'" EncName "'" ) 892 * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* 893 * </pre> 894 * <p> (The <code><?xml</code>' and whitespace have already been read.) 895 * @return the encoding in the declaration, uppercased; or null 896 * @see #parseXMLDecl 897 * @see #setupDecoding 898 */ 899 private String parseTextDecl(String encoding) 900 throws SAXException, IOException 901 { 902 String encodingName = null; 903 int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; 904 905 // Read an optional version. 906 if (tryRead ("version")) 907 { 908 String version; 909 parseEq(); 910 checkLegalVersion(version = readLiteral(flags)); 911 if (!version.equals("1.0")) 912 { 913 if (version.equals("1.1")) 914 { 915 fatal("XML 1.1 not supported."); // 2006-04-24 hsivonen 916 } 917 else 918 { 919 fatal("illegal XML version", version, "1.0"); // removed 1.1 -- 2006-04-24 hsivonen 920 } 921 } 922 requireWhitespace(); 923 } 924 925 // Read the encoding. 
926 require("encoding"); 927 parseEq(); 928 encodingName = readLiteral(flags); 929 checkEncodingLiteral(encodingName); // 2006-04-28 hsivonen 930 if (reader == null) 931 { 932 draconianInputStreamReader(encodingName, is, true); 933 } 934 else 935 { 936 checkEncodingMatch(encoding, encodingName); 937 } 938 skipWhitespace(); 939 require("?>"); 940 941 return encodingName; 942 } 943 944 private void checkEncodingMatch(String used, String detected) throws SAXException { 945 // method added -- 2006-02-03 hsivonen 946 if (used == null) { 947 if (!characterEncoding.equals(detected)) { 948 fatal("Declared character encoding was not the one sniffed from the BOM.", detected, characterEncoding); 949 } 950 } else { 951 if (!"".equals(used) && !used.equalsIgnoreCase(detected)) 952 { 953 handler.warn("External encoding information specified " + used 954 + ", but XML declaration specified " + detected 955 + ". Allowing external to override per RFC 3023. The well-formedness status of this document may change when decoupled from the external character encoding information."); 956 } 957 } 958 } 959 960 private void draconianInputStreamReader(String encoding, 961 InputStream stream, boolean requireAsciiSuperset) 962 throws SAXException, IOException 963 { 964 draconianInputStreamReader(encoding, stream, requireAsciiSuperset, encoding); 965 } 966 967 private void draconianInputStreamReader(String encoding, 968 InputStream stream, boolean requireAsciiSuperset, String actualName) 969 throws SAXException, IOException 970 { 971 // method added -- 2005-08-21 hsivonen 972 sourceType = INPUT_READER; 973 characterEncoding = actualName.toUpperCase(); 974 encoding = encoding.toUpperCase(); 975 try 976 { 977 Charset cs = Charset.forName(encoding); 978 String canonName = cs.name(); 979 if (requireAsciiSuperset) 980 { 981 if (!EncodingInfo.isAsciiSuperset(canonName)) 982 { 983 fatal("The encoding \u201C" + encoding + "\u201D is not an ASCII superset and, therefore, cannot be used in an internal 
encoding declaration."); 984 } 985 } 986 if (canonName.startsWith("X-") || canonName.startsWith("x-") || canonName.startsWith("Mac")) 987 { 988 if (encoding.startsWith("X-")) 989 { 990 err(encoding + " is not an IANA-registered encoding. (Charmod C022)"); 991 } 992 else 993 { 994 err(encoding + "is not an IANA-registered encoding and did\u2019t start with \u201CX-\u201D. (Charmod C023)"); 995 } 996 } 997 else if (!canonName.equalsIgnoreCase(encoding)) 998 { 999 err(encoding 1000 + " is not the preferred name of the character encoding in use. The preferred name is " 1001 + canonName + ". (Charmod C024)"); 1002 } 1003 if (!("UTF-8".equals(encoding) || 1004 "UTF-16".equals(encoding) || 1005 "UTF-16BE".equals(encoding) || 1006 "UTF-16LE".equals(encoding) || 1007 "ISO-8859-1".equals(encoding) || 1008 "US-ASCII".equals(encoding))) 1009 { 1010 handler.warn( 1011 "XML processors are required to support the UTF-8 and UTF-16 character encodings. The encoding was " 1012 + actualName + " instead, which is an incompatibility risk."); 1013 } 1014 CharsetDecoder decoder = cs.newDecoder(); 1015 decoder.onMalformedInput(CodingErrorAction.REPORT); 1016 decoder.onUnmappableCharacter(CodingErrorAction.REPORT); 1017 this.reader = new InputStreamReader(stream, decoder); 1018 } 1019 catch(IllegalCharsetNameException e) 1020 { 1021 fatal("Illegal character encoding name: "+ encoding); 1022 } 1023 catch (UnsupportedCharsetException e) 1024 { 1025 handler.fatal("Unsupported character encoding: "+ encoding); 1026 } 1027 } 1028 1029 /** 1030 * Parse miscellaneous markup outside the document element and DOCTYPE 1031 * declaration. 
* <pre>
     * [27] Misc ::= Comment | PI | S
     * </pre>
     * <p> Loops until the next input is neither whitespace, a processing
     * instruction, nor a comment.
     */
    private void parseMisc() throws Exception {
        while (true) {
            skipWhitespace();
            if (tryRead(startDelimPI)) {
                parsePI();
            } else if (tryRead(startDelimComment)) {
                parseComment();
            } else {
                return;
            }
        }
    }

    /**
     * Parse a document type declaration.
     * <pre>
     * [28] doctypedecl ::= '&lt;!DOCTYPE' S Name (S ExternalID)? S?
     *    ('[' (markupdecl | PEReference | S)* ']' S?)? '&gt;'
     * </pre>
     * <p> (The <code>&lt;!DOCTYPE</code> has already been read.)
     * <p> The internal subset, if any, is parsed first. An external subset
     * is then read either from the declared system identifier or, when
     * none was declared, from whatever the handler's
     * <code>getExternalSubset</code> returns.
     */
    private void parseDoctypedecl() throws Exception {
        String rootName;
        ExternalIdentifiers ids;

        // Read the document type name.
        requireWhitespace();
        rootName = readNmtoken(true);

        // Read the External subset's IDs
        skipWhitespace();
        ids = readExternalIds(false, true);

        // report (a) declaration of name, (b) lexical info (ids)
        handler.doctypeDecl(rootName, ids.publicId, ids.systemId);

        // Internal subset is parsed first, if present
        skipWhitespace();
        if (tryRead('[')) {

            // loop until the subset ends
            while (true) {
                // PE expansion and reporting are on only while skipping
                // the whitespace between declarations
                doReport = expandPE = true;
                skipWhitespace();
                doReport = expandPE = false;
                if (tryRead(']')) {
                    break; // end of subset
                } else {
                    // WFC, PEs in internal subset (only between decls)
                    peIsError = expandPE = true;
                    parseMarkupdecl();
                    peIsError = expandPE = false;
                }
            }
        }
        skipWhitespace();
        require('>');

        // Read the external subset, if any
        InputSource subset;

        if (ids.systemId == null) {
            // nothing declared: give the handler a chance to supply one
            subset = handler.getExternalSubset(rootName,
                    handler.getSystemId());
        } else {
            subset = null;
        }
        if (ids.systemId != null || subset != null) {
            // sentinel: the loop below stops once this '>' is read back,
            // i.e. once the external subset's input is used up
            pushString(null, ">");

            // NOTE: [dtd] is so we say what SAX2 expects,
            // though it's misleading (subset, not entire dtd)
            if (ids.systemId != null) {
                pushURL(true, "[dtd]", ids, null, null, null, true);
            } else {
                handler.warn("modifying document by adding external subset");
                pushURL(true, "[dtd]",
                        new ExternalIdentifiers(subset.getPublicId(),
                                subset.getSystemId(),
                                null),
                        subset.getCharacterStream(),
                        subset.getByteStream(),
                        subset.getEncoding(),
                        false);
            }

            // Loop until we end up back at '>'
            while (true) {
                doReport = expandPE = true;
                skipWhitespace();
                doReport = expandPE = false;
                if (tryRead('>')) {
                    break;
                } else {
                    expandPE = true;
                    parseMarkupdecl();
                    expandPE = false;
                }
            }

            // the ">" string isn't popped yet
            if (inputStack.size() != 1) {
                fatal("external subset has unmatched '>'");
            }
        }

        // done dtd
        handler.endDoctype();
        expandPE = false;
        doReport = true;
    }

    /**
     * Parse a markup declaration in the internal or external DTD subset.
     * <pre>
     * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl
     *    | NotationDecl | PI | Comment
     * [30] extSubsetDecl ::= (markupdecl | conditionalSect
     *    | PEReference | S) *
     * </pre>
     * <p> Reading toplevel PE references is handled as a lexical issue
     * by the caller, as is whitespace.
1185 */ 1186 private void parseMarkupdecl() 1187 throws Exception 1188 { 1189 char[] saved = null; 1190 boolean savedPE = expandPE; 1191 1192 // prevent "<%foo;" and ensures saved entity is right 1193 require('<'); 1194 unread('<'); 1195 expandPE = false; 1196 1197 if (tryRead("<!ELEMENT")) 1198 { 1199 saved = readBuffer; 1200 expandPE = savedPE; 1201 parseElementDecl(); 1202 } 1203 else if (tryRead("<!ATTLIST")) 1204 { 1205 saved = readBuffer; 1206 expandPE = savedPE; 1207 parseAttlistDecl(); 1208 } 1209 else if (tryRead("<!ENTITY")) 1210 { 1211 saved = readBuffer; 1212 expandPE = savedPE; 1213 parseEntityDecl(); 1214 } 1215 else if (tryRead("<!NOTATION")) 1216 { 1217 saved = readBuffer; 1218 expandPE = savedPE; 1219 parseNotationDecl(); 1220 } 1221 else if (tryRead(startDelimPI)) 1222 { 1223 saved = readBuffer; 1224 expandPE = savedPE; 1225 parsePI(); 1226 } 1227 else if (tryRead(startDelimComment)) 1228 { 1229 saved = readBuffer; 1230 expandPE = savedPE; 1231 parseComment(); 1232 } 1233 else if (tryRead("<![")) 1234 { 1235 saved = readBuffer; 1236 expandPE = savedPE; 1237 if (inputStack.size() > 0) 1238 { 1239 parseConditionalSect(saved); 1240 } 1241 else 1242 { 1243 fatal("conditional sections illegal in internal subset"); 1244 } 1245 } 1246 else 1247 { 1248 fatal("expected markup declaration"); 1249 } 1250 1251 // VC: Proper Decl/PE Nesting 1252 if (readBuffer != saved) 1253 { 1254 handler.verror("Illegal Declaration/PE nesting"); 1255 } 1256 } 1257 1258 /** 1259 * Parse an element, with its tags. 1260 * <pre> 1261 * [39] element ::= EmptyElementTag | STag content ETag 1262 * [40] STag ::= '<' Name (S Attribute)* S? '>' 1263 * [44] EmptyElementTag ::= '<' Name (S Attribute)* S? '/>' 1264 * </pre> 1265 * <p> (The '<' has already been read.) 1266 * <p>NOTE: this method actually chains onto parseContent (), if necessary, 1267 * and parseContent () will take care of calling parseETag (). 
1268 */ 1269 private void parseElement(boolean maybeGetSubset) 1270 throws Exception 1271 { 1272 String gi; 1273 char c; 1274 int oldElementContent = currentElementContent; 1275 String oldElement = currentElement; 1276 ElementDecl element; 1277 1278 // This is the (global) counter for the 1279 // array of specified attributes. 1280 tagAttributePos = 0; 1281 1282 // Read the element type name. 1283 gi = readNmtoken(true); 1284 1285 // If we saw no DTD, and this is the document root element, 1286 // let the application modify the input stream by providing one. 1287 if (maybeGetSubset) 1288 { 1289 InputSource subset = handler.getExternalSubset(gi, 1290 handler.getSystemId()); 1291 if (subset != null) 1292 { 1293 String publicId = subset.getPublicId(); 1294 String systemId = subset.getSystemId(); 1295 1296 handler.warn("modifying document by adding DTD"); 1297 handler.doctypeDecl(gi, publicId, systemId); 1298 pushString(null, ">"); 1299 1300 // NOTE: [dtd] is so we say what SAX2 expects, 1301 // though it's misleading (subset, not entire dtd) 1302 pushURL(true, "[dtd]", 1303 new ExternalIdentifiers(publicId, systemId, null), 1304 subset.getCharacterStream(), 1305 subset.getByteStream(), 1306 subset.getEncoding(), 1307 false); 1308 1309 // Loop until we end up back at '>' 1310 while (true) 1311 { 1312 doReport = expandPE = true; 1313 skipWhitespace(); 1314 doReport = expandPE = false; 1315 if (tryRead('>')) 1316 { 1317 break; 1318 } 1319 else 1320 { 1321 expandPE = true; 1322 parseMarkupdecl(); 1323 expandPE = false; 1324 } 1325 } 1326 1327 // the ">" string isn't popped yet 1328 if (inputStack.size() != 1) 1329 { 1330 fatal("external subset has unmatched '>'"); 1331 } 1332 1333 handler.endDoctype(); 1334 } 1335 } 1336 1337 // Determine the current content type. 1338 currentElement = gi; 1339 element = elementInfo.get(gi); 1340 currentElementContent = getContentType(element, CONTENT_ANY); 1341 1342 // Read the attributes, if any. 
1343 // After this loop, "c" is the closing delimiter. 1344 boolean white = tryWhitespace(); 1345 c = readCh(); 1346 while (c != '/' && c != '>') 1347 { 1348 unread(c); 1349 if (!white) 1350 { 1351 fatal("need whitespace between attributes"); 1352 } 1353 parseAttribute(gi); 1354 white = tryWhitespace(); 1355 c = readCh(); 1356 } 1357 1358 // Supply any defaulted attributes. 1359 Iterator<String> atts = declaredAttributes(element); 1360 if (atts != null) 1361 { 1362 String aname; 1363 loop: 1364 while (atts.hasNext()) 1365 { 1366 aname = atts.next(); 1367 // See if it was specified. 1368 for (int i = 0; i < tagAttributePos; i++) 1369 { 1370 if (tagAttributes[i] == aname) 1371 { 1372 continue loop; 1373 } 1374 } 1375 // ... or has a default 1376 String value = getAttributeDefaultValue(gi, aname); 1377 1378 if (value == null) 1379 { 1380 continue; 1381 } 1382 handler.attribute(aname, value, false); 1383 } 1384 } 1385 1386 // Figure out if this is a start tag 1387 // or an empty element, and dispatch an 1388 // event accordingly. 1389 switch (c) 1390 { 1391 case '>': 1392 handler.startElement(gi); 1393 parseContent(); 1394 break; 1395 case '/': 1396 require('>'); 1397 handler.startElement(gi); 1398 handler.endElement(gi); 1399 break; 1400 } 1401 1402 // Restore the previous state. 1403 currentElement = oldElement; 1404 currentElementContent = oldElementContent; 1405 } 1406 1407 /** 1408 * Parse an attribute assignment. 1409 * <pre> 1410 * [41] Attribute ::= Name Eq AttValue 1411 * </pre> 1412 * @param name The name of the attribute's element. 1413 * @see SAXDriver#attribute 1414 */ 1415 private void parseAttribute(String name) 1416 throws Exception 1417 { 1418 String aname; 1419 String type; 1420 String value; 1421 int flags = LIT_ATTRIBUTE | LIT_ENTITY_REF; 1422 1423 // Read the attribute name. 
1424 aname = readNmtoken(true); 1425 type = getAttributeType(name, aname); 1426 1427 // Parse '=' 1428 parseEq(); 1429 1430 // Read the value, normalizing whitespace 1431 // unless it is CDATA. 1432 if (handler.stringInterning) 1433 { 1434 if (type == "CDATA" || type == null) 1435 { 1436 value = readLiteral(flags); 1437 } 1438 else 1439 { 1440 value = readLiteral(flags | LIT_NORMALIZE); 1441 } 1442 } 1443 else 1444 { 1445 if (type.equals("CDATA") || type == null) 1446 { 1447 value = readLiteral(flags); 1448 } 1449 else 1450 { 1451 value = readLiteral(flags | LIT_NORMALIZE); 1452 } 1453 } 1454 1455 // WFC: no duplicate attributes 1456 for (int i = 0; i < tagAttributePos; i++) 1457 { 1458 if (aname.equals(tagAttributes [i])) 1459 { 1460 fatal("duplicate attribute", aname, null); 1461 } 1462 } 1463 1464 // Inform the handler about the 1465 // attribute. 1466 handler.attribute(aname, value, true); 1467 dataBufferPos = 0; 1468 1469 // Note that the attribute has been 1470 // specified. 1471 if (tagAttributePos == tagAttributes.length) 1472 { 1473 String newAttrib[] = new String[tagAttributes.length * 2]; 1474 System.arraycopy(tagAttributes, 0, newAttrib, 0, tagAttributePos); 1475 tagAttributes = newAttrib; 1476 } 1477 tagAttributes[tagAttributePos++] = aname; 1478 } 1479 1480 /** 1481 * Parse an equals sign surrounded by optional whitespace. 1482 * <pre> 1483 * [25] Eq ::= S? '=' S? 1484 * </pre> 1485 */ 1486 private void parseEq() 1487 throws SAXException, IOException 1488 { 1489 skipWhitespace(); 1490 require('='); 1491 skipWhitespace(); 1492 } 1493 1494 /** 1495 * Parse an end tag. 1496 * <pre> 1497 * [42] ETag ::= '</' Name S? '>' 1498 * </pre> 1499 * <p>NOTE: parseContent () chains to here, we already read the 1500 * "</". 
1501 */ 1502 private void parseETag() 1503 throws Exception 1504 { 1505 require(currentElement); 1506 skipWhitespace(); 1507 require('>'); 1508 handler.endElement(currentElement); 1509 // not re-reporting any SAXException re bogus end tags, 1510 // even though that diagnostic might be clearer ... 1511 } 1512 1513 /** 1514 * Parse the content of an element. 1515 * <pre> 1516 * [43] content ::= (element | CharData | Reference 1517 * | CDSect | PI | Comment)* 1518 * [67] Reference ::= EntityRef | CharRef 1519 * </pre> 1520 * <p> NOTE: consumes ETtag. 1521 */ 1522 private void parseContent() 1523 throws Exception 1524 { 1525 char c; 1526 1527 while (true) 1528 { 1529 // consume characters (or ignorable whitspace) until delimiter 1530 parseCharData(); 1531 1532 // Handle delimiters 1533 c = readCh(); 1534 switch (c) 1535 { 1536 case '&': // Found "&" 1537 c = readCh(); 1538 if (c == '#') 1539 { 1540 parseCharRef(); 1541 } 1542 else 1543 { 1544 unread(c); 1545 parseEntityRef(true); 1546 } 1547 isDirtyCurrentElement = true; 1548 break; 1549 1550 case '<': // Found "<" 1551 dataBufferFlush(); 1552 c = readCh(); 1553 switch (c) 1554 { 1555 case '!': // Found "<!" 1556 c = readCh(); 1557 switch (c) 1558 { 1559 case '-': // Found "<!-" 1560 require('-'); 1561 isDirtyCurrentElement = false; 1562 parseComment(); 1563 break; 1564 case '[': // Found "<![" 1565 isDirtyCurrentElement = false; 1566 require("CDATA["); 1567 handler.startCDATA(); 1568 inCDATA = true; 1569 parseCDSect(); 1570 inCDATA = false; 1571 handler.endCDATA(); 1572 break; 1573 default: 1574 fatal("expected comment or CDATA section", c, null); 1575 break; 1576 } 1577 break; 1578 1579 case '?': // Found "<?" 
1580 isDirtyCurrentElement = false; 1581 parsePI(); 1582 break; 1583 1584 case '/': // Found "</" 1585 isDirtyCurrentElement = false; 1586 parseETag(); 1587 return; 1588 1589 default: // Found "<" followed by something else 1590 isDirtyCurrentElement = false; 1591 unread(c); 1592 parseElement(false); 1593 break; 1594 } 1595 } 1596 } 1597 } 1598 1599 /** 1600 * Parse an element type declaration. 1601 * <pre> 1602 * [45] elementdecl ::= '<!ELEMENT' S Name S contentspec S? '>' 1603 * </pre> 1604 * <p> NOTE: the '<!ELEMENT' has already been read. 1605 */ 1606 private void parseElementDecl() 1607 throws Exception 1608 { 1609 String name; 1610 1611 requireWhitespace(); 1612 // Read the element type name. 1613 name = readNmtoken(true); 1614 1615 requireWhitespace(); 1616 // Read the content model. 1617 parseContentspec(name); 1618 1619 skipWhitespace(); 1620 require('>'); 1621 } 1622 1623 /** 1624 * Content specification. 1625 * <pre> 1626 * [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements 1627 * </pre> 1628 */ 1629 private void parseContentspec(String name) 1630 throws Exception 1631 { 1632 // FIXME: move elementDecl() into setElement(), pass EMTPY/ANY ... 
1633 if (tryRead("EMPTY")) 1634 { 1635 setElement(name, CONTENT_EMPTY, null, null); 1636 if (!skippedPE) 1637 { 1638 handler.getDeclHandler().elementDecl(name, "EMPTY"); 1639 } 1640 return; 1641 } 1642 else if (tryRead("ANY")) 1643 { 1644 setElement(name, CONTENT_ANY, null, null); 1645 if (!skippedPE) 1646 { 1647 handler.getDeclHandler().elementDecl(name, "ANY"); 1648 } 1649 return; 1650 } 1651 else 1652 { 1653 String model; 1654 char[] saved; 1655 1656 require('('); 1657 saved = readBuffer; 1658 dataBufferAppend('('); 1659 skipWhitespace(); 1660 if (tryRead("#PCDATA")) 1661 { 1662 dataBufferAppend("#PCDATA"); 1663 parseMixed(saved); 1664 model = dataBufferToString(); 1665 setElement(name, CONTENT_MIXED, model, null); 1666 } 1667 else 1668 { 1669 parseElements(saved); 1670 model = dataBufferToString(); 1671 setElement(name, CONTENT_ELEMENTS, model, null); 1672 } 1673 if (!skippedPE) 1674 { 1675 handler.getDeclHandler().elementDecl(name, model); 1676 } 1677 } 1678 } 1679 1680 /** 1681 * Parse an element-content model. 1682 * <pre> 1683 * [47] elements ::= (choice | seq) ('?' | '*' | '+')? 1684 * [49] choice ::= '(' S? cp (S? '|' S? cp)+ S? ')' 1685 * [50] seq ::= '(' S? cp (S? ',' S? cp)* S? ')' 1686 * </pre> 1687 * 1688 * <p> NOTE: the opening '(' and S have already been read. 1689 * 1690 * @param saved Buffer for entity that should have the terminal ')' 1691 */ 1692 private void parseElements(char[] saved) 1693 throws Exception 1694 { 1695 char c; 1696 char sep; 1697 1698 // Parse the first content particle 1699 skipWhitespace(); 1700 parseCp(); 1701 1702 // Check for end or for a separator. 
1703 skipWhitespace(); 1704 c = readCh(); 1705 switch (c) 1706 { 1707 case ')': 1708 // VC: Proper Group/PE Nesting 1709 if (readBuffer != saved) 1710 { 1711 handler.verror("Illegal Group/PE nesting"); 1712 } 1713 1714 dataBufferAppend(')'); 1715 c = readCh(); 1716 switch (c) 1717 { 1718 case '*': 1719 case '+': 1720 case '?': 1721 dataBufferAppend(c); 1722 break; 1723 default: 1724 unread(c); 1725 } 1726 return; 1727 case ',': // Register the separator. 1728 case '|': 1729 sep = c; 1730 dataBufferAppend(c); 1731 break; 1732 default: 1733 fatal("bad separator in content model", c, null); 1734 return; 1735 } 1736 1737 // Parse the rest of the content model. 1738 while (true) 1739 { 1740 skipWhitespace(); 1741 parseCp(); 1742 skipWhitespace(); 1743 c = readCh(); 1744 if (c == ')') 1745 { 1746 // VC: Proper Group/PE Nesting 1747 if (readBuffer != saved) 1748 { 1749 handler.verror("Illegal Group/PE nesting"); 1750 } 1751 1752 dataBufferAppend(')'); 1753 break; 1754 } 1755 else if (c != sep) 1756 { 1757 fatal("bad separator in content model", c, null); 1758 return; 1759 } 1760 else 1761 { 1762 dataBufferAppend(c); 1763 } 1764 } 1765 1766 // Check for the occurrence indicator. 1767 c = readCh(); 1768 switch (c) 1769 { 1770 case '?': 1771 case '*': 1772 case '+': 1773 dataBufferAppend(c); 1774 return; 1775 default: 1776 unread(c); 1777 return; 1778 } 1779 } 1780 1781 /** 1782 * Parse a content particle. 1783 * <pre> 1784 * [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')? 1785 * </pre> 1786 */ 1787 private void parseCp() 1788 throws Exception 1789 { 1790 if (tryRead('(')) 1791 { 1792 dataBufferAppend('('); 1793 parseElements(readBuffer); 1794 } 1795 else 1796 { 1797 dataBufferAppend(readNmtoken(true)); 1798 char c = readCh(); 1799 switch (c) 1800 { 1801 case '?': 1802 case '*': 1803 case '+': 1804 dataBufferAppend(c); 1805 break; 1806 default: 1807 unread(c); 1808 break; 1809 } 1810 } 1811 } 1812 1813 /** 1814 * Parse mixed content. 
1815 * <pre> 1816 * [51] Mixed ::= '(' S? ( '#PCDATA' (S? '|' S? Name)*) S? ')*' 1817 * | '(' S? ('#PCDATA') S? ')' 1818 * </pre> 1819 * 1820 * @param saved Buffer for entity that should have the terminal ')' 1821 */ 1822 private void parseMixed(char[] saved) 1823 throws Exception 1824 { 1825 // Check for PCDATA alone. 1826 skipWhitespace(); 1827 if (tryRead(')')) 1828 { 1829 // VC: Proper Group/PE Nesting 1830 if (readBuffer != saved) 1831 { 1832 handler.verror("Illegal Group/PE nesting"); 1833 } 1834 1835 dataBufferAppend(")*"); 1836 tryRead('*'); 1837 return; 1838 } 1839 1840 // Parse mixed content. 1841 skipWhitespace(); 1842 while (!tryRead(")")) 1843 { 1844 require('|'); 1845 dataBufferAppend('|'); 1846 skipWhitespace(); 1847 dataBufferAppend(readNmtoken(true)); 1848 skipWhitespace(); 1849 } 1850 1851 // VC: Proper Group/PE Nesting 1852 if (readBuffer != saved) 1853 { 1854 handler.verror("Illegal Group/PE nesting"); 1855 } 1856 1857 require('*'); 1858 dataBufferAppend(")*"); 1859 } 1860 1861 /** 1862 * Parse an attribute list declaration. 1863 * <pre> 1864 * [52] AttlistDecl ::= '<!ATTLIST' S Name AttDef* S? '>' 1865 * </pre> 1866 * <p>NOTE: the '<!ATTLIST' has already been read. 1867 */ 1868 private void parseAttlistDecl() 1869 throws Exception 1870 { 1871 String elementName; 1872 1873 requireWhitespace(); 1874 elementName = readNmtoken(true); 1875 boolean white = tryWhitespace(); 1876 while (!tryRead('>')) 1877 { 1878 if (!white) 1879 { 1880 fatal("whitespace required before attribute definition"); 1881 } 1882 parseAttDef(elementName); 1883 white = tryWhitespace(); 1884 } 1885 } 1886 1887 /** 1888 * Parse a single attribute definition. 1889 * <pre> 1890 * [53] AttDef ::= S Name S AttType S DefaultDecl 1891 * </pre> 1892 */ 1893 private void parseAttDef(String elementName) 1894 throws Exception 1895 { 1896 String name; 1897 String type; 1898 String enumer = null; 1899 1900 // Read the attribute name. 
1901 name = readNmtoken(true); 1902 1903 // Read the attribute type. 1904 requireWhitespace(); 1905 type = readAttType(); 1906 1907 // Get the string of enumerated values if necessary. 1908 if (handler.stringInterning) 1909 { 1910 if ("ENUMERATION" == type || "NOTATION" == type) 1911 { 1912 enumer = dataBufferToString(); 1913 } 1914 } 1915 else 1916 { 1917 if ("ENUMERATION".equals(type) || "NOTATION".equals(type)) 1918 { 1919 enumer = dataBufferToString(); 1920 } 1921 } 1922 1923 // Read the default value. 1924 requireWhitespace(); 1925 parseDefault(elementName, name, type, enumer); 1926 } 1927 1928 /** 1929 * Parse the attribute type. 1930 * <pre> 1931 * [54] AttType ::= StringType | TokenizedType | EnumeratedType 1932 * [55] StringType ::= 'CDATA' 1933 * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY' 1934 * | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS' 1935 * [57] EnumeratedType ::= NotationType | Enumeration 1936 * </pre> 1937 */ 1938 private String readAttType() 1939 throws Exception 1940 { 1941 if (tryRead('(')) 1942 { 1943 parseEnumeration(false); 1944 return "ENUMERATION"; 1945 } 1946 else 1947 { 1948 String typeString = readNmtoken(true); 1949 if (handler.stringInterning) 1950 { 1951 if ("NOTATION" == typeString) 1952 { 1953 parseNotationType(); 1954 return typeString; 1955 } 1956 else if ("CDATA" == typeString 1957 || "ID" == typeString 1958 || "IDREF" == typeString 1959 || "IDREFS" == typeString 1960 || "ENTITY" == typeString 1961 || "ENTITIES" == typeString 1962 || "NMTOKEN" == typeString 1963 || "NMTOKENS" == typeString) 1964 { 1965 return typeString; 1966 } 1967 } 1968 else 1969 { 1970 if ("NOTATION".equals(typeString)) 1971 { 1972 parseNotationType(); 1973 return typeString; 1974 } 1975 else if ("CDATA".equals(typeString) 1976 || "ID".equals(typeString) 1977 || "IDREF".equals(typeString) 1978 || "IDREFS".equals(typeString) 1979 || "ENTITY".equals(typeString) 1980 || "ENTITIES".equals(typeString) 1981 || "NMTOKEN".equals(typeString) 1982 || 
"NMTOKENS".equals(typeString)) 1983 { 1984 return typeString; 1985 } 1986 } 1987 fatal("illegal attribute type", typeString, null); 1988 return null; 1989 } 1990 } 1991 1992 /** 1993 * Parse an enumeration. 1994 * <pre> 1995 * [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')' 1996 * </pre> 1997 * <p>NOTE: the '(' has already been read. 1998 */ 1999 private void parseEnumeration(boolean isNames) 2000 throws Exception 2001 { 2002 dataBufferAppend('('); 2003 2004 // Read the first token. 2005 skipWhitespace(); 2006 dataBufferAppend(readNmtoken(isNames)); 2007 // Read the remaining tokens. 2008 skipWhitespace(); 2009 while (!tryRead(')')) 2010 { 2011 require('|'); 2012 dataBufferAppend('|'); 2013 skipWhitespace(); 2014 dataBufferAppend(readNmtoken (isNames)); 2015 skipWhitespace(); 2016 } 2017 dataBufferAppend(')'); 2018 } 2019 2020 /** 2021 * Parse a notation type for an attribute. 2022 * <pre> 2023 * [58] NotationType ::= 'NOTATION' S '(' S? NameNtoks 2024 * (S? '|' S? name)* S? ')' 2025 * </pre> 2026 * <p>NOTE: the 'NOTATION' has already been read 2027 */ 2028 private void parseNotationType() 2029 throws Exception 2030 { 2031 requireWhitespace(); 2032 require('('); 2033 2034 parseEnumeration(true); 2035 } 2036 2037 /** 2038 * Parse the default value for an attribute. 2039 * <pre> 2040 * [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED' 2041 * | (('#FIXED' S)? AttValue) 2042 * </pre> 2043 */ 2044 private void parseDefault(String elementName, String name, 2045 String type, String enumer) 2046 throws Exception 2047 { 2048 int valueType = ATTRIBUTE_DEFAULT_SPECIFIED; 2049 String value = null; 2050 int flags = LIT_ATTRIBUTE; 2051 boolean saved = expandPE; 2052 String defaultType = null; 2053 2054 // LIT_ATTRIBUTE forces '<' checks now (ASAP) and turns whitespace 2055 // chars to spaces (doesn't matter when that's done if it doesn't 2056 // interfere with char refs expanding to whitespace). 
2057 2058 if (!skippedPE) 2059 { 2060 flags |= LIT_ENTITY_REF; 2061 if (handler.stringInterning) 2062 { 2063 if ("CDATA" != type) 2064 { 2065 flags |= LIT_NORMALIZE; 2066 } 2067 } 2068 else 2069 { 2070 if (!"CDATA".equals(type)) 2071 { 2072 flags |= LIT_NORMALIZE; 2073 } 2074 } 2075 } 2076 2077 expandPE = false; 2078 if (tryRead('#')) 2079 { 2080 if (tryRead("FIXED")) 2081 { 2082 defaultType = "#FIXED"; 2083 valueType = ATTRIBUTE_DEFAULT_FIXED; 2084 requireWhitespace(); 2085 value = readLiteral(flags); 2086 } 2087 else if (tryRead("REQUIRED")) 2088 { 2089 defaultType = "#REQUIRED"; 2090 valueType = ATTRIBUTE_DEFAULT_REQUIRED; 2091 } 2092 else if (tryRead("IMPLIED")) 2093 { 2094 defaultType = "#IMPLIED"; 2095 valueType = ATTRIBUTE_DEFAULT_IMPLIED; 2096 } 2097 else 2098 { 2099 fatal("illegal keyword for attribute default value"); 2100 } 2101 } 2102 else 2103 { 2104 value = readLiteral(flags); 2105 } 2106 expandPE = saved; 2107 setAttribute(elementName, name, type, enumer, value, valueType); 2108 if (handler.stringInterning) 2109 { 2110 if ("ENUMERATION" == type) 2111 { 2112 type = enumer; 2113 } 2114 else if ("NOTATION" == type) 2115 { 2116 type = "NOTATION " + enumer; 2117 } 2118 } 2119 else 2120 { 2121 if ("ENUMERATION".equals(type)) 2122 { 2123 type = enumer; 2124 } 2125 else if ("NOTATION".equals(type)) 2126 { 2127 type = "NOTATION " + enumer; 2128 } 2129 } 2130 if (!skippedPE) 2131 { 2132 handler.getDeclHandler().attributeDecl(elementName, name, type, 2133 defaultType, value); 2134 } 2135 } 2136 2137 /** 2138 * Parse a conditional section. 2139 * <pre> 2140 * [61] conditionalSect ::= includeSect || ignoreSect 2141 * [62] includeSect ::= '<![' S? 'INCLUDE' S? '[' 2142 * extSubsetDecl ']]>' 2143 * [63] ignoreSect ::= '<![' S? 'IGNORE' S? 
'[' 2144 * ignoreSectContents* ']]>' 2145 * [64] ignoreSectContents ::= Ignore 2146 * ('<![' ignoreSectContents* ']]>' Ignore )* 2147 * [65] Ignore ::= Char* - (Char* ( '<![' | ']]>') Char* ) 2148 * </pre> 2149 * <p> NOTE: the '>![' has already been read. 2150 */ 2151 private void parseConditionalSect(char[] saved) 2152 throws Exception 2153 { 2154 skipWhitespace(); 2155 if (tryRead("INCLUDE")) 2156 { 2157 skipWhitespace(); 2158 require('['); 2159 // VC: Proper Conditional Section/PE Nesting 2160 if (readBuffer != saved) 2161 { 2162 handler.verror("Illegal Conditional Section/PE nesting"); 2163 } 2164 skipWhitespace(); 2165 while (!tryRead("]]>")) 2166 { 2167 parseMarkupdecl(); 2168 skipWhitespace(); 2169 } 2170 } 2171 else if (tryRead("IGNORE")) 2172 { 2173 skipWhitespace(); 2174 require('['); 2175 // VC: Proper Conditional Section/PE Nesting 2176 if (readBuffer != saved) 2177 { 2178 handler.verror("Illegal Conditional Section/PE nesting"); 2179 } 2180 char c; 2181 expandPE = false; 2182 for (int nest = 1; nest > 0; ) 2183 { 2184 c = readCh(); 2185 switch (c) 2186 { 2187 case '<': 2188 if (tryRead("![")) 2189 { 2190 nest++; 2191 } 2192 case ']': 2193 if (tryRead("]>")) 2194 { 2195 nest--; 2196 } 2197 } 2198 } 2199 expandPE = true; 2200 } 2201 else 2202 { 2203 fatal("conditional section must begin with INCLUDE or IGNORE"); 2204 } 2205 } 2206 2207 private void parseCharRef() 2208 throws SAXException, IOException 2209 { 2210 parseCharRef(true /* do flushDataBuffer by default */); 2211 } 2212 2213 /** 2214 * Try to read a character reference without consuming data from buffer. 2215 * <pre> 2216 * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' 2217 * </pre> 2218 * <p>NOTE: the '&#' has already been read. 
2219 */ 2220 private void tryReadCharRef() 2221 throws SAXException, IOException 2222 { 2223 int value = 0; 2224 char c; 2225 2226 if (tryRead('x')) 2227 { 2228 loop1: 2229 while (true) 2230 { 2231 c = readCh(); 2232 if (c == ';') 2233 { 2234 break loop1; 2235 } 2236 else 2237 { 2238 int n = Character.digit(c, 16); 2239 if (n == -1) 2240 { 2241 fatal("illegal character in character reference", c, null); 2242 break loop1; 2243 } 2244 value *= 16; 2245 value += n; 2246 } 2247 } 2248 } 2249 else 2250 { 2251 loop2: 2252 while (true) 2253 { 2254 c = readCh(); 2255 if (c == ';') 2256 { 2257 break loop2; 2258 } 2259 else 2260 { 2261 int n = Character.digit(c, 10); 2262 if (n == -1) 2263 { 2264 fatal("illegal character in character reference", c, null); 2265 break loop2; 2266 } 2267 value *= 10; 2268 value += n; 2269 } 2270 } 2271 } 2272 2273 // check for character refs being legal XML 2274 if ((value < 0x0020 2275 && ! (value == '\n' || value == '\t' || value == '\r')) 2276 || (value >= 0xD800 && value <= 0xDFFF) 2277 || value == 0xFFFE || value == 0xFFFF 2278 || value > 0x0010ffff) 2279 { 2280 fatal("illegal XML character reference U+" 2281 + Integer.toHexString(value)); 2282 } 2283 else if (value >= 0x007F && value <= 0x009F) // 2006-11-13 hsivonen 2284 { 2285 handler.warn("Character reference expands to a control character: U+00" + Integer.toHexString(c) + "."); 2286 } 2287 if (isPrivateUse(value)) 2288 { 2289 warnAboutPrivateUseChar(); 2290 } 2291 // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz 2292 // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz: 2293 if (value > 0x0010ffff) 2294 { 2295 // too big for surrogate 2296 fatal("character reference " + value + " is too large for UTF-16", 2297 new Integer(value).toString(), null); 2298 } 2299 2300 } 2301 2302 /** 2303 * Read and interpret a character reference. 2304 * <pre> 2305 * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';' 2306 * </pre> 2307 * <p>NOTE: the '&#' has already been read. 
2308 */ 2309 private void parseCharRef(boolean doFlush) 2310 throws SAXException, IOException 2311 { 2312 int value = 0; 2313 char c; 2314 2315 if (tryRead('x')) 2316 { 2317 loop1: 2318 while (true) 2319 { 2320 c = readCh(); 2321 if (c == ';') 2322 { 2323 break loop1; 2324 } 2325 else 2326 { 2327 int n = Character.digit(c, 16); 2328 if (n == -1) 2329 { 2330 fatal("illegal character in character reference", c, null); 2331 break loop1; 2332 } 2333 value *= 16; 2334 value += n; 2335 } 2336 } 2337 } 2338 else 2339 { 2340 loop2: 2341 while (true) 2342 { 2343 c = readCh(); 2344 if (c == ';') 2345 { 2346 break loop2; 2347 } 2348 else 2349 { 2350 int n = Character.digit(c, 10); 2351 if (n == -1) 2352 { 2353 fatal("illegal character in character reference", c, null); 2354 break loop2; 2355 } 2356 value *= 10; 2357 value += c - '0'; 2358 } 2359 } 2360 } 2361 2362 // check for character refs being legal XML 2363 if ((value < 0x0020 2364 && ! (value == '\n' || value == '\t' || value == '\r')) 2365 || (value >= 0xD800 && value <= 0xDFFF) 2366 || value == 0xFFFE || value == 0xFFFF 2367 || value > 0x0010ffff) 2368 { 2369 fatal("illegal XML character reference U+" 2370 + Integer.toHexString(value)); 2371 } 2372 else if (value >= 0x007F && value <= 0x009F) // 2006-11-13 hsivonen 2373 { 2374 handler.warn("Character reference expands to a control character: U+00" + Integer.toHexString(c) + "."); 2375 } 2376 if (isPrivateUse(value)) 2377 { 2378 warnAboutPrivateUseChar(); 2379 } 2380 2381 // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz 2382 // (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz: 2383 if (value <= 0x0000ffff) 2384 { 2385 // no surrogates needed 2386 dataBufferAppend((char) value); 2387 } 2388 else if (value <= 0x0010ffff) 2389 { 2390 value -= 0x10000; 2391 // > 16 bits, surrogate needed 2392 dataBufferAppend((char) (0xd800 | (value >> 10))); 2393 dataBufferAppend((char) (0xdc00 | (value & 0x0003ff))); 2394 } 2395 else 2396 { 2397 // too big for surrogate 2398 
fatal("character reference " + value + " is too large for UTF-16", 2399 new Integer(value).toString(), null); 2400 } 2401 if (doFlush) 2402 { 2403 dataBufferFlush(); 2404 } 2405 } 2406 2407 /** 2408 * Parse and expand an entity reference. 2409 * <pre> 2410 * [68] EntityRef ::= '&' Name ';' 2411 * </pre> 2412 * <p>NOTE: the '&' has already been read. 2413 * @param externalAllowed External entities are allowed here. 2414 */ 2415 private void parseEntityRef(boolean externalAllowed) 2416 throws SAXException, IOException 2417 { 2418 String name; 2419 2420 name = readNmtoken(true); 2421 require(';'); 2422 switch (getEntityType(name)) 2423 { 2424 case ENTITY_UNDECLARED: 2425 // NOTE: XML REC describes amazingly convoluted handling for 2426 // this case. Nothing as meaningful as being a WFness error 2427 // unless the processor might _legitimately_ not have seen a 2428 // declaration ... which is what this implements. 2429 String message; 2430 2431 message = "reference to undeclared general entity " + name; 2432 if (skippedPE && !docIsStandalone) 2433 { 2434 handler.verror(message); 2435 // we don't know this entity, and it might be external... 
2436 if (externalAllowed) 2437 { 2438 handler.skippedEntity(name); 2439 } 2440 } 2441 else 2442 { 2443 fatal(message); 2444 } 2445 break; 2446 case ENTITY_INTERNAL: 2447 pushString(name, getEntityValue(name)); 2448 2449 //workaround for possible input pop before marking 2450 //the buffer reading position 2451 char t = readCh(); 2452 unread(t); 2453 int bufferPosMark = readBufferPos; 2454 2455 int end = readBufferPos + getEntityValue(name).length(); 2456 for (int k = readBufferPos; k < end; k++) 2457 { 2458 t = readCh(); 2459 if (t == '&') 2460 { 2461 t = readCh(); 2462 if (t == '#') 2463 { 2464 //try to match a character ref 2465 tryReadCharRef(); 2466 2467 //everything has been read 2468 if (readBufferPos >= end) 2469 { 2470 break; 2471 } 2472 k = readBufferPos; 2473 continue; 2474 } 2475 else if (Character.isLetter(t)) 2476 { 2477 //looks like an entity ref 2478 unread(t); 2479 readNmtoken(true); 2480 require(';'); 2481 2482 //everything has been read 2483 if (readBufferPos >= end) 2484 { 2485 break; 2486 } 2487 k = readBufferPos; 2488 continue; 2489 } 2490 fatal(" malformed entity reference"); 2491 } 2492 2493 } 2494 readBufferPos = bufferPosMark; 2495 break; 2496 case ENTITY_TEXT: 2497 if (externalAllowed) 2498 { 2499 pushURL(false, name, getEntityIds(name), 2500 null, null, null, true); 2501 } 2502 else 2503 { 2504 fatal("reference to external entity in attribute value.", 2505 name, null); 2506 } 2507 break; 2508 case ENTITY_NDATA: 2509 if (externalAllowed) 2510 { 2511 fatal("unparsed entity reference in content", name, null); 2512 } 2513 else 2514 { 2515 fatal("reference to external entity in attribute value.", 2516 name, null); 2517 } 2518 break; 2519 default: 2520 throw new RuntimeException(); 2521 } 2522 } 2523 2524 /** 2525 * Parse and expand a parameter entity reference. 2526 * <pre> 2527 * [69] PEReference ::= '%' Name ';' 2528 * </pre> 2529 * <p>NOTE: the '%' has already been read. 
2530 */ 2531 private void parsePEReference() 2532 throws SAXException, IOException 2533 { 2534 String name; 2535 2536 name = "%" + readNmtoken(true); 2537 require(';'); 2538 switch (getEntityType(name)) 2539 { 2540 case ENTITY_UNDECLARED: 2541 // VC: Entity Declared 2542 handler.verror("reference to undeclared parameter entity " + name); 2543 2544 // we should disable handling of all subsequent declarations 2545 // unless this is a standalone document (info discarded) 2546 break; 2547 case ENTITY_INTERNAL: 2548 if (inLiteral) 2549 { 2550 pushString(name, getEntityValue(name)); 2551 } 2552 else 2553 { 2554 pushString(name, ' ' + getEntityValue(name) + ' '); 2555 } 2556 break; 2557 case ENTITY_TEXT: 2558 if (!inLiteral) 2559 { 2560 pushString(null, " "); 2561 } 2562 pushURL(true, name, getEntityIds(name), null, null, null, true); 2563 if (!inLiteral) 2564 { 2565 pushString(null, " "); 2566 } 2567 break; 2568 } 2569 } 2570 2571 /** 2572 * Parse an entity declaration. 2573 * <pre> 2574 * [70] EntityDecl ::= GEDecl | PEDecl 2575 * [71] GEDecl ::= '<!ENTITY' S Name S EntityDef S? '>' 2576 * [72] PEDecl ::= '<!ENTITY' S '%' S Name S PEDef S? '>' 2577 * [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?) 2578 * [74] PEDef ::= EntityValue | ExternalID 2579 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral 2580 * | 'PUBLIC' S PubidLiteral S SystemLiteral 2581 * [76] NDataDecl ::= S 'NDATA' S Name 2582 * </pre> 2583 * <p>NOTE: the '<!ENTITY' has already been read. 2584 */ 2585 private void parseEntityDecl() 2586 throws Exception 2587 { 2588 boolean peFlag = false; 2589 int flags = 0; 2590 2591 // Check for a parameter entity. 2592 expandPE = false; 2593 requireWhitespace(); 2594 if (tryRead('%')) 2595 { 2596 peFlag = true; 2597 requireWhitespace(); 2598 } 2599 expandPE = true; 2600 2601 // Read the entity name, and prepend 2602 // '%' if necessary. 
2603 String name = readNmtoken(true); 2604 //NE08 2605 if (name.indexOf(':') >= 0) 2606 { 2607 fatal("Illegal character(':') in entity name ", name, null); 2608 } 2609 if (peFlag) 2610 { 2611 name = "%" + name; 2612 } 2613 2614 // Read the entity value. 2615 requireWhitespace(); 2616 char c = readCh(); 2617 unread (c); 2618 if (c == '"' || c == '\'') 2619 { 2620 // Internal entity ... replacement text has expanded refs 2621 // to characters and PEs, but not to general entities 2622 String value = readLiteral(flags); 2623 setInternalEntity(name, value); 2624 } 2625 else 2626 { 2627 // Read the external IDs 2628 ExternalIdentifiers ids = readExternalIds(false, false); 2629 2630 // Check for NDATA declaration. 2631 boolean white = tryWhitespace(); 2632 if (!peFlag && tryRead("NDATA")) 2633 { 2634 if (!white) 2635 { 2636 fatal("whitespace required before NDATA"); 2637 } 2638 requireWhitespace(); 2639 String notationName = readNmtoken(true); 2640 if (!skippedPE) 2641 { 2642 setExternalEntity(name, ENTITY_NDATA, ids, notationName); 2643 handler.unparsedEntityDecl(name, ids.publicId, ids.systemId, 2644 ids.baseUri, notationName); 2645 } 2646 } 2647 else if (!skippedPE) 2648 { 2649 setExternalEntity(name, ENTITY_TEXT, ids, null); 2650 handler.getDeclHandler() 2651 .externalEntityDecl(name, ids.publicId, 2652 handler.resolveURIs() 2653 // FIXME: ASSUMES not skipped 2654 // "false" forces error on bad URI 2655 ? handler.absolutize(ids.baseUri, 2656 ids.systemId, 2657 false) 2658 : ids.systemId); 2659 } 2660 } 2661 2662 // Finish the declaration. 2663 skipWhitespace(); 2664 require('>'); 2665 } 2666 2667 /** 2668 * Parse a notation declaration. 2669 * <pre> 2670 * [82] NotationDecl ::= '<!NOTATION' S Name S 2671 * (ExternalID | PublicID) S? '>' 2672 * [83] PublicID ::= 'PUBLIC' S PubidLiteral 2673 * </pre> 2674 * <P>NOTE: the '<!NOTATION' has already been read. 
2675 */ 2676 private void parseNotationDecl() 2677 throws Exception 2678 { 2679 String nname; 2680 ExternalIdentifiers ids; 2681 2682 requireWhitespace(); 2683 nname = readNmtoken(true); 2684 //NE08 2685 if (nname.indexOf(':') >= 0) 2686 { 2687 fatal("Illegal character(':') in notation name ", nname, null); 2688 } 2689 requireWhitespace(); 2690 2691 // Read the external identifiers. 2692 ids = readExternalIds(true, false); 2693 2694 // Register the notation. 2695 setNotation(nname, ids); 2696 2697 skipWhitespace(); 2698 require('>'); 2699 } 2700 2701 /** 2702 * Parse character data. 2703 * <pre> 2704 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*) 2705 * </pre> 2706 */ 2707 private void parseCharData() 2708 throws Exception 2709 { 2710 char c; 2711 int state = 0; 2712 boolean pureWhite = false; 2713 2714 // assert (dataBufferPos == 0); 2715 2716 // are we expecting pure whitespace? it might be dirty... 2717 if ((currentElementContent == CONTENT_ELEMENTS) && !isDirtyCurrentElement) 2718 { 2719 pureWhite = true; 2720 } 2721 2722 // always report right out of readBuffer 2723 // to minimize (pointless) buffer copies 2724 while (true) 2725 { 2726 int lineAugment = 0; 2727 int columnAugment = 0; 2728 int i; 2729 2730 loop: 2731 for (i = readBufferPos; i < readBufferLength; i++) 2732 { 2733 switch (c = readBuffer[i]) 2734 { 2735 case '\n': 2736 lineAugment++; 2737 columnAugment = 0; 2738 // pureWhite unmodified 2739 break; 2740 case '\r': // should not happen!! 
2741 case '\t': 2742 case ' ': 2743 // pureWhite unmodified 2744 columnAugment++; 2745 break; 2746 case '&': 2747 case '<': 2748 columnAugment++; 2749 // pureWhite unmodified 2750 // CLEAN end of text sequence 2751 state = 1; 2752 break loop; 2753 case ']': 2754 // that's not a whitespace char, and 2755 // can not terminate pure whitespace either 2756 pureWhite = false; 2757 if ((i + 2) < readBufferLength) 2758 { 2759 if (readBuffer [i + 1] == ']' 2760 && readBuffer [i + 2] == '>') 2761 { 2762 // ERROR end of text sequence 2763 state = 2; 2764 break loop; 2765 } 2766 } 2767 else 2768 { 2769 // FIXME missing two end-of-buffer cases 2770 } 2771 columnAugment++; 2772 break; 2773 default: 2774 if ((c < 0x0020 || c > 0xFFFD) 2775 || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085) 2776 && xmlVersion == XML_11)) 2777 { 2778 fatal("illegal XML character U+" 2779 + Integer.toHexString(c)); 2780 } 2781 else if (c >= '\u007F' && c <= '\u009F') // 2006-04-25 hsivonen 2782 { 2783 handler.warn("Saw a control character: U+00" + Integer.toHexString(c) + "."); 2784 } 2785 // that's not a whitespace char 2786 pureWhite = false; 2787 columnAugment++; 2788 } 2789 } 2790 2791 // report text thus far 2792 if (lineAugment > 0) 2793 { 2794 line += lineAugment; 2795 column = columnAugment; 2796 } 2797 else 2798 { 2799 column += columnAugment; 2800 } 2801 2802 // report characters/whitspace 2803 int length = i - readBufferPos; 2804 2805 if (length != 0) 2806 { 2807 if (pureWhite) 2808 { 2809 handler.ignorableWhitespace(readBuffer, 2810 readBufferPos, length); 2811 } 2812 else 2813 { 2814 handler.charData(readBuffer, readBufferPos, length); 2815 } 2816 readBufferPos = i; 2817 } 2818 2819 if (state != 0) 2820 { 2821 break; 2822 } 2823 2824 // fill next buffer from this entity, or 2825 // pop stack and continue with previous entity 2826 unread(readCh()); 2827 } 2828 if (!pureWhite) 2829 { 2830 isDirtyCurrentElement = true; 2831 } 2832 // finish, maybe with error 2833 if (state != 1) // 
finish, no error 2834 { 2835 fatal("character data may not contain ']]>'"); 2836 } 2837 } 2838 2839 ////////////////////////////////////////////////////////////////////// 2840 // High-level reading and scanning methods. 2841 ////////////////////////////////////////////////////////////////////// 2842 2843 /** 2844 * Require whitespace characters. 2845 */ 2846 private void requireWhitespace() 2847 throws SAXException, IOException 2848 { 2849 char c = readCh(); 2850 if (isWhitespace(c)) 2851 { 2852 skipWhitespace(); 2853 } 2854 else 2855 { 2856 fatal("whitespace required", c, null); 2857 } 2858 } 2859 2860 /** 2861 * Skip whitespace characters. 2862 * <pre> 2863 * [3] S ::= (#x20 | #x9 | #xd | #xa)+ 2864 * </pre> 2865 */ 2866 private void skipWhitespace() 2867 throws SAXException, IOException 2868 { 2869 // Start with a little cheat. Most of 2870 // the time, the white space will fall 2871 // within the current read buffer; if 2872 // not, then fall through. 2873 if (USE_CHEATS) 2874 { 2875 int lineAugment = 0; 2876 int columnAugment = 0; 2877 2878 loop: 2879 for (int i = readBufferPos; i < readBufferLength; i++) 2880 { 2881 switch (readBuffer[i]) 2882 { 2883 case ' ': 2884 case '\t': 2885 case '\r': 2886 columnAugment++; 2887 break; 2888 case '\n': 2889 lineAugment++; 2890 columnAugment = 0; 2891 break; 2892 case '%': 2893 if (expandPE) 2894 { 2895 break loop; 2896 } 2897 // else fall through... 2898 default: 2899 readBufferPos = i; 2900 if (lineAugment > 0) 2901 { 2902 line += lineAugment; 2903 column = columnAugment; 2904 } 2905 else 2906 { 2907 column += columnAugment; 2908 } 2909 return; 2910 } 2911 } 2912 } 2913 2914 // OK, do it the slow way. 2915 char c = readCh (); 2916 while (isWhitespace(c)) 2917 { 2918 c = readCh(); 2919 } 2920 unread(c); 2921 } 2922 2923 /** 2924 * Read a name or (when parsing an enumeration) name token. 
2925 * <pre> 2926 * [5] Name ::= (Letter | '_' | ':') (NameChar)* 2927 * [7] Nmtoken ::= (NameChar)+ 2928 * </pre> 2929 */ 2930 private String readNmtoken(boolean isName) 2931 throws SAXException, IOException 2932 { 2933 char c; 2934 2935 if (USE_CHEATS) 2936 { 2937 loop: 2938 for (int i = readBufferPos; i < readBufferLength; i++) 2939 { 2940 c = readBuffer[i]; 2941 switch (c) 2942 { 2943 case '%': 2944 if (expandPE) 2945 { 2946 break loop; 2947 } 2948 // else fall through... 2949 2950 // What may legitimately come AFTER a name/nmtoken? 2951 case '<': case '>': case '&': 2952 case ',': case '|': case '*': case '+': case '?': 2953 case ')': 2954 case '=': 2955 case '\'': case '"': 2956 case '[': 2957 case ' ': case '\t': case '\r': case '\n': 2958 case ';': 2959 case '/': 2960 int start = readBufferPos; 2961 if (i == start) 2962 { 2963 fatal("name expected", readBuffer[i], null); 2964 } 2965 readBufferPos = i; 2966 return intern(readBuffer, start, i - start); 2967 2968 default: 2969 // FIXME ... per IBM's OASIS test submission, these: 2970 // ? 
U+06dd 2971 // Combining U+309B 2972 //these switches are kind of ugly but at least we won't 2973 //have to go over the whole lits for each char 2974 if (isName && i == readBufferPos) 2975 { 2976 char c2 = (char) (c & 0x00f0); 2977 switch (c & 0xff00) 2978 { 2979 //starting with 01 2980 case 0x0100: 2981 switch (c2) 2982 { 2983 case 0x0030: 2984 if (c == 0x0132 || c == 0x0133 || c == 0x013f) 2985 { 2986 fatal("Not a name start character, U+" 2987 + Integer.toHexString(c)); 2988 } 2989 break; 2990 case 0x0040: 2991 if (c == 0x0140 || c == 0x0149) 2992 { 2993 fatal("Not a name start character, U+" 2994 + Integer.toHexString(c)); 2995 } 2996 break; 2997 case 0x00c0: 2998 if (c == 0x01c4 || c == 0x01cc) 2999 { 3000 fatal("Not a name start character, U+" 3001 + Integer.toHexString(c)); 3002 } 3003 break; 3004 case 0x00f0: 3005 if (c == 0x01f1 || c == 0x01f3) 3006 { 3007 fatal("Not a name start character, U+" 3008 + Integer.toHexString(c)); 3009 } 3010 break; 3011 case 0x00b0: 3012 if (c == 0x01f1 || c == 0x01f3) 3013 { 3014 fatal("Not a name start character, U+" 3015 + Integer.toHexString(c)); 3016 } 3017 break; 3018 default: 3019 if (c == 0x017f) 3020 { 3021 fatal("Not a name start character, U+" 3022 + Integer.toHexString(c)); 3023 } 3024 } 3025 3026 break; 3027 //starting with 11 3028 case 0x1100: 3029 switch (c2) 3030 { 3031 case 0x0000: 3032 if (c == 0x1104 || c == 0x1108 || 3033 c == 0x110a || c == 0x110d) 3034 { 3035 fatal("Not a name start character, U+" 3036 + Integer.toHexString(c)); 3037 } 3038 break; 3039 case 0x0030: 3040 if (c == 0x113b || c == 0x113f) 3041 { 3042 fatal("Not a name start character, U+" 3043 + Integer.toHexString(c)); 3044 } 3045 break; 3046 case 0x0040: 3047 if (c == 0x1141 || c == 0x114d 3048 || c == 0x114f ) 3049 { 3050 fatal("Not a name start character, U+" 3051 + Integer.toHexString(c)); 3052 } 3053 break; 3054 case 0x0050: 3055 if (c == 0x1151 || c == 0x1156) 3056 { 3057 fatal("Not a name start character, U+" 3058 + 
Integer.toHexString(c)); 3059 } 3060 break; 3061 case 0x0060: 3062 if (c == 0x1162 || c == 0x1164 3063 || c == 0x1166 || c == 0x116b 3064 || c == 0x116f) 3065 { 3066 fatal("Not a name start character, U+" 3067 + Integer.toHexString(c)); 3068 } 3069 break; 3070 case 0x00b0: 3071 if (c == 0x11b6 || c == 0x11b9 3072 || c == 0x11bb || c == 0x116f) 3073 { 3074 fatal("Not a name start character, U+" 3075 + Integer.toHexString(c)); 3076 } 3077 break; 3078 default: 3079 if (c == 0x1174 || c == 0x119f 3080 || c == 0x11ac || c == 0x11c3 3081 || c == 0x11f1) 3082 { 3083 fatal("Not a name start character, U+" 3084 + Integer.toHexString(c)); 3085 } 3086 } 3087 break; 3088 default: 3089 if (c == 0x0e46 || c == 0x1011 3090 || c == 0x212f || c == 0x0587 3091 || c == 0x0230 ) 3092 { 3093 fatal("Not a name start character, U+" 3094 + Integer.toHexString(c)); 3095 } 3096 } 3097 } 3098 // punt on exact tests from Appendix A; approximate 3099 // them using the Unicode ID start/part rules 3100 if (i == readBufferPos && isName) 3101 { 3102 if (!Character.isUnicodeIdentifierStart(c) 3103 && c != ':' && c != '_') 3104 { 3105 fatal("Not a name start character, U+" 3106 + Integer.toHexString(c)); 3107 } 3108 } 3109 else if (!Character.isUnicodeIdentifierPart(c) 3110 && c != '-' && c != ':' && c != '_' && c != '.' 3111 && !isExtender(c)) 3112 { 3113 fatal("Not a name character, U+" 3114 + Integer.toHexString(c)); 3115 } 3116 } 3117 } 3118 } 3119 3120 nameBufferPos = 0; 3121 3122 // Read the first character. 
3123 loop: 3124 while (true) 3125 { 3126 c = readCh(); 3127 switch (c) 3128 { 3129 case '%': 3130 case '<': case '>': case '&': 3131 case ',': case '|': case '*': case '+': case '?': 3132 case ')': 3133 case '=': 3134 case '\'': case '"': 3135 case '[': 3136 case ' ': case '\t': case '\n': case '\r': 3137 case ';': 3138 case '/': 3139 unread(c); 3140 if (nameBufferPos == 0) 3141 { 3142 fatal ("name expected"); 3143 } 3144 // punt on exact tests from Appendix A, but approximate them 3145 if (isName 3146 && !Character.isUnicodeIdentifierStart(nameBuffer[0]) 3147 && ":_".indexOf(nameBuffer[0]) == -1) 3148 { 3149 fatal("Not a name start character, U+" 3150 + Integer.toHexString(nameBuffer[0])); 3151 } 3152 String s = intern(nameBuffer, 0, nameBufferPos); 3153 nameBufferPos = 0; 3154 return s; 3155 default: 3156 // punt on exact tests from Appendix A, but approximate them 3157 3158 if ((nameBufferPos != 0 || !isName) 3159 && !Character.isUnicodeIdentifierPart(c) 3160 && ":-_.".indexOf(c) == -1 3161 && !isExtender(c)) 3162 { 3163 fatal("Not a name character, U+" 3164 + Integer.toHexString(c)); 3165 } 3166 if (nameBufferPos >= nameBuffer.length) 3167 { 3168 nameBuffer = 3169 (char[]) extendArray(nameBuffer, 3170 nameBuffer.length, nameBufferPos); 3171 } 3172 nameBuffer[nameBufferPos++] = c; 3173 } 3174 } 3175 } 3176 3177 private static boolean isExtender(char c) 3178 { 3179 // [88] Extender ::= ... 3180 return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387 3181 || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005 3182 || (c >= 0x3031 && c <= 0x3035) 3183 || (c >= 0x309d && c <= 0x309e) 3184 || (c >= 0x30fc && c <= 0x30fe); 3185 } 3186 3187 /** 3188 * Read a literal. With matching single or double quotes as 3189 * delimiters (and not embedded!) this is used to parse: 3190 * <pre> 3191 * [9] EntityValue ::= ... ([^%&] | PEReference | Reference)* ... 3192 * [10] AttValue ::= ... ([^<&] | Reference)* ... 3193 * [11] SystemLiteral ::= ... (URLchar - "'")* ... 
3194 * [12] PubidLiteral ::= ... (PubidChar - "'")* ... 3195 * </pre> 3196 * as well as the quoted strings in XML and text declarations 3197 * (for version, encoding, and standalone) which have their 3198 * own constraints. 3199 */ 3200 private String readLiteral(int flags) 3201 throws SAXException, IOException 3202 { 3203 char delim, c; 3204 int startLine = line; 3205 boolean saved = expandPE; 3206 boolean savedReport = doReport; 3207 3208 // Find the first delimiter. 3209 delim = readCh(); 3210 if (delim != '"' && delim != '\'') 3211 { 3212 fatal("expected '\"' or \"'\"", delim, null); 3213 return null; 3214 } 3215 inLiteral = true; 3216 if ((flags & LIT_DISABLE_PE) != 0) 3217 { 3218 expandPE = false; 3219 } 3220 doReport = false; 3221 3222 // Each level of input source has its own buffer; remember 3223 // ours, so we won't read the ending delimiter from any 3224 // other input source, regardless of entity processing. 3225 char[] ourBuf = readBuffer; 3226 3227 // Read the literal. 3228 try 3229 { 3230 c = readCh(); 3231 loop: 3232 while (! (c == delim && readBuffer == ourBuf)) 3233 { 3234 switch (c) 3235 { 3236 // attributes and public ids are normalized 3237 // in almost the same ways 3238 case '\n': 3239 case '\r': 3240 if ((flags & (LIT_ATTRIBUTE | LIT_PUBID)) != 0) 3241 { 3242 c = ' '; 3243 } 3244 break; 3245 case '\t': 3246 if ((flags & LIT_ATTRIBUTE) != 0) 3247 { 3248 c = ' '; 3249 } 3250 break; 3251 case '&': 3252 c = readCh(); 3253 // Char refs are expanded immediately, except for 3254 // all the cases where it's deferred. 3255 if (c == '#') 3256 { 3257 if ((flags & LIT_DISABLE_CREF) != 0) 3258 { 3259 dataBufferAppend('&'); 3260 break; 3261 } 3262 parseCharRef(false /* Do not do flushDataBuffer */); 3263 3264 // exotic WFness risk: this is an entity literal, 3265 // dataBuffer [dataBufferPos - 1] == '&', and 3266 // following chars are a _partial_ entity/char ref 3267 3268 // It looks like an entity ref ... 
3269 } 3270 else 3271 { 3272 unread(c); 3273 // Expand it? 3274 if ((flags & LIT_ENTITY_REF) > 0) 3275 { 3276 parseEntityRef(false); 3277 //Is it just data? 3278 } 3279 else if ((flags & LIT_DISABLE_EREF) != 0) 3280 { 3281 dataBufferAppend('&'); 3282 3283 // OK, it will be an entity ref -- expanded later. 3284 } 3285 else 3286 { 3287 String name = readNmtoken(true); 3288 require(';'); 3289 dataBufferAppend('&'); 3290 dataBufferAppend(name); 3291 dataBufferAppend(';'); 3292 } 3293 } 3294 c = readCh(); 3295 continue loop; 3296 3297 case '<': 3298 // and why? Perhaps so "&foo;" expands the same 3299 // inside and outside an attribute? 3300 if ((flags & LIT_ATTRIBUTE) != 0) 3301 { 3302 fatal("attribute values may not contain '<'"); 3303 } 3304 break; 3305 3306 // We don't worry about case '%' and PE refs, readCh does. 3307 3308 default: 3309 break; 3310 } 3311 dataBufferAppend(c); 3312 c = readCh(); 3313 } 3314 } 3315 catch (EOFException e) 3316 { 3317 fatal("end of input while looking for delimiter (started on line " 3318 + startLine + ')', null, new Character(delim).toString()); 3319 } 3320 inLiteral = false; 3321 expandPE = saved; 3322 doReport = savedReport; 3323 3324 // Normalise whitespace if necessary. 3325 if ((flags & LIT_NORMALIZE) > 0) 3326 { 3327 dataBufferNormalize(); 3328 } 3329 3330 // Return the value. 3331 return dataBufferToString(); 3332 } 3333 3334 /** 3335 * Try reading external identifiers. 3336 * A system identifier is not required for notations. 3337 * @param inNotation Are we parsing a notation decl? 3338 * @param isSubset Parsing external subset decl (may be omitted)? 3339 * @return A three-member String array containing the identifiers, 3340 * or nulls. Order: public, system, baseURI. 
3341 */ 3342 private ExternalIdentifiers readExternalIds(boolean inNotation, 3343 boolean isSubset) 3344 throws Exception 3345 { 3346 char c; 3347 ExternalIdentifiers ids = new ExternalIdentifiers(); 3348 int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF; 3349 3350 if (tryRead("PUBLIC")) 3351 { 3352 requireWhitespace(); 3353 ids.publicId = readLiteral(LIT_NORMALIZE | LIT_PUBID | flags); 3354 if (inNotation) 3355 { 3356 skipWhitespace(); 3357 c = readCh(); 3358 unread(c); 3359 if (c == '"' || c == '\'') 3360 { 3361 ids.systemId = readLiteral(flags); 3362 } 3363 } 3364 else 3365 { 3366 requireWhitespace(); 3367 ids.systemId = readLiteral(flags); 3368 } 3369 3370 for (int i = 0; i < ids.publicId.length(); i++) 3371 { 3372 c = ids.publicId.charAt(i); 3373 if (c >= 'a' && c <= 'z') 3374 { 3375 continue; 3376 } 3377 if (c >= 'A' && c <= 'Z') 3378 { 3379 continue; 3380 } 3381 if (" \r\n0123456789-' ()+,./:=?;!*#@$_%".indexOf(c) != -1) 3382 { 3383 continue; 3384 } 3385 fatal("illegal PUBLIC id character U+" 3386 + Integer.toHexString(c)); 3387 } 3388 } 3389 else if (tryRead("SYSTEM")) 3390 { 3391 requireWhitespace(); 3392 ids.systemId = readLiteral(flags); 3393 } 3394 else if (!isSubset) 3395 { 3396 fatal("missing SYSTEM or PUBLIC keyword"); 3397 } 3398 3399 if (ids.systemId != null) 3400 { 3401 if (ids.systemId.indexOf('#') != -1) 3402 { 3403 handler.verror("SYSTEM id has a URI fragment: " + ids.systemId); 3404 } 3405 ids.baseUri = handler.getSystemId(); 3406 if (ids.baseUri == null && uriWarnings) 3407 { 3408 handler.warn("No base URI; hope URI is absolute: " 3409 + ids.systemId); 3410 } 3411 } 3412 3413 return ids; 3414 } 3415 3416 /** 3417 * Test if a character is whitespace. 3418 * <pre> 3419 * [3] S ::= (#x20 | #x9 | #xd | #xa)+ 3420 * </pre> 3421 * @param c The character to test. 3422 * @return true if the character is whitespace. 
3423 */ 3424 private final boolean isWhitespace(char c) 3425 { 3426 if (c > 0x20) 3427 { 3428 return false; 3429 } 3430 if (c == 0x20 || c == 0x0a || c == 0x09 || c == 0x0d) 3431 { 3432 return true; 3433 } 3434 return false; // illegal ... 3435 } 3436 3437 ////////////////////////////////////////////////////////////////////// 3438 // Utility routines. 3439 ////////////////////////////////////////////////////////////////////// 3440 3441 /** 3442 * Add a character to the data buffer. 3443 */ 3444 private void dataBufferAppend(char c) 3445 { 3446 // Expand buffer if necessary. 3447 if (dataBufferPos >= dataBuffer.length) 3448 { 3449 dataBuffer = (char[]) extendArray(dataBuffer, 3450 dataBuffer.length, dataBufferPos); 3451 } 3452 dataBuffer[dataBufferPos++] = c; 3453 } 3454 3455 /** 3456 * Add a string to the data buffer. 3457 */ 3458 private void dataBufferAppend(String s) 3459 { 3460 dataBufferAppend(s.toCharArray(), 0, s.length()); 3461 } 3462 3463 /** 3464 * Append (part of) a character array to the data buffer. 3465 */ 3466 private void dataBufferAppend(char[] ch, int start, int length) 3467 { 3468 dataBuffer = (char[]) extendArray(dataBuffer, dataBuffer.length, 3469 dataBufferPos + length); 3470 3471 System.arraycopy(ch, start, dataBuffer, dataBufferPos, length); 3472 dataBufferPos += length; 3473 } 3474 3475 /** 3476 * Normalise space characters in the data buffer. 3477 */ 3478 private void dataBufferNormalize() 3479 { 3480 int i = 0; 3481 int j = 0; 3482 int end = dataBufferPos; 3483 3484 // Skip spaces at the start. 3485 while (j < end && dataBuffer[j] == ' ') 3486 { 3487 j++; 3488 } 3489 3490 // Skip whitespace at the end. 3491 while (end > j && dataBuffer[end - 1] == ' ') 3492 { 3493 end --; 3494 } 3495 3496 // Start copying to the left. 3497 while (j < end) 3498 { 3499 3500 char c = dataBuffer[j++]; 3501 3502 // Normalise all other spaces to 3503 // a single space. 
3504 if (c == ' ') 3505 { 3506 while (j < end && dataBuffer[j++] == ' ') 3507 { 3508 continue; 3509 } 3510 dataBuffer[i++] = ' '; 3511 dataBuffer[i++] = dataBuffer[j - 1]; 3512 } 3513 else 3514 { 3515 dataBuffer[i++] = c; 3516 } 3517 } 3518 3519 // The new length is <= the old one. 3520 dataBufferPos = i; 3521 } 3522 3523 /** 3524 * Convert the data buffer to a string. 3525 */ 3526 private String dataBufferToString() 3527 { 3528 String s = new String(dataBuffer, 0, dataBufferPos); 3529 dataBufferPos = 0; 3530 return s; 3531 } 3532 3533 /** 3534 * Flush the contents of the data buffer to the handler, as 3535 * appropriate, and reset the buffer for new input. 3536 */ 3537 private void dataBufferFlush() 3538 throws SAXException 3539 { 3540 if (currentElementContent == CONTENT_ELEMENTS 3541 && dataBufferPos > 0 3542 && !inCDATA) 3543 { 3544 // We can't just trust the buffer to be whitespace, there 3545 // are (error) cases when it isn't 3546 for (int i = 0; i < dataBufferPos; i++) 3547 { 3548 if (!isWhitespace(dataBuffer[i])) 3549 { 3550 handler.charData(dataBuffer, 0, dataBufferPos); 3551 dataBufferPos = 0; 3552 } 3553 } 3554 if (dataBufferPos > 0) 3555 { 3556 handler.ignorableWhitespace(dataBuffer, 0, dataBufferPos); 3557 dataBufferPos = 0; 3558 } 3559 } 3560 else if (dataBufferPos > 0) 3561 { 3562 handler.charData(dataBuffer, 0, dataBufferPos); 3563 dataBufferPos = 0; 3564 } 3565 } 3566 3567 /** 3568 * Require a string to appear, or throw an exception. 3569 * <p><em>Precondition:</em> Entity expansion is not required. 3570 * <p><em>Precondition:</em> data buffer has no characters that 3571 * will get sent to the application. 
3572 */ 3573 private void require(String delim) 3574 throws SAXException, IOException 3575 { 3576 int length = delim.length(); 3577 char[] ch; 3578 3579 if (length < dataBuffer.length) 3580 { 3581 ch = dataBuffer; 3582 delim.getChars(0, length, ch, 0); 3583 } 3584 else 3585 { 3586 ch = delim.toCharArray(); 3587 } 3588 3589 if (USE_CHEATS && length <= (readBufferLength - readBufferPos)) 3590 { 3591 int offset = readBufferPos; 3592 3593 for (int i = 0; i < length; i++, offset++) 3594 { 3595 if (ch[i] != readBuffer[offset]) 3596 { 3597 fatal ("required string", null, delim); 3598 } 3599 } 3600 readBufferPos = offset; 3601 3602 } 3603 else 3604 { 3605 for (int i = 0; i < length; i++) 3606 { 3607 require(ch[i]); 3608 } 3609 } 3610 } 3611 3612 /** 3613 * Require a character to appear, or throw an exception. 3614 */ 3615 private void require(char delim) 3616 throws SAXException, IOException 3617 { 3618 char c = readCh(); 3619 3620 if (c != delim) 3621 { 3622 fatal("required character", c, new Character(delim).toString()); 3623 } 3624 } 3625 3626 /** 3627 * Create an interned string from a character array. 3628 * Ælfred uses this method to create an interned version 3629 * of all names and name tokens, so that it can test equality 3630 * with <code>==</code> instead of <code>String.equals ()</code>. 3631 * 3632 * <p>This is much more efficient than constructing a non-interned 3633 * string first, and then interning it. 3634 * 3635 * @param ch an array of characters for building the string. 3636 * @param start the starting position in the array. 3637 * @param length the number of characters to place in the string. 3638 * @return an interned string. 3639 * @see #intern (String) 3640 * @see java.lang.String#intern 3641 */ 3642 public String intern(char[] ch, int start, int length) 3643 { 3644 int index = 0; 3645 int hash = 0; 3646 Object[] bucket; 3647 3648 // Generate a hash code. This is a widely used string hash, 3649 // often attributed to Brian Kernighan. 
3650 for (int i = start; i < start + length; i++) 3651 { 3652 hash = 31 * hash + ch[i]; 3653 } 3654 hash = (hash & 0x7fffffff) % SYMBOL_TABLE_LENGTH; 3655 3656 // Get the bucket -- consists of {array,String} pairs 3657 if ((bucket = symbolTable[hash]) == null) 3658 { 3659 // first string in this bucket 3660 bucket = new Object[8]; 3661 3662 // Search for a matching tuple, and 3663 // return the string if we find one. 3664 } 3665 else 3666 { 3667 while (index < bucket.length) 3668 { 3669 char[] chFound = (char[]) bucket[index]; 3670 3671 // Stop when we hit an empty entry. 3672 if (chFound == null) 3673 { 3674 break; 3675 } 3676 3677 // If they're the same length, check for a match. 3678 if (chFound.length == length) 3679 { 3680 for (int i = 0; i < chFound.length; i++) 3681 { 3682 // continue search on failure 3683 if (ch[start + i] != chFound[i]) 3684 { 3685 break; 3686 } 3687 else if (i == length - 1) 3688 { 3689 // That's it, we have a match! 3690 return (String) bucket[index + 1]; 3691 } 3692 } 3693 } 3694 index += 2; 3695 } 3696 // Not found -- we'll have to add it. 3697 3698 // Do we have to grow the bucket? 3699 bucket = (Object[]) extendArray(bucket, bucket.length, index); 3700 } 3701 symbolTable[hash] = bucket; 3702 3703 // OK, add it to the end of the bucket -- "local" interning. 3704 // Intern "globally" to let applications share interning benefits. 3705 // That is, "!=" and "==" work on our strings, not just equals(). 3706 String s = new String(ch, start, length).intern(); 3707 bucket[index] = s.toCharArray(); 3708 bucket[index + 1] = s; 3709 return s; 3710 } 3711 3712 /** 3713 * Ensure the capacity of an array, allocating a new one if 3714 * necessary. Usually extends only for name hash collisions. 
3715 */ 3716 private Object extendArray(Object array, int currentSize, int requiredSize) 3717 { 3718 if (requiredSize < currentSize) 3719 { 3720 return array; 3721 } 3722 else 3723 { 3724 Object newArray = null; 3725 int newSize = currentSize * 2; 3726 3727 if (newSize <= requiredSize) 3728 { 3729 newSize = requiredSize + 1; 3730 } 3731 3732 if (array instanceof char[]) 3733 { 3734 newArray = new char[newSize]; 3735 } 3736 else if (array instanceof Object[]) 3737 { 3738 newArray = new Object[newSize]; 3739 } 3740 else 3741 { 3742 throw new RuntimeException(); 3743 } 3744 3745 System.arraycopy(array, 0, newArray, 0, currentSize); 3746 return newArray; 3747 } 3748 } 3749 3750 ////////////////////////////////////////////////////////////////////// 3751 // XML query routines. 3752 ////////////////////////////////////////////////////////////////////// 3753 3754 boolean isStandalone() 3755 { 3756 return docIsStandalone; 3757 } 3758 3759 // 3760 // Elements 3761 // 3762 3763 private int getContentType(ElementDecl element, int defaultType) 3764 { 3765 int retval; 3766 3767 if (element == null) 3768 { 3769 return defaultType; 3770 } 3771 retval = element.contentType; 3772 if (retval == CONTENT_UNDECLARED) 3773 { 3774 retval = defaultType; 3775 } 3776 return retval; 3777 } 3778 3779 /** 3780 * Look up the content type of an element. 3781 * @param name The element type name. 3782 * @return An integer constant representing the content type. 3783 * @see #CONTENT_UNDECLARED 3784 * @see #CONTENT_ANY 3785 * @see #CONTENT_EMPTY 3786 * @see #CONTENT_MIXED 3787 * @see #CONTENT_ELEMENTS 3788 */ 3789 public int getElementContentType(String name) 3790 { 3791 ElementDecl element = elementInfo.get(name); 3792 return getContentType(element, CONTENT_UNDECLARED); 3793 } 3794 3795 /** 3796 * Register an element. 
3797 * Array format: 3798 * [0] element type name 3799 * [1] content model (mixed, elements only) 3800 * [2] attribute hash table 3801 */ 3802 private void setElement(String name, int contentType, 3803 String contentModel, HashMap<String, AttributeDecl> attributes) 3804 throws SAXException 3805 { 3806 if (skippedPE) 3807 { 3808 return; 3809 } 3810 3811 ElementDecl element = elementInfo.get(name); 3812 3813 // first <!ELEMENT ...> or <!ATTLIST ...> for this type? 3814 if (element == null) 3815 { 3816 element = new ElementDecl(); 3817 element.contentType = contentType; 3818 element.contentModel = contentModel; 3819 element.attributes = attributes; 3820 elementInfo.put(name, element); 3821 return; 3822 } 3823 3824 // <!ELEMENT ...> declaration? 3825 if (contentType != CONTENT_UNDECLARED) 3826 { 3827 // ... following an associated <!ATTLIST ...> 3828 if (element.contentType == CONTENT_UNDECLARED) 3829 { 3830 element.contentType = contentType; 3831 element.contentModel = contentModel; 3832 } 3833 else 3834 { 3835 // VC: Unique Element Type Declaration 3836 handler.verror("multiple declarations for element type: " 3837 + name); 3838 } 3839 } 3840 3841 // first <!ATTLIST ...>, before <!ELEMENT ...> ? 3842 else if (attributes != null) 3843 { 3844 element.attributes = attributes; 3845 } 3846 } 3847 3848 /** 3849 * Look up the attribute hash table for an element. 3850 * The hash table is the second item in the element array. 3851 */ 3852 private HashMap<String, AttributeDecl> getElementAttributes(String name) 3853 { 3854 ElementDecl element = elementInfo.get(name); 3855 return (element == null) ? null : element.attributes; 3856 } 3857 3858 // 3859 // Attributes 3860 // 3861 3862 /** 3863 * Get the declared attributes for an element type. 3864 * @param elname The name of the element type. 3865 * @return An iterator over all the attributes declared for 3866 * a specific element type. The results will be valid only 3867 * after the DTD (if any) has been parsed. 
3868 * @see #getAttributeType 3869 * @see #getAttributeEnumeration 3870 * @see #getAttributeDefaultValueType 3871 * @see #getAttributeDefaultValue 3872 * @see #getAttributeExpandedValue 3873 */ 3874 private Iterator<String> declaredAttributes(ElementDecl element) 3875 { 3876 HashMap<String, AttributeDecl> attlist; 3877 3878 if (element == null) 3879 { 3880 return null; 3881 } 3882 if ((attlist = element.attributes) == null) 3883 { 3884 return null; 3885 } 3886 return attlist.keySet().iterator(); 3887 } 3888 3889 /** 3890 * Get the declared attributes for an element type. 3891 * @param elname The name of the element type. 3892 * @return An iterator over all the attributes declared for 3893 * a specific element type. The results will be valid only 3894 * after the DTD (if any) has been parsed. 3895 * @see #getAttributeType 3896 * @see #getAttributeEnumeration 3897 * @see #getAttributeDefaultValueType 3898 * @see #getAttributeDefaultValue 3899 * @see #getAttributeExpandedValue 3900 */ 3901 public Iterator<String> declaredAttributes(String elname) 3902 { 3903 return declaredAttributes(elementInfo.get(elname)); 3904 } 3905 3906 /** 3907 * Retrieve the declared type of an attribute. 3908 * @param name The name of the associated element. 3909 * @param aname The name of the attribute. 3910 * @return An interend string denoting the type, or null 3911 * indicating an undeclared attribute. 3912 */ 3913 public String getAttributeType(String name, String aname) 3914 { 3915 AttributeDecl attribute = getAttribute(name, aname); 3916 return (attribute == null) ? null : attribute.type; 3917 } 3918 3919 /** 3920 * Retrieve the allowed values for an enumerated attribute type. 3921 * @param name The name of the associated element. 3922 * @param aname The name of the attribute. 3923 * @return A string containing the token list. 
3924 */ 3925 public String getAttributeEnumeration(String name, String aname) 3926 { 3927 AttributeDecl attribute = getAttribute(name, aname); 3928 // assert: attribute.enumeration is "ENUMERATION" or "NOTATION" 3929 return (attribute == null) ? null : attribute.enumeration; 3930 } 3931 3932 /** 3933 * Retrieve the default value of a declared attribute. 3934 * @param name The name of the associated element. 3935 * @param aname The name of the attribute. 3936 * @return The default value, or null if the attribute was 3937 * #IMPLIED or simply undeclared and unspecified. 3938 * @see #getAttributeExpandedValue 3939 */ 3940 public String getAttributeDefaultValue(String name, String aname) 3941 { 3942 AttributeDecl attribute = getAttribute(name, aname); 3943 return (attribute == null) ? null : attribute.value; 3944 } 3945 3946 /* 3947 3948 // FIXME: Leaving this in, until W3C finally resolves the confusion 3949 // between parts of the XML 2nd REC about when entity declararations 3950 // are guaranteed to be known. Current code matches what section 5.1 3951 // (conformance) describes, but some readings of the self-contradicting 3952 // text in 4.1 (the "Entity Declared" WFC and VC) seem to expect that 3953 // attribute expansion/normalization must be deferred in some cases 3954 // (just TRY to identify them!). 3955 3956 * Retrieve the expanded value of a declared attribute. 3957 * <p>General entities (and char refs) will be expanded (once). 3958 * @param name The name of the associated element. 3959 * @param aname The name of the attribute. 
3960 * @return The expanded default value, or null if the attribute was 3961 * #IMPLIED or simply undeclared 3962 * @see #getAttributeDefaultValue 3963 public String getAttributeExpandedValue (String name, String aname) 3964 throws Exception 3965 { 3966 AttributeDecl attribute = getAttribute (name, aname); 3967 3968 if (attribute == null) { 3969 return null; 3970 } else if (attribute.defaultValue == null && attribute.value != null) { 3971 // we MUST use the same buf for both quotes else the literal 3972 // can't be properly terminated 3973 char buf [] = new char [1]; 3974 int flags = LIT_ENTITY_REF | LIT_ATTRIBUTE; 3975 String type = getAttributeType (name, aname); 3976 3977 if (type != "CDATA" && type != null) 3978 flags |= LIT_NORMALIZE; 3979 buf [0] = '"'; 3980 pushCharArray (null, buf, 0, 1); 3981 pushString (null, attribute.value); 3982 pushCharArray (null, buf, 0, 1); 3983 attribute.defaultValue = readLiteral (flags); 3984 } 3985 return attribute.defaultValue; 3986 } 3987 */ 3988 3989 /** 3990 * Retrieve the default value mode of a declared attribute. 3991 * @see #ATTRIBUTE_DEFAULT_SPECIFIED 3992 * @see #ATTRIBUTE_DEFAULT_IMPLIED 3993 * @see #ATTRIBUTE_DEFAULT_REQUIRED 3994 * @see #ATTRIBUTE_DEFAULT_FIXED 3995 */ 3996 public int getAttributeDefaultValueType(String name, String aname) 3997 { 3998 AttributeDecl attribute = getAttribute(name, aname); 3999 return (attribute == null) ? ATTRIBUTE_DEFAULT_UNDECLARED : 4000 attribute.valueType; 4001 } 4002 4003 /** 4004 * Register an attribute declaration for later retrieval. 
4005 * Format: 4006 * - String type 4007 * - String default value 4008 * - int value type 4009 * - enumeration 4010 * - processed default value 4011 */ 4012 private void setAttribute(String elName, String name, String type, 4013 String enumeration, String value, int valueType) 4014 throws Exception 4015 { 4016 HashMap<String, AttributeDecl> attlist; 4017 4018 if (skippedPE) 4019 { 4020 return; 4021 } 4022 4023 // Create a new hashtable if necessary. 4024 attlist = getElementAttributes(elName); 4025 if (attlist == null) 4026 { 4027 attlist = new HashMap<String, AttributeDecl>(); 4028 } 4029 4030 // ignore multiple attribute declarations! 4031 if (attlist.get(name) != null) 4032 { 4033 // warn ... 4034 return; 4035 } 4036 else 4037 { 4038 AttributeDecl attribute = new AttributeDecl(); 4039 attribute.type = type; 4040 attribute.value = value; 4041 attribute.valueType = valueType; 4042 attribute.enumeration = enumeration; 4043 attlist.put(name, attribute); 4044 4045 // save; but don't overwrite any existing <!ELEMENT ...> 4046 setElement(elName, CONTENT_UNDECLARED, null, attlist); 4047 } 4048 } 4049 4050 /** 4051 * Retrieve the attribute declaration for the given element name and name. 4052 */ 4053 private AttributeDecl getAttribute(String elName, String name) 4054 { 4055 HashMap<String, AttributeDecl> attlist = getElementAttributes(elName); 4056 return (attlist == null) ? null : attlist.get(name); 4057 } 4058 4059 // 4060 // Entities 4061 // 4062 4063 /** 4064 * Find the type of an entity. 4065 * @returns An integer constant representing the entity type. 4066 * @see #ENTITY_UNDECLARED 4067 * @see #ENTITY_INTERNAL 4068 * @see #ENTITY_NDATA 4069 * @see #ENTITY_TEXT 4070 */ 4071 public int getEntityType(String ename) 4072 { 4073 EntityInfo entity = entityInfo.get(ename); 4074 return (entity == null) ? ENTITY_UNDECLARED : entity.type; 4075 } 4076 4077 /** 4078 * Return an external entity's identifiers. 4079 * @param ename The name of the external entity. 
4080 * @return The entity's public identifier, system identifier, and base URI. 4081 * Null if the entity was not declared as an external entity. 4082 * @see #getEntityType 4083 */ 4084 public ExternalIdentifiers getEntityIds(String ename) 4085 { 4086 EntityInfo entity = entityInfo.get(ename); 4087 return (entity == null) ? null : entity.ids; 4088 } 4089 4090 /** 4091 * Return an internal entity's replacement text. 4092 * @param ename The name of the internal entity. 4093 * @return The entity's replacement text, or null if 4094 * the entity was not declared as an internal entity. 4095 * @see #getEntityType 4096 */ 4097 public String getEntityValue(String ename) 4098 { 4099 EntityInfo entity = entityInfo.get(ename); 4100 return (entity == null) ? null : entity.value; 4101 } 4102 4103 /** 4104 * Register an entity declaration for later retrieval. 4105 */ 4106 private void setInternalEntity(String eName, String value) 4107 throws SAXException 4108 { 4109 if (skippedPE) 4110 { 4111 return; 4112 } 4113 4114 if (entityInfo.get(eName) == null) 4115 { 4116 EntityInfo entity = new EntityInfo(); 4117 entity.type = ENTITY_INTERNAL; 4118 entity.value = value; 4119 entityInfo.put(eName, entity); 4120 } 4121 if (handler.stringInterning) 4122 { 4123 if ("lt" == eName || "gt" == eName || "quot" == eName 4124 || "apos" == eName || "amp" == eName) 4125 { 4126 return; 4127 } 4128 } 4129 else 4130 { 4131 if ("lt".equals(eName) || "gt".equals(eName) || "quot".equals(eName) 4132 || "apos".equals(eName) || "amp".equals(eName)) 4133 { 4134 return; 4135 } 4136 } 4137 handler.getDeclHandler().internalEntityDecl(eName, value); 4138 } 4139 4140 /** 4141 * Register an external entity declaration for later retrieval. 
4142 */ 4143 private void setExternalEntity(String eName, int eClass, 4144 ExternalIdentifiers ids, String nName) 4145 { 4146 if (entityInfo.get(eName) == null) 4147 { 4148 EntityInfo entity = new EntityInfo(); 4149 entity.type = eClass; 4150 entity.ids = ids; 4151 entity.notationName = nName; 4152 entityInfo.put(eName, entity); 4153 } 4154 } 4155 4156 // 4157 // Notations. 4158 // 4159 4160 /** 4161 * Report a notation declaration, checking for duplicates. 4162 */ 4163 private void setNotation(String nname, ExternalIdentifiers ids) 4164 throws SAXException 4165 { 4166 if (skippedPE) 4167 { 4168 return; 4169 } 4170 4171 handler.notationDecl(nname, ids.publicId, ids.systemId, ids.baseUri); 4172 if (notationInfo.get(nname) == null) 4173 { 4174 notationInfo.put(nname, nname); 4175 } 4176 else 4177 { 4178 // VC: Unique Notation Name 4179 handler.verror("Duplicate notation name decl: " + nname); 4180 } 4181 } 4182 4183 // 4184 // Location. 4185 // 4186 4187 /** 4188 * Return the current line number. 4189 */ 4190 public int getLineNumber() 4191 { 4192 return line; 4193 } 4194 4195 /** 4196 * Return the current column number. 4197 */ 4198 public int getColumnNumber() 4199 { 4200 return column; 4201 } 4202 4203 ////////////////////////////////////////////////////////////////////// 4204 // High-level I/O. 4205 ////////////////////////////////////////////////////////////////////// 4206 4207 /** 4208 * Read a single character from the readBuffer. 4209 * <p>The readDataChunk () method maintains the buffer. 4210 * <p>If we hit the end of an entity, try to pop the stack and 4211 * keep going. 4212 * <p> (This approach doesn't really enforce XML's rules about 4213 * entity boundaries, but this is not currently a validating 4214 * parser). 4215 * <p>This routine also attempts to keep track of the current 4216 * position in external entities, but it's not entirely accurate. 4217 * @return The next available input character. 
4218 * @see #unread (char) 4219 * @see #readDataChunk 4220 * @see #readBuffer 4221 * @see #line 4222 * @return The next character from the current input source. 4223 */ 4224 private char readCh() 4225 throws SAXException, IOException 4226 { 4227 // As long as there's nothing in the 4228 // read buffer, try reading more data 4229 // (for an external entity) or popping 4230 // the entity stack (for either). 4231 while (readBufferPos >= readBufferLength) 4232 { 4233 switch (sourceType) 4234 { 4235 case INPUT_READER: 4236 readDataChunk(); 4237 while (readBufferLength < 1) 4238 { 4239 popInput(); 4240 if (readBufferLength < 1) 4241 { 4242 readDataChunk(); 4243 } 4244 } 4245 break; 4246 4247 default: 4248 4249 popInput(); 4250 break; 4251 } 4252 } 4253 4254 char c = readBuffer[readBufferPos++]; 4255 4256 // copied from fi.iki.hsivonen.htmlparser 4257 if ((c & 0xFC00) == 0xDC00) { 4258 // Got a low surrogate. See if prev was high surrogate 4259 if ((prev & 0xFC00) == 0xD800) { 4260 int intVal = (prev << 10) + c + SURROGATE_OFFSET; 4261 if (isNonCharacter(intVal)) { 4262 handler.warn("Astral non-character."); 4263 } 4264 if (isAstralPrivateUse(intVal)) { 4265 warnAboutPrivateUseChar(); 4266 } 4267 } else { 4268 fatal("Unmatched low surrogate."); 4269 } 4270 prev = c; 4271 } else { 4272 // see if there was a lone high surrogate 4273 if ((prev & 0xFC00) == 0xD800) { 4274 fatal("Unmatched high surrogate."); 4275 } 4276 } 4277 4278 if (c == '\n') 4279 { 4280 line++; 4281 column = 0; 4282 } 4283 else 4284 { 4285 if (c == '<') 4286 { 4287 /* the most common return to parseContent () ... 
NOP */ 4288 } 4289 else if (((c < 0x0020 && (c != '\t') && (c != '\r')) || c > 0xFFFD) 4290 || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085) 4291 && xmlVersion == XML_11)) 4292 { 4293 fatal("illegal XML character U+" + Integer.toHexString(c)); 4294 } 4295 else if (c >= '\u007F' && c <= '\u009F') // 2006-04-25 hsivonen 4296 { 4297 handler.warn("Saw a control character: U+00" + Integer.toHexString(c) + "."); 4298 } 4299 4300 if (isPrivateUse(c)) 4301 { 4302 warnAboutPrivateUseChar(); 4303 } 4304 // If we're in the DTD and in a context where PEs get expanded, 4305 // do so ... 1/14/2000 errata identify those contexts. There 4306 // are also spots in the internal subset where PE refs are fatal 4307 // errors, hence yet another flag. 4308 else if (c == '%' && expandPE) 4309 { 4310 if (peIsError) 4311 { 4312 fatal("PE reference within decl in internal subset."); 4313 } 4314 parsePEReference(); 4315 return readCh(); 4316 } 4317 column++; 4318 } 4319 4320 return c; 4321 } 4322 4323 /** 4324 * Push a single character back onto the current input stream. 4325 * <p>This method usually pushes the character back onto 4326 * the readBuffer. 4327 * <p>I don't think that this would ever be called with 4328 * readBufferPos = 0, because the methods always reads a character 4329 * before unreading it, but just in case, I've added a boundary 4330 * condition. 4331 * @param c The character to push back. 4332 * @see #readCh 4333 * @see #unread (char[]) 4334 * @see #readBuffer 4335 */ 4336 private void unread(char c) 4337 throws SAXException 4338 { 4339 // Normal condition. 4340 if (c == '\n') 4341 { 4342 line--; 4343 column = -1; 4344 } 4345 if (readBufferPos > 0) 4346 { 4347 readBuffer[--readBufferPos] = c; 4348 } 4349 else 4350 { 4351 pushString(null, new Character(c).toString()); 4352 } 4353 } 4354 4355 /** 4356 * Push a char array back onto the current input stream. 
4357 * <p>NOTE: you must <em>never</em> push back characters that you 4358 * haven't actually read: use pushString () instead. 4359 * @see #readCh 4360 * @see #unread (char) 4361 * @see #readBuffer 4362 * @see #pushString 4363 */ 4364 private void unread(char[] ch, int length) 4365 throws SAXException 4366 { 4367 for (int i = 0; i < length; i++) 4368 { 4369 if (ch[i] == '\n') 4370 { 4371 line--; 4372 column = -1; 4373 } 4374 } 4375 if (length < readBufferPos) 4376 { 4377 readBufferPos -= length; 4378 } 4379 else 4380 { 4381 pushCharArray(null, ch, 0, length); 4382 } 4383 } 4384 4385 /** 4386 * Push, or skip, a new external input source. 4387 * The source will be some kind of parsed entity, such as a PE 4388 * (including the external DTD subset) or content for the body. 4389 * 4390 * @param url The java.net.URL object for the entity. 4391 * @see SAXDriver#resolveEntity 4392 * @see #pushString 4393 * @see #sourceType 4394 * @see #pushInput 4395 * @see #detectEncoding 4396 * @see #sourceType 4397 * @see #readBuffer 4398 */ 4399 private void pushURL(boolean isPE, 4400 String ename, 4401 ExternalIdentifiers ids, 4402 Reader aReader, 4403 InputStream aStream, 4404 String aEncoding, 4405 boolean doResolve) 4406 throws SAXException, IOException 4407 { 4408 // removed boolean ignoreEncoding -- 2006-02-03 hsivonen 4409 String systemId; 4410 InputSource source; 4411 InputSource scratch = new InputSource(); 4412 4413 if (!isPE) 4414 { 4415 dataBufferFlush(); 4416 } 4417 4418 scratch.setPublicId(ids.publicId); 4419 scratch.setSystemId(ids.systemId); 4420 4421 // See if we should skip or substitute the entity. 4422 // If we're not skipping, resolving reports startEntity() 4423 // and updates the (handler's) stack of URIs. 
4424 if (doResolve) 4425 { 4426 // assert (stream == null && reader == null && encoding == null) 4427 source = handler.resolveEntity(isPE, ename, scratch, ids.baseUri); 4428 if (source == null) 4429 { 4430 handler.warn("skipping entity: " + ename); 4431 handler.skippedEntity(ename); 4432 if (isPE) 4433 { 4434 skippedPE = true; 4435 } 4436 return; 4437 } 4438 4439 // we might be using alternate IDs/encoding 4440 systemId = source.getSystemId(); 4441 // The following warning and setting systemId was deleted bcause 4442 // the application has the option of not setting systemId 4443 // provided that it has set the characte/byte stream. 4444 /* 4445 if (systemId == null) { 4446 handler.warn ("missing system ID, using " + ids.systemId); 4447 systemId = ids.systemId; 4448 } 4449 */ 4450 } 4451 else 4452 { 4453 // "[document]", or "[dtd]" via getExternalSubset() 4454 scratch.setCharacterStream(aReader); 4455 scratch.setByteStream(aStream); 4456 scratch.setEncoding(aEncoding); 4457 source = scratch; 4458 systemId = ids.systemId; 4459 if (handler.stringInterning) 4460 { 4461 handler.startExternalEntity(ename, systemId, 4462 "[document]" == ename); 4463 } 4464 else 4465 { 4466 handler.startExternalEntity(ename, systemId, 4467 "[document]".equals(ename)); 4468 } 4469 } 4470 4471 // Push the existing status. 4472 pushInput(ename); 4473 4474 // Create a new read buffer. 4475 // (Note the four-character margin) 4476 readBuffer = new char[READ_BUFFER_MAX + 4]; 4477 readBufferPos = 0; 4478 readBufferLength = 0; 4479 readBufferOverflow = -1; 4480 is = null; 4481 reader = null; 4482 line = 1; 4483 column = 0; 4484 currentByteCount = 0; 4485 4486 // If there's an explicit character stream, just 4487 // ignore encoding declarations. 
4488 if (source.getCharacterStream() != null) 4489 { 4490 sourceType = INPUT_READER; 4491 this.reader = source.getCharacterStream(); 4492 // swallow UTF-8 BOM -- 2006-02-03 hsivonen 4493 if ("UTF-8".equalsIgnoreCase(source.getEncoding())) 4494 { 4495 char bom = readCh(); 4496 if (bom != '\uFEFF') { 4497 unread(bom); 4498 } 4499 } 4500 tryEncodingDecl(source.getEncoding() == null ? "" : source.getEncoding()); 4501 return; 4502 } 4503 4504 // Else we handle the conversion, and need to ensure 4505 // it's done right. 4506 if (source.getByteStream() != null) 4507 { 4508 is = source.getByteStream(); 4509 } 4510 else 4511 { 4512 // Stop -- 2006-11-10 hsivonen 4513 fatal("The entity resolver didn't properly resolve the entity."); 4514 } 4515 4516 // If we get to here, there must be 4517 // an InputStream available. 4518 if (!is.markSupported()) 4519 { 4520 is = new BufferedInputStream(is); 4521 } 4522 4523 // Zapped bogus external encoding label code -- 2006-11-10 hsivonen 4524 4525 // if we got an external encoding label, use it ... 4526 if (source.getEncoding() != null) 4527 { 4528 draconianInputStreamReader(source.getEncoding(), is, false); 4529 if ("UTF-8".equalsIgnoreCase(source.getEncoding())) 4530 { 4531 char bom = readCh(); 4532 if (bom != '\uFEFF') { 4533 unread(bom); 4534 } 4535 } 4536 tryEncodingDecl(source.getEncoding()); 4537 // ... else autodetect from first bytes. 4538 } 4539 else 4540 { 4541 detectEncoding(); 4542 // Read any XML or text declaration. 4543 String enc = tryEncodingDecl(null); 4544 if (enc == null && "UTF-32" == characterEncoding) 4545 { 4546 fatal("UTF-32 was sniffed from the BOM, but there was no matching encoding declaration. The omission of explicit encoding declaration is only allowed with UTF-8 and UTF-16."); 4547 } 4548 } 4549 } 4550 4551 /** 4552 * Check for an encoding declaration. 
This is the second part of the 4553 * XML encoding autodetection algorithm, relying on detectEncoding to 4554 * get to the point that this part can read any encoding declaration 4555 * in the document (using only US-ASCII characters). 4556 * 4557 * <p> Because this part starts to fill parser buffers with this data, 4558 * it's tricky to setup a reader so that Java's built-in decoders can be 4559 * used for the character encodings that aren't built in to this parser 4560 * (such as EUC-JP, KOI8-R, Big5, etc). 4561 * 4562 * @return any encoding in the declaration, uppercased; or null 4563 * @see detectEncoding 4564 */ 4565 private String tryEncodingDecl(String encoding) 4566 throws SAXException, IOException 4567 { 4568 // Read the XML/text declaration. 4569 if (tryRead("<?xml")) 4570 { 4571 if (tryWhitespace()) 4572 { 4573 if (inputStack.size() > 0) 4574 { 4575 return parseTextDecl(encoding); 4576 } 4577 else 4578 { 4579 return parseXMLDecl(encoding); 4580 } 4581 } 4582 else 4583 { 4584 // <?xml-stylesheet ...?> or similar 4585 unread('l'); 4586 unread('m'); 4587 unread('x'); 4588 unread('?'); 4589 unread('<'); 4590 } 4591 } 4592 // 2006-02-03 hsivonen 4593 warnAboutLackOfEncodingDecl(encoding); 4594 return null; 4595 } 4596 4597 /** 4598 * @param characterEncoding 4599 * @throws SAXException 4600 */ 4601 private void warnAboutLackOfEncodingDecl(String encoding) throws SAXException { 4602 if (!(encoding == null || "".equals(encoding) 4603 || "UTF-8".equalsIgnoreCase(encoding) || "UTF-16".equalsIgnoreCase(encoding))) 4604 { 4605 handler.warn( 4606 "External encoding information specified a non-UTF-8/non-UTF-16 encoding (" + encoding + "), but there was no matching internal encoding declaration. The well-formedness status of this document may change when decoupled from the external encoding information."); 4607 } 4608 } 4609 4610 /** 4611 * Attempt to detect the encoding of an entity. 
4612 * <p>The trick here (as suggested in the XML standard) is that 4613 * any entity not in UTF-8, or in UCS-2 with a byte-order mark, 4614 * <b>must</b> begin with an XML declaration or an encoding 4615 * declaration; we simply have to look for "<?xml" in various 4616 * encodings. 4617 * <p>This method has no way to distinguish among 8-bit encodings. 4618 * Instead, it sets up for UTF-8, then (possibly) revises its assumption 4619 * later in setupDecoding (). Any ASCII-derived 8-bit encoding 4620 * should work, but most will be rejected later by setupDecoding (). 4621 * @see #tryEncoding (byte[], byte, byte, byte, byte) 4622 * @see #tryEncoding (byte[], byte, byte) 4623 * @see #setupDecoding 4624 */ 4625 private void detectEncoding() 4626 throws SAXException, IOException 4627 { 4628 byte[] signature = new byte[4]; 4629 4630 // Read the first four bytes for 4631 // autodetection. 4632 is.mark(4); 4633 is.read(signature); 4634 is.reset(); 4635 4636 // 4637 // FIRST: four byte encodings (who uses these?) 4638 // 4639 if (tryEncoding(signature, (byte) 0x00, (byte) 0x00, 4640 (byte) 0x00, (byte) 0x3c)) 4641 { 4642 // UCS-4 must begin with "<?xml" 4643 // 0x00 0x00 0x00 0x3c: UCS-4, big-endian (1234) 4644 // "UTF-32BE" 4645 draconianInputStreamReader("UTF-32BE", is, false); 4646 } 4647 else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00, 4648 (byte) 0x00, (byte) 0x00)) 4649 { 4650 // 0x3c 0x00 0x00 0x00: UCS-4, little-endian (4321) 4651 // "UTF-32LE" 4652 draconianInputStreamReader("UTF-32LE", is, false); 4653 } 4654 else if (tryEncoding(signature, (byte) 0x00, (byte) 0x00, 4655 (byte) 0x3c, (byte) 0x00)) 4656 { 4657 // 0x00 0x00 0x3c 0x00: UCS-4, unusual (2143) 4658 fatal("Unsupported 32-bit encoding. 
(XML processors are only required to support UTF-8 and UTF-16.)"); // 2006-02-03 hsivonen 4659 } 4660 else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c, 4661 (byte) 0x00, (byte) 0x00)) 4662 { 4663 // 0x00 0x3c 0x00 0x00: UCS-4, unusual (3421) 4664 fatal("Unsupported 32-bit encoding. (XML processors are only required to support UTF-8 and UTF-16.)"); // 2006-02-03 hsivonen 4665 } 4666 else if (tryEncoding(signature, (byte) 0x00, (byte) 0x00, 4667 (byte) 0xfe, (byte) 0xff)) 4668 { 4669 // 00 00 fe ff UCS_4_1234 (with BOM) 4670 is.read(); is.read(); is.read(); is.read(); 4671 draconianInputStreamReader("UTF-32BE", is, false, "UTF-32"); 4672 } 4673 else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c, 4674 (byte) 0x00, (byte) 0x00)) 4675 { 4676 // ff fe 00 00 UCS_4_4321 (with BOM) 4677 is.read(); is.read(); is.read(); is.read(); 4678 draconianInputStreamReader("UTF-32LE", is, false, "UTF-32"); 4679 } 4680 // SECOND: two byte encodings 4681 // note ... with 1/14/2000 errata the XML spec identifies some 4682 // more "broken UTF-16" autodetection cases, with no XML decl, 4683 // which we don't handle here (that's legal too). 4684 // 4685 else if (tryEncoding(signature, (byte) 0xfe, (byte) 0xff)) 4686 { 4687 // UCS-2 with a byte-order marker. (UTF-16) 4688 // 0xfe 0xff: UCS-2, big-endian (12) 4689 is.read(); is.read(); 4690 draconianInputStreamReader("UTF-16BE", is, false, "UTF-16"); 4691 } 4692 else if (tryEncoding(signature, (byte) 0xff, (byte) 0xfe)) 4693 { 4694 // UCS-2 with a byte-order marker. 
(UTF-16) 4695 // 0xff 0xfe: UCS-2, little-endian (21) 4696 is.read(); is.read(); 4697 draconianInputStreamReader("UTF-16LE", is, false, "UTF-16"); 4698 } 4699 else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c, 4700 (byte) 0x00, (byte) 0x3f)) 4701 { 4702 // UTF-16BE (otherwise, malformed UTF-16) 4703 // 0x00 0x3c 0x00 0x3f: UCS-2, big-endian, no byte-order mark 4704 fatal("no byte-order mark for UTF-16 entity"); // s/UCS-2/UTF-16/ -- 2006-02-03 hsivonen 4705 } 4706 else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00, 4707 (byte) 0x3f, (byte) 0x00)) 4708 { 4709 // UTF-16LE (otherwise, malformed UTF-16) 4710 // 0x3c 0x00 0x3f 0x00: UCS-2, little-endian, no byte-order mark 4711 fatal("no byte-order mark for UTF-16 entity"); // s/UCS-2/UTF-16/ -- 2006-02-03 hsivonen 4712 } 4713 // 4714 // THIRD: EBCDIC 4715 // 4716 else if (tryEncoding(signature, (byte) 0x4c, (byte) 0x6f, 4717 (byte) 0xa7, (byte) 0x94)) 4718 { 4719 // 4c 6f a7 94 ... we don't understand EBCDIC flavors 4720 fatal("Unsupported EBCDIC encoding. (XML processors are only required to support UTF-8 and UTF-16.)"); 4721 } 4722 // 4723 // FOURTH: ASCII-derived encodings, fixed and variable lengths 4724 // 4725 else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x3f, 4726 (byte) 0x78, (byte) 0x6d)) 4727 { 4728 // ASCII derived 4729 // 0x3c 0x3f 0x78 0x6d: UTF-8 or other 8-bit markup (read ENCODING) 4730 characterEncoding = null; 4731 prefetchASCIIEncodingDecl(); 4732 } 4733 else if (signature[0] == (byte) 0xef 4734 && signature[1] == (byte) 0xbb 4735 && signature[2] == (byte) 0xbf) 4736 { 4737 // 0xef 0xbb 0xbf: UTF-8 BOM (not part of document text) 4738 // this un-needed notion slipped into XML 2nd ed through a 4739 // "non-normative" erratum; now required by MSFT and UDDI, 4740 // and E22 made it normative. 
4741 is.read(); is.read(); is.read(); 4742 draconianInputStreamReader("UTF-8", is, false); 4743 } 4744 else 4745 { 4746 // (default) UTF-8 without encoding/XML declaration 4747 draconianInputStreamReader("UTF-8", is, false); 4748 } 4749 } 4750 4751 /** 4752 * Check for a four-byte signature. 4753 * <p>Utility routine for detectEncoding (). 4754 * <p>Always looks for some part of "<?XML" in a specific encoding. 4755 * @param sig The first four bytes read. 4756 * @param b1 The first byte of the signature 4757 * @param b2 The second byte of the signature 4758 * @param b3 The third byte of the signature 4759 * @param b4 The fourth byte of the signature 4760 * @see #detectEncoding 4761 */ 4762 private static boolean tryEncoding(byte[] sig, byte b1, byte b2, 4763 byte b3, byte b4) 4764 { 4765 return (sig[0] == b1 && sig[1] == b2 4766 && sig[2] == b3 && sig[3] == b4); 4767 } 4768 4769 /** 4770 * Check for a two-byte signature. 4771 * <p>Looks for a UCS-2 byte-order mark. 4772 * <p>Utility routine for detectEncoding (). 4773 * @param sig The first four bytes read. 4774 * @param b1 The first byte of the signature 4775 * @param b2 The second byte of the signature 4776 * @see #detectEncoding 4777 */ 4778 private static boolean tryEncoding(byte[] sig, byte b1, byte b2) 4779 { 4780 return ((sig[0] == b1) && (sig[1] == b2)); 4781 } 4782 4783 /** 4784 * This method pushes a string back onto input. 4785 * <p>It is useful either as the expansion of an internal entity, 4786 * or for backtracking during the parse. 4787 * <p>Call pushCharArray () to do the actual work. 4788 * @param s The string to push back onto input. 4789 * @see #pushCharArray 4790 */ 4791 private void pushString(String ename, String s) 4792 throws SAXException 4793 { 4794 char[] ch = s.toCharArray(); 4795 pushCharArray(ename, ch, 0, ch.length); 4796 } 4797 4798 /** 4799 * Push a new internal input source. 
4800 * <p>This method is useful for expanding an internal entity, 4801 * or for unreading a string of characters. It creates a new 4802 * readBuffer containing the characters in the array, instead 4803 * of characters converted from an input byte stream. 4804 * @param ch The char array to push. 4805 * @see #pushString 4806 * @see #pushURL 4807 * @see #readBuffer 4808 * @see #sourceType 4809 * @see #pushInput 4810 */ 4811 private void pushCharArray(String ename, char[] ch, int start, int length) 4812 throws SAXException 4813 { 4814 // Push the existing status 4815 pushInput(ename); 4816 if (ename != null && doReport) 4817 { 4818 dataBufferFlush(); 4819 handler.startInternalEntity(ename); 4820 } 4821 sourceType = INPUT_INTERNAL; 4822 readBuffer = ch; 4823 readBufferPos = start; 4824 readBufferLength = length; 4825 readBufferOverflow = -1; 4826 } 4827 4828 /** 4829 * Save the current input source onto the stack. 4830 * <p>This method saves all of the global variables associated with 4831 * the current input source, so that they can be restored when a new 4832 * input source has finished. It also tests for entity recursion. 4833 * <p>The method saves the following global variables onto a stack 4834 * using a fixed-length array: 4835 * <ol> 4836 * <li>sourceType 4837 * <li>externalEntity 4838 * <li>readBuffer 4839 * <li>readBufferPos 4840 * <li>readBufferLength 4841 * <li>line 4842 * <li>characterEncoding 4843 * </ol> 4844 * @param ename The name of the entity (if any) causing the new input. 4845 * @see #popInput 4846 * @see #sourceType 4847 * @see #externalEntity 4848 * @see #readBuffer 4849 * @see #readBufferPos 4850 * @see #readBufferLength 4851 * @see #line 4852 * @see #characterEncoding 4853 */ 4854 private void pushInput(String ename) 4855 throws SAXException 4856 { 4857 // Check for entity recursion. 
4858 if (ename != null) 4859 { 4860 Iterator<String> entities = entityStack.iterator(); 4861 while (entities.hasNext()) 4862 { 4863 String e = entities.next(); 4864 if (e != null && e == ename) 4865 { 4866 fatal("recursive reference to entity", ename, null); 4867 } 4868 } 4869 } 4870 entityStack.addLast(ename); 4871 4872 // Don't bother if there is no current input. 4873 if (sourceType == INPUT_NONE) 4874 { 4875 return; 4876 } 4877 4878 // Set up a snapshot of the current 4879 // input source. 4880 Input input = new Input(); 4881 4882 input.sourceType = sourceType; 4883 input.readBuffer = readBuffer; 4884 input.readBufferPos = readBufferPos; 4885 input.readBufferLength = readBufferLength; 4886 input.line = line; 4887 input.charecterEncoding = characterEncoding; 4888 input.readBufferOverflow = readBufferOverflow; 4889 input.is = is; 4890 input.currentByteCount = currentByteCount; 4891 input.column = column; 4892 input.reader = reader; 4893 input.prev = prev; 4894 input.normalizationChecker = normalizationChecker; 4895 4896 // Push it onto the stack. 4897 inputStack.addLast(input); 4898 } 4899 4900 /** 4901 * Restore a previous input source. 4902 * <p>This method restores all of the global variables associated with 4903 * the current input source. 4904 * @exception java.io.EOFException 4905 * If there are no more entries on the input stack. 
4906 * @see #pushInput 4907 * @see #sourceType 4908 * @see #readBuffer 4909 * @see #readBufferPos 4910 * @see #readBufferLength 4911 * @see #line 4912 * @see #characterEncoding 4913 */ 4914 private void popInput() 4915 throws SAXException, IOException 4916 { 4917 String ename = entityStack.removeLast(); 4918 4919 if (ename != null && doReport) 4920 { 4921 dataBufferFlush(); 4922 } 4923 switch (sourceType) 4924 { 4925 case INPUT_READER: 4926 handler.endExternalEntity(ename); 4927 reader.close(); 4928 break; 4929 case INPUT_INTERNAL: 4930 if (ename != null && doReport) 4931 { 4932 handler.endInternalEntity(ename); 4933 } 4934 break; 4935 } 4936 4937 if (normalizationChecker != null) 4938 { 4939 normalizationChecker.flush(); 4940 } 4941 4942 // Throw an EOFException if there 4943 // is nothing else to pop. 4944 if (inputStack.isEmpty()) 4945 { 4946 throw new EOFException("no more input"); 4947 } 4948 4949 Input input = inputStack.removeLast(); 4950 4951 sourceType = input.sourceType; 4952 readBuffer = input.readBuffer; 4953 readBufferPos = input.readBufferPos; 4954 readBufferLength = input.readBufferLength; 4955 line = input.line; 4956 characterEncoding = input.charecterEncoding; 4957 readBufferOverflow = input.readBufferOverflow; 4958 is = input.is; 4959 currentByteCount = input.currentByteCount; 4960 column = input.column; 4961 reader = input.reader; 4962 prev = input.prev; 4963 normalizationChecker = input.normalizationChecker; 4964 } 4965 4966 /** 4967 * Return true if we can read the expected character. 4968 * <p>Note that the character will be removed from the input stream 4969 * on success, but will be put back on failure. Do not attempt to 4970 * read the character again if the method succeeds. 4971 * @param delim The character that should appear next. For a 4972 * insensitive match, you must supply this in upper-case. 4973 * @return true if the character was successfully read, or false if 4974 * it was not. 
4975 * @see #tryRead (String) 4976 */ 4977 private boolean tryRead(char delim) 4978 throws SAXException, IOException 4979 { 4980 char c; 4981 4982 // Read the character 4983 c = readCh(); 4984 4985 // Test for a match, and push the character 4986 // back if the match fails. 4987 if (c == delim) 4988 { 4989 return true; 4990 } 4991 else 4992 { 4993 unread(c); 4994 return false; 4995 } 4996 } 4997 4998 /** 4999 * Return true if we can read the expected string. 5000 * <p>This is simply a convenience method. 5001 * <p>Note that the string will be removed from the input stream 5002 * on success, but will be put back on failure. Do not attempt to 5003 * read the string again if the method succeeds. 5004 * <p>This method will push back a character rather than an 5005 * array whenever possible (probably the majority of cases). 5006 * @param delim The string that should appear next. 5007 * @return true if the string was successfully read, or false if 5008 * it was not. 5009 * @see #tryRead (char) 5010 */ 5011 private boolean tryRead(String delim) 5012 throws SAXException, IOException 5013 { 5014 return tryRead(delim.toCharArray()); 5015 } 5016 5017 private boolean tryRead(char[] ch) 5018 throws SAXException, IOException 5019 { 5020 char c; 5021 5022 // Compare the input, character- 5023 // by character. 5024 5025 for (int i = 0; i < ch.length; i++) 5026 { 5027 c = readCh(); 5028 if (c != ch[i]) 5029 { 5030 unread(c); 5031 if (i != 0) 5032 { 5033 unread(ch, i); 5034 } 5035 return false; 5036 } 5037 } 5038 return true; 5039 } 5040 5041 /** 5042 * Return true if we can read some whitespace. 5043 * <p>This is simply a convenience method. 5044 * <p>This method will push back a character rather than an 5045 * array whenever possible (probably the majority of cases). 5046 * @return true if whitespace was found. 
5047 */ 5048 private boolean tryWhitespace() 5049 throws SAXException, IOException 5050 { 5051 char c; 5052 c = readCh(); 5053 if (isWhitespace(c)) 5054 { 5055 skipWhitespace(); 5056 return true; 5057 } 5058 else 5059 { 5060 unread(c); 5061 return false; 5062 } 5063 } 5064 5065 private void parseUntil(char[] delim) 5066 throws SAXException, IOException 5067 { 5068 char c; 5069 int startLine = line; 5070 5071 try 5072 { 5073 while (!tryRead(delim)) 5074 { 5075 c = readCh(); 5076 dataBufferAppend(c); 5077 } 5078 } 5079 catch (EOFException e) 5080 { 5081 fatal("end of input while looking for delimiter " 5082 + "(started on line " + startLine 5083 + ')', null, new String(delim)); 5084 } 5085 } 5086 5087 ////////////////////////////////////////////////////////////////////// 5088 // Low-level I/O. 5089 ////////////////////////////////////////////////////////////////////// 5090 5091 /** 5092 * Prefetch US-ASCII XML/text decl from input stream into read buffer. 5093 * Doesn't buffer more than absolutely needed, so that when an encoding 5094 * decl says we need to create an InputStreamReader, we can discard our 5095 * buffer and reset(). Caller knows the first chars of the decl exist 5096 * in the input stream. 5097 */ 5098 private void prefetchASCIIEncodingDecl() 5099 throws SAXException, IOException 5100 { 5101 int ch; 5102 readBufferPos = readBufferLength = 0; 5103 5104 is.mark(readBuffer.length); 5105 while (true) 5106 { 5107 ch = is.read(); 5108 readBuffer[readBufferLength++] = (char) ch; 5109 switch (ch) 5110 { 5111 case (int) '>': 5112 return; 5113 case -1: 5114 fatal("file ends before end of XML or encoding declaration.", 5115 null, "?>"); 5116 } 5117 if (readBuffer.length == readBufferLength) 5118 { 5119 fatal("unfinished XML or encoding declaration"); 5120 } 5121 } 5122 } 5123 5124 /** 5125 * Read a chunk of data from an external input source. 
5126 * <p>This is simply a front-end that fills the rawReadBuffer 5127 * with bytes, then calls the appropriate encoding handler. 5128 * @see #characterEncoding 5129 * @see #rawReadBuffer 5130 * @see #readBuffer 5131 * @see #filterCR 5132 * @see #copyUtf8ReadBuffer 5133 * @see #copyIso8859_1ReadBuffer 5134 * @see #copyUcs_2ReadBuffer 5135 * @see #copyUcs_4ReadBuffer 5136 */ 5137 private void readDataChunk() 5138 throws SAXException, IOException 5139 { 5140 int count; 5141 5142 // See if we have any overflow (filterCR sets for CR at end) 5143 if (readBufferOverflow > -1) 5144 { 5145 readBuffer[0] = (char) readBufferOverflow; 5146 readBufferOverflow = -1; 5147 readBufferPos = 1; 5148 sawCR = true; 5149 } 5150 else 5151 { 5152 readBufferPos = 0; 5153 sawCR = false; 5154 } 5155 5156 try 5157 { 5158 count = reader.read(readBuffer, 5159 readBufferPos, READ_BUFFER_MAX - readBufferPos); 5160 } 5161 catch(CharacterCodingException cce) 5162 { 5163 // 2006-04-25 hsivonen 5164 fatal("Input data does not conform to the input encoding. The input encoding was " + characterEncoding + "."); 5165 return; // never happens 5166 } 5167 if (normalizationChecker != null && count > 0) 5168 { 5169 normalizationChecker.characters(readBuffer, readBufferPos, count); 5170 } 5171 if (count < 0) 5172 { 5173 readBufferLength = readBufferPos; 5174 } 5175 else 5176 { 5177 readBufferLength = readBufferPos + count; 5178 } 5179 if (readBufferLength > 0) 5180 { 5181 filterCR(count >= 0); 5182 } 5183 sawCR = false; 5184 } 5185 5186 /** 5187 * Filter carriage returns in the read buffer. 5188 * CRLF becomes LF; CR becomes LF. 
5189 * @param moreData true iff more data might come from the same source 5190 * @see #readDataChunk 5191 * @see #readBuffer 5192 * @see #readBufferOverflow 5193 */ 5194 private void filterCR(boolean moreData) 5195 { 5196 int i, j; 5197 5198 readBufferOverflow = -1; 5199 5200 loop: 5201 for (i = j = readBufferPos; j < readBufferLength; i++, j++) 5202 { 5203 switch (readBuffer[j]) 5204 { 5205 case '\r': 5206 if (j == readBufferLength - 1) 5207 { 5208 if (moreData) 5209 { 5210 readBufferOverflow = '\r'; 5211 readBufferLength--; 5212 } 5213 else // CR at end of buffer 5214 { 5215 readBuffer[i++] = '\n'; 5216 } 5217 break loop; 5218 } 5219 else if (readBuffer[j + 1] == '\n') 5220 { 5221 j++; 5222 } 5223 readBuffer[i] = '\n'; 5224 break; 5225 5226 case '\n': 5227 default: 5228 readBuffer[i] = readBuffer[j]; 5229 break; 5230 } 5231 } 5232 readBufferLength = i; 5233 } 5234 5235 private void warnAboutPrivateUseChar() throws SAXException { 5236 if (!alreadyWarnedAboutPrivateUseCharacters) { 5237 handler.warn("Document uses the Unicode Private Use Area(s), which should not be used in publicly exchanged documents. (Charmod C073)"); 5238 alreadyWarnedAboutPrivateUseCharacters = true; 5239 } 5240 } 5241 5242 // copied from fi.iki.hsivonen.htmlparser 5243 5244 private boolean isPrivateUse(char c) { 5245 return c >= '\uE000' && c <= '\uF8FF'; 5246 } 5247 5248 private boolean isPrivateUse(int c) { 5249 return (c >= 0xE000 && c <= 0xF8FF) || (c >= 0xF0000 && c <= 0xFFFFD) || (c >= 0x100000 && c <= 0x10FFFD); 5250 } 5251 5252 private boolean isAstralPrivateUse(int c) { 5253 return (c >= 0xF0000 && c <= 0xFFFFD) || (c >= 0x100000 && c <= 0x10FFFD); 5254 } 5255 5256 private boolean isNonCharacter(int c) { 5257 return (c & 0xFFFE) == 0xFFFE; 5258 } 5259 5260 ////////////////////////////////////////////////////////////////////// 5261 // Local Variables. 
5262 ////////////////////////////////////////////////////////////////////// 5263 5264 /** 5265 * Re-initialize the variables for each parse. 5266 */ 5267 private void initializeVariables() 5268 { 5269 prev = '\u0000'; 5270 // First line 5271 line = 1; 5272 column = 0; 5273 5274 // Set up the buffers for data and names 5275 dataBufferPos = 0; 5276 dataBuffer = new char[DATA_BUFFER_INITIAL]; 5277 nameBufferPos = 0; 5278 nameBuffer = new char[NAME_BUFFER_INITIAL]; 5279 5280 // Set up the DTD hash tables 5281 elementInfo = new HashMap<String, ElementDecl>(); 5282 entityInfo = new HashMap<String, EntityInfo>(); 5283 notationInfo = new HashMap<String, String>(); 5284 skippedPE = false; 5285 5286 // Set up the variables for the current 5287 // element context. 5288 currentElement = null; 5289 currentElementContent = CONTENT_UNDECLARED; 5290 5291 // Set up the input variables 5292 sourceType = INPUT_NONE; 5293 inputStack = new LinkedList<Input>(); 5294 entityStack = new LinkedList<String>(); 5295 tagAttributePos = 0; 5296 tagAttributes = new String[100]; 5297 rawReadBuffer = new byte[READ_BUFFER_MAX]; 5298 readBufferOverflow = -1; 5299 5300 inLiteral = false; 5301 expandPE = false; 5302 peIsError = false; 5303 5304 doReport = false; 5305 5306 inCDATA = false; 5307 5308 symbolTable = new Object[SYMBOL_TABLE_LENGTH][]; 5309 5310 if (handler.checkNormalization) { 5311 normalizationChecker = new NormalizationChecker(true); 5312 normalizationChecker.setDocumentLocator(handler); 5313 normalizationChecker.setErrorHandler(handler.getErrorHandler()); 5314 } else { 5315 normalizationChecker = null; 5316 } 5317 } 5318 5319 static class ExternalIdentifiers 5320 { 5321 5322 String publicId; 5323 String systemId; 5324 String baseUri; 5325 5326 ExternalIdentifiers() 5327 { 5328 } 5329 5330 ExternalIdentifiers(String publicId, String systemId, String baseUri) 5331 { 5332 this.publicId = publicId; 5333 this.systemId = systemId; 5334 this.baseUri = baseUri; 5335 } 5336 5337 } 5338 5339 
static class EntityInfo 5340 { 5341 5342 int type; 5343 ExternalIdentifiers ids; 5344 String value; 5345 String notationName; 5346 5347 } 5348 5349 static class AttributeDecl 5350 { 5351 5352 String type; 5353 String value; 5354 int valueType; 5355 String enumeration; 5356 String defaultValue; 5357 5358 } 5359 5360 static class ElementDecl 5361 { 5362 5363 int contentType; 5364 String contentModel; 5365 HashMap<String, AttributeDecl> attributes; 5366 5367 } 5368 5369 static class Input 5370 { 5371 char prev; 5372 int sourceType; 5373 char[] readBuffer; 5374 int readBufferPos; 5375 int readBufferLength; 5376 int line; 5377 String charecterEncoding; 5378 int readBufferOverflow; 5379 InputStream is; 5380 int currentByteCount; 5381 int column; 5382 Reader reader; 5383 NormalizationChecker normalizationChecker; 5384 } 5385 5386 } 5387