001    /* XmlParser.java -- 
002       Copyright (C) 1999,2000,2001 Free Software Foundation, Inc.
003       Portions Copyright 2006 Henri Sivonen.
004    
005    This file is part of GNU JAXP.
006    
007    GNU JAXP is free software; you can redistribute it and/or modify
008    it under the terms of the GNU General Public License as published by
009    the Free Software Foundation; either version 2, or (at your option)
010    any later version.
011    
012    GNU JAXP is distributed in the hope that it will be useful, but
013    WITHOUT ANY WARRANTY; without even the implied warranty of
014    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
015    General Public License for more details.
016    
017    You should have received a copy of the GNU General Public License
018    along with GNU JAXP; see the file COPYING.  If not, write to the
019    Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
020    02111-1307 USA.
021    
022    Linking this library statically or dynamically with other modules is
023    making a combined work based on this library.  Thus, the terms and
024    conditions of the GNU General Public License cover the whole
025    combination.
026    
027    As a special exception, the copyright holders of this library give you
028    permission to link this library with independent modules to produce an
029    executable, regardless of the license terms of these independent
030    modules, and to copy and distribute the resulting executable under
031    terms of your choice, provided that you also meet, for each linked
032    independent module, the terms and conditions of the license of that
033    module.  An independent module is a module which is not derived from
034    or based on this library.  If you modify this library, you may extend
035    this exception to your version of the library, but you are not
036    obligated to do so.  If you do not wish to do so, delete this
037    exception statement from your version.
038    
039    Partly derived from code which carried the following notice:
040    
041      Copyright (c) 1997, 1998 by Microstar Software Ltd.
042    
043      AElfred is free for both commercial and non-commercial use and
044      redistribution, provided that Microstar's copyright and disclaimer are
045      retained intact.  You are free to modify AElfred for your own use and
046      to redistribute AElfred with your modifications, provided that the
047      modifications are clearly documented.
048    
049      This program is distributed in the hope that it will be useful, but
050      WITHOUT ANY WARRANTY; without even the implied warranty of
051      merchantability or fitness for a particular purpose.  Please use it AT
052      YOUR OWN RISK.
053    */
054    
055    package fi.iki.hsivonen.gnu.xml.aelfred2;
056    
057    import java.io.BufferedInputStream;
058    import java.io.EOFException;
059    import java.io.IOException;
060    import java.io.InputStream;
061    import java.io.InputStreamReader;
062    import java.io.Reader;
063    import java.nio.charset.CharacterCodingException;
064    import java.nio.charset.Charset;
065    import java.nio.charset.CharsetDecoder;
066    import java.nio.charset.CodingErrorAction;
067    import java.nio.charset.IllegalCharsetNameException;
068    import java.nio.charset.UnsupportedCharsetException;
069    import java.util.HashMap;
070    import java.util.Iterator;
071    import java.util.LinkedList;
072    
073    import org.xml.sax.InputSource;
074    import org.xml.sax.SAXException;
075    
076    import fi.iki.hsivonen.io.EncodingInfo;
077    import fi.iki.hsivonen.xml.checker.NormalizationChecker;
078    
079    // Organized imports -- 2005-08-20 hsivonen
080    
081    /**
082     * Parse XML documents and return parse events through call-backs.
083     * Use the <code>SAXDriver</code> class as your entry point, as all
084     * internal parser interfaces are subject to change.
085     *
086     * @author Written by David Megginson &lt;dmeggins@microstar.com&gt;
087     *      (version 1.2a with bugfixes)
088     * @author Updated by David Brownell &lt;dbrownell@users.sourceforge.net&gt;
089     * @see SAXDriver
090     */
091    final class XmlParser
092    {
093        
094      // avoid slow per-character readCh()
095      private final static boolean USE_CHEATS = true;
096    
097      ////////////////////////////////////////////////////////////////////////
098      // Constants.
099      ////////////////////////////////////////////////////////////////////////
100    
101      private static final int SURROGATE_OFFSET = 0x10000 - (0xD800 << 10) - 0xDC00;
102      
103      //
104      // Constants for element content type.
105      //
106      
107      /**
108       * Constant: an element has not been declared.
109       * @see #getElementContentType
110       */
111      public final static int CONTENT_UNDECLARED = 0;
112      
113      /**
114       * Constant: the element has a content model of ANY.
115       * @see #getElementContentType
116       */
117      public final static int CONTENT_ANY = 1;
118      
119      /**
120       * Constant: the element has declared content of EMPTY.
121       * @see #getElementContentType
122       */
123      public final static int CONTENT_EMPTY = 2;
124      
125      /**
126       * Constant: the element has mixed content.
127       * @see #getElementContentType
128       */
129      public final static int CONTENT_MIXED = 3;
130      
131      /**
132       * Constant: the element has element content.
133       * @see #getElementContentType
134       */
135      public final static int CONTENT_ELEMENTS = 4;
136      
137      
138      //
139      // Constants for the entity type.
140      //
141      
142      /**
143       * Constant: the entity has not been declared.
144       * @see #getEntityType
145       */
146      public final static int ENTITY_UNDECLARED = 0;
147      
148      /**
149       * Constant: the entity is internal.
150       * @see #getEntityType
151       */
152      public final static int ENTITY_INTERNAL = 1;
153      
154      /**
155       * Constant: the entity is external, non-parsable data.
156       * @see #getEntityType
157       */
158      public final static int ENTITY_NDATA = 2;
159      
160      /**
161       * Constant: the entity is external XML data.
162       * @see #getEntityType
163       */
164      public final static int ENTITY_TEXT = 3;
165        
166      //
167      // Attribute type constants are interned literal strings.
168      //
169        
170      //
171      // Constants for attribute default value.
172      //
173      
174      /**
175       * Constant: the attribute is not declared.
176       * @see #getAttributeDefaultValueType
177       */
178      public final static int ATTRIBUTE_DEFAULT_UNDECLARED = 30;
179      
180      /**
181       * Constant: the attribute has a literal default value specified.
182       * @see #getAttributeDefaultValueType
183       * @see #getAttributeDefaultValue
184       */
185      public final static int ATTRIBUTE_DEFAULT_SPECIFIED = 31;
186      
187      /**
188       * Constant: the attribute was declared #IMPLIED.
189       * @see #getAttributeDefaultValueType
190       */
191      public final static int ATTRIBUTE_DEFAULT_IMPLIED = 32;
192      
193      /**
194       * Constant: the attribute was declared #REQUIRED.
195       * @see #getAttributeDefaultValueType
196       */
197      public final static int ATTRIBUTE_DEFAULT_REQUIRED = 33;
198      
199      /**
200       * Constant: the attribute was declared #FIXED.
201       * @see #getAttributeDefaultValueType
202       * @see #getAttributeDefaultValue
203       */
204      public final static int ATTRIBUTE_DEFAULT_FIXED = 34;
205        
206      //
207      // Constants for input.
208      //
209      private final static int INPUT_NONE = 0;
210      private final static int INPUT_INTERNAL = 1;
211      private final static int INPUT_READER = 5;
212      
213      //
214      // Flags for reading literals.
215      //
216      // expand general entity refs (attribute values in dtd and content)
217      private final static int LIT_ENTITY_REF = 2;
218      // normalize this value (space chars) (attributes, public ids)
219      private final static int LIT_NORMALIZE = 4;
220      // literal is an attribute value 
221      private final static int LIT_ATTRIBUTE = 8;
222      // don't expand parameter entities
223      private final static int LIT_DISABLE_PE = 16;
224      // don't expand [or parse] character refs
225      private final static int LIT_DISABLE_CREF = 32;
226      // don't parse general entity refs
227      private final static int LIT_DISABLE_EREF = 64;
228      // literal is a public ID value 
229      private final static int LIT_PUBID = 256;
230        
231      //
232      // Flags affecting PE handling in DTDs (if expandPE is true).
233      // PEs expand with space padding, except inside literals.
234      //
235      private final static int CONTEXT_NORMAL = 0;
236      private final static int CONTEXT_LITERAL = 1;
237      
238      // Emit warnings for relative URIs with no base URI.
239      static boolean uriWarnings;
240      static
241      {
242        String key = "gnu.xml.aelfred2.XmlParser.uriWarnings";
243        try
244          {
245            uriWarnings = "true".equals(System.getProperty(key));
246          }
247        catch (SecurityException e)
248          {
249            uriWarnings = false;
250          }
251      }
252        
253      //
254      // The current XML handler interface.
255      //
256      private SAXDriver handler;
257      
258      //
259      // I/O information.
260      //
261      private Reader reader;   // current reader
262      private InputStream is;     // current input stream
263      private int line;     // current line number
264      private int column;   // current column number
265      private int sourceType;   // type of input source
266      private LinkedList<Input> inputStack;   // stack of input soruces
267      private String characterEncoding;   // current character encoding
268      private int currentByteCount; // bytes read from current source
269      private InputSource scratch;  // temporary
270      
271      //
272      // Buffers for decoded but unparsed character input.
273      //
274      private char[] readBuffer;
275      private int readBufferPos;
276      private int readBufferLength;
277      private int readBufferOverflow;  // overflow from last data chunk.
278      
279      //
280      // Buffer for undecoded raw byte input.
281      //
282      private final static int READ_BUFFER_MAX = 16384;
283      private byte[] rawReadBuffer;
284      
285      
286      //
287      // Buffer for attribute values, char refs, DTD stuff.
288      //
289      private static int DATA_BUFFER_INITIAL = 4096;
290      private char[] dataBuffer;
291      private int dataBufferPos;
292      
293      //
294      // Buffer for parsed names.
295      //
296      private static int NAME_BUFFER_INITIAL = 1024;
297      private char[] nameBuffer;
298      private int nameBufferPos;
299      
300      //
301      // Save any standalone flag
302      //
303      private boolean docIsStandalone;
304      
305      //
306      // Hashtables for DTD information on elements, entities, and notations.
307      // Populated until we start ignoring decls (because of skipping a PE)
308      //
309      private HashMap<String, ElementDecl> elementInfo;
310      private HashMap<String, EntityInfo> entityInfo;
311      private HashMap<String, String> notationInfo;
312      private boolean skippedPE;
313      
314      //
315      // Element type currently in force.
316      //
317      private String currentElement;
318      private int currentElementContent;
319      
320      //
321      // Stack of entity names, to detect recursion.
322      //
323      private LinkedList<String> entityStack;
324      
325      //
326      // PE expansion is enabled in most chunks of the DTD, not all.
327      // When it's enabled, literals are treated differently.
328      //
329      private boolean inLiteral;
330      private boolean expandPE;
331      private boolean peIsError;
332      
333      //
334      // can't report entity expansion inside two constructs:
335      // - attribute expansions (internal entities only)
336      // - markup declarations (parameter entities only)
337      //
338      private boolean doReport;
339      
340      //
341      // Symbol table, for caching interned names.
342      //
343      // These show up wherever XML names or nmtokens are used:  naming elements,
344      // attributes, PIs, notations, entities, and enumerated attribute values.
345      //
346      // NOTE:  This hashtable doesn't grow.  The default size is intended to be
347      // rather large for most documents.  Example:  one snapshot of the DocBook
348      // XML 4.1 DTD used only about 350 such names.  As a rule, only pathological
349      // documents (ones that don't reuse names) should ever see much collision.
350      //
351      // Be sure that SYMBOL_TABLE_LENGTH always stays prime, for best hashing.
352      // "2039" keeps the hash table size at about two memory pages on typical
353      // 32 bit hardware.
354      //
355      private final static int SYMBOL_TABLE_LENGTH = 2039;
356      
357      private Object[][] symbolTable;
358      
359      //
360      // Hash table of attributes found in current start tag.
361      //
362      private String[] tagAttributes;
363      private int tagAttributePos;
364      
365      //
366      // Utility flag: have we noticed a CR while reading the last
367      // data chunk?  If so, we will have to go back and normalise
368      // CR or CR/LF line ends.
369      //
370      private boolean sawCR;
371      
372      //
373      // Utility flag: are we in CDATA?  If so, whitespace isn't ignorable.
374      // 
375      private boolean inCDATA;
376      
377      //
378      // Xml version.
379      //  
380      private static final int XML_10 = 0; 
381      private static final int XML_11 = 1; 
382      private int xmlVersion = XML_10;
383      
384      //
385      // Normalization checking
386      //
387    
388      private NormalizationChecker normalizationChecker;
389      
390      //////////////////////////////////////////////////////////////////////
391      // Constructors.
392      ////////////////////////////////////////////////////////////////////////
393      
/**
 * Create a parser that has no handler yet.  A handler must be supplied
 * with {@link #setHandler} before {@link #doParse} is called, since
 * doParse rejects a parser without one.
 * @see #setHandler
 * @see #doParse
 */
// package private
XmlParser()
{
  super();
}
403    
404      /**
405       * Set the handler that will receive parsing events.
406       * @param handler The handler to receive callback events.
407       * @see #parse
408       */
409      // package private
410      void setHandler(SAXDriver handler)
411      {
412        this.handler = handler;
413      }
414    
415      /**
416       * Parse an XML document from the character stream, byte stream, or URI
417       * that you provide (in that order of preference).  Any URI that you
418       * supply will become the base URI for resolving relative URI, and may
419       * be used to acquire a reader or byte stream.
420       *
421       * <p> Only one thread at a time may use this parser; since it is
422       * private to this package, post-parse cleanup is done by the caller,
423       * which MUST NOT REUSE the parser (just null it).
424       *
425       * @param systemId Absolute URI of the document; should never be null,
426       *    but may be so iff a reader <em>or</em> a stream is provided.
427       * @param publicId The public identifier of the document, or null.
428       * @param reader A character stream; must be null if stream isn't.
429       * @param stream A byte input stream; must be null if reader isn't.
430       * @param characterEncoding The suggested encoding, or null if unknown.
431       * @exception java.lang.Exception Basically SAXException or IOException
432       */
433      // package private 
434      void doParse(String systemId, String publicId, Reader reader,
435                   InputStream stream, String encoding)
436        throws Exception
437      {
438        if (handler == null)
439          {
440            throw new IllegalStateException("no callback handler");
441          }
442    
443        alreadyWarnedAboutPrivateUseCharacters = false;
444        initializeVariables();
445    
446        // predeclare the built-in entities here (replacement texts)
447        // we don't need to intern(), since we're guaranteed literals
448        // are always (globally) interned.
449        setInternalEntity("amp", "&#38;");
450        setInternalEntity("lt", "&#60;");
451        setInternalEntity("gt", "&#62;");
452        setInternalEntity("apos", "&#39;");
453        setInternalEntity("quot", "&#34;");
454    
455        try
456          {
457            // pushURL first to ensure locator is correct in startDocument
458            // ... it might report an IO or encoding exception.
459            handler.startDocument();
460            pushURL(false, "[document]",
461                    // default baseURI: null
462                    new ExternalIdentifiers(publicId, systemId, null),
463                    reader, stream, encoding, false);
464            
465            parseDocument();
466          }
467        catch (EOFException e)
468          {
469            //empty input
470            fatal("empty document, with no root element.");
471          }
472        finally
473          {
474            if (reader != null)
475              {
476                try
477                  {
478                    reader.close();
479                  }
480                catch (IOException e)
481                  {
482                    /* ignore */
483                  }
484              }
485            if (stream != null)
486              {
487                try
488                  {
489                    stream.close();
490                  }
491                catch (IOException e)
492                  {
493                    /* ignore */
494                  }
495              }
496            if (is != null)
497              {
498                try
499                  {
500                    is.close();
501                  }
502                catch (IOException e)
503                  {
504                    /* ignore */
505                  }
506              }
507          }
508      }
509    
510      //////////////////////////////////////////////////////////////////////
511      // Error reporting.
512      //////////////////////////////////////////////////////////////////////
513        
514      /**
515       * Report an error.
516       * @param message The error message.
517       * @param textFound The text that caused the error (or null).
518       * @see SAXDriver#error
519       * @see #line
520       */
521      private void fatal(String message, String textFound, String textExpected)
522        throws SAXException
523      {
524          // smart quotes -- 2005-08-20 hsivonen
525        if (textFound != null)
526          {
527            message = message + " (found \u201C" + textFound + "\u201D)";
528          }
529        if (textExpected != null)
530          {
531            message = message + " (expected \u201C" + textExpected + "\u201D)";
532          }
533        handler.fatal(message);
534        
535        // "can't happen"
536        throw new SAXException(message);
537      }
538    
539      /**
540       * Report a serious error.
541       * @param message The error message.
542       * @param textFound The text that caused the error (or null).
543       */
544      private void fatal(String message, char textFound, String textExpected)
545        throws SAXException
546      {
547        fatal(message, new Character(textFound).toString(), textExpected);
548      }
549    
550      /**
551       * Report typical case fatal errors.
552       */
553      private void fatal(String message)
554        throws SAXException
555      {
556        handler.fatal(message);
557      }
558    
559      /**
560       * Report non-fatal errors.
561       */
562      private void err(String message)
563        throws SAXException
564      {
565        handler.verror(message);
566      }
567      
568      //////////////////////////////////////////////////////////////////////
569      // Major syntactic productions.
570      //////////////////////////////////////////////////////////////////////
571    
572      /**
573       * Parse an XML document.
574       * <pre>
575       * [1] document ::= prolog element Misc*
576       * </pre>
577       * <p>This is the top-level parsing function for a single XML
578       * document.  As a minimum, a well-formed document must have
579       * a document element, and a valid document must have a prolog
580       * (one with doctype) as well.
581       */
582      private void parseDocument()
583        throws Exception
584      {
585        try
586          {                                       // added by MHK
587            boolean sawDTD = parseProlog();
588            require('<');
589            parseElement(!sawDTD);
590          }
591        catch (EOFException ee)
592          {                 // added by MHK
593            fatal("premature end of file", "[EOF]", null);
594          }
595        
596        try
597          {
598            parseMisc();   //skip all white, PIs, and comments
599            char c = readCh();    //if this doesn't throw an exception...
600            fatal("unexpected characters after document end", c, null);
601          }
602        catch (EOFException e)
603          {
604            if (normalizationChecker != null) {
605                normalizationChecker.flush();
606            }
607            return;
608          }
609      }
610      
611      static final char[] startDelimComment = { '<', '!', '-', '-' };
612      static final char[] endDelimComment = { '-', '-' };
613    
614      /**
615       * Skip a comment.
616       * <pre>
617       * [15] Comment ::= '&lt;!--' ((Char - '-') | ('-' (Char - '-')))* "-->"
618       * </pre>
619       * <p> (The <code>&lt;!--</code> has already been read.)
620       */
621      private void parseComment()
622        throws Exception
623      {
624        boolean saved = expandPE;
625        
626        expandPE = false;
627        parseUntil(endDelimComment);
628        require('>');
629        expandPE = saved;
630        handler.comment(dataBuffer, 0, dataBufferPos);
631        dataBufferPos = 0;
632      }
633      
634      static final char[] startDelimPI = { '<', '?' };
635      static final char[] endDelimPI = { '?', '>' };
636    
637      /**
638       * Parse a processing instruction and do a call-back.
639       * <pre>
640       * [16] PI ::= '&lt;?' PITarget
641       *    (S (Char* - (Char* '?&gt;' Char*)))?
642       *    '?&gt;'
643       * [17] PITarget ::= Name - ( ('X'|'x') ('M'|m') ('L'|l') )
644       * </pre>
645       * <p> (The <code>&lt;?</code> has already been read.)
646       */
647      private void parsePI()
648        throws SAXException, IOException
649      {
650        String name;
651        boolean saved = expandPE;
652        
653        expandPE = false;
654        name = readNmtoken(true);
655        //NE08
656        if (name.indexOf(':') >= 0)
657          {
658            fatal("Illegal character(':') in processing instruction name ",
659                  name, null);
660          }
661        if ("xml".equalsIgnoreCase(name))
662          {
663            fatal("Illegal processing instruction target", name, null);
664          }
665        if (!tryRead(endDelimPI))
666          {
667            requireWhitespace();
668            parseUntil(endDelimPI);
669          }
670        expandPE = saved;
671        handler.processingInstruction(name, dataBufferToString());
672      }
673      
674      static final char[] endDelimCDATA = { ']', ']', '>' };
675    
676      private boolean isDirtyCurrentElement;
677    
678    private boolean alreadyWarnedAboutPrivateUseCharacters;
679    
680    private char prev;
681    
682      /**
683       * Parse a CDATA section.
684       * <pre>
685       * [18] CDSect ::= CDStart CData CDEnd
686       * [19] CDStart ::= '&lt;![CDATA['
687       * [20] CData ::= (Char* - (Char* ']]&gt;' Char*))
688       * [21] CDEnd ::= ']]&gt;'
689       * </pre>
690       * <p> (The '&lt;![CDATA[' has already been read.)
691       */
692      private void parseCDSect()
693        throws Exception
694      {
695        parseUntil(endDelimCDATA);
696        dataBufferFlush();
697      }
698    
699      /**
700       * Parse the prolog of an XML document.
701       * <pre>
702       * [22] prolog ::= XMLDecl? Misc* (Doctypedecl Misc*)?
703       * </pre>
704       * <p>We do not look for the XML declaration here, because it was
705       * handled by pushURL ().
706       * @see pushURL
707       * @return true if a DTD was read.
708       */
709      private boolean parseProlog()
710        throws Exception
711      {
712        parseMisc();
713    
714        if (tryRead("<!DOCTYPE"))
715          {
716            parseDoctypedecl();
717            parseMisc();
718            return true;
719          }
720        return false;
721      }
722    
723      private void checkLegalVersion(String version)
724        throws SAXException
725      {
726        int len = version.length();
727        for (int i = 0; i < len; i++)
728          {
729            char c = version.charAt(i);
730            if ('0' <= c && c <= '9')
731              {
732                continue;
733              }
734            if (c == '_' || c == '.' || c == ':' || c == '-')
735              {
736                continue;
737              }
738            if ('a' <= c && c <= 'z')
739              {
740                continue;
741              }
742            if ('A' <= c && c <= 'Z')
743              {
744                continue;
745              }
746            fatal ("illegal character in version", version, "1.0");
747          }
748      }
749    
750      /**
751       * Parse the XML declaration.
752       * <pre>
753       * [23] XMLDecl ::= '&lt;?xml' VersionInfo EncodingDecl? SDDecl? S? '?&gt;'
754       * [24] VersionInfo ::= S 'version' Eq
755       *    ("'" VersionNum "'" | '"' VersionNum '"' )
756       * [26] VersionNum ::= ([a-zA-Z0-9_.:] | '-')*
757       * [32] SDDecl ::= S 'standalone' Eq
758       *    ( "'"" ('yes' | 'no') "'"" | '"' ("yes" | "no") '"' )
759       * [80] EncodingDecl ::= S 'encoding' Eq
760       *    ( "'" EncName "'" | "'" EncName "'" )
761       * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
762       * </pre>
763       * <p> (The <code>&lt;?xml</code> and whitespace have already been read.)
764       * @return the encoding in the declaration, uppercased; or null
765       * @see #parseTextDecl
766       * @see #setupDecoding
767       */
768      private String parseXMLDecl(String encoding)
769        throws SAXException, IOException
770      {
771        String version;
772        String encodingName = null;
773        String standalone = null;
774        int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
775        
776        // Read the version.
777        require("version");
778        parseEq();
779        checkLegalVersion(version = readLiteral(flags));
780        if (!version.equals("1.0"))
781          {
782            if (version.equals("1.1"))
783              {
784                fatal("XML 1.1 not supported."); // 2006-04-24 hsivonen
785              }
786            else
787              {
788                fatal("illegal XML version", version, "1.0"); // removed 1.1 -- 2006-04-24 hsivonen
789              }
790          }
791        else
792          {
793            xmlVersion = XML_10;
794          }
795        // Try reading an encoding declaration.
796        boolean white = tryWhitespace();
797        
798        if (tryRead("encoding"))
799          {
800            if (!white)
801              {
802                fatal("whitespace required before 'encoding='");
803              }
804            parseEq();
805            encodingName = readLiteral(flags);
806            checkEncodingLiteral(encodingName); // 2006-04-28 hsivonen
807            if (reader == null)
808              {
809                draconianInputStreamReader(encodingName, is, true);
810              }
811            else
812              {
813                checkEncodingMatch(encoding, encodingName);
814              }
815          }
816        
817        // Try reading a standalone declaration
818        if (encodingName != null)
819          {
820            white = tryWhitespace();
821          }
822        else
823          {
824            if (encoding == null)
825              {
826                draconianInputStreamReader("UTF-8", is, false); // 2006-04-24 hsivonen
827              }
828            warnAboutLackOfEncodingDecl(encoding);  
829          }
830        if (tryRead("standalone"))
831          {
832            if (!white)
833              {
834                fatal("whitespace required before 'standalone='");
835              }
836            parseEq();
837            standalone = readLiteral(flags);
838            if ("yes".equals(standalone))
839              {
840                docIsStandalone = true;
841              }
842            else if (!"no".equals(standalone))
843              {
844                fatal("standalone flag must be 'yes' or 'no'");
845              }
846          }
847    
848        skipWhitespace();
849        require("?>");
850        
851        return encodingName;
852      }
853    
854      // hsivonen 2006-04-28
855      private void checkEncodingLiteral(String encodingName) 
856        throws SAXException
857        {
858          if (encodingName == null) 
859            {
860              return;
861            }
862          if (encodingName.length() == 0)
863            {
864              fatal("The empty string does not a legal encoding name.");
865            }
866          char c = encodingName.charAt(0);
867          if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')))
868            {
869              fatal("The encoding name must start with an ASCII letter.");
870            }
871          for (int i = 1; i < encodingName.length(); i++) 
872            {
873              c = encodingName.charAt(i);
874              if (!((c >= 'a' && c <= 'z') 
875                 || (c >= 'A' && c <= 'Z') 
876                 || (c >= '0' && c <= '9')
877                 || (c == '.')
878                 || (c == '_')
879                 || (c == '-')))
880                {
881                  fatal("Illegal character in encoding name: U+" + Integer.toHexString(c) + ".");
882                }    
883            }
884        }
885    
886    /**
887       * Parse a text declaration.
888       * <pre>
889       * [79] TextDecl ::= '&lt;?xml' VersionInfo? EncodingDecl S? '?&gt;'
890       * [80] EncodingDecl ::= S 'encoding' Eq
891       *    ( '"' EncName '"' | "'" EncName "'" )
892       * [81] EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')*
893       * </pre>
894       * <p> (The <code>&lt;?xml</code>' and whitespace have already been read.)
895       * @return the encoding in the declaration, uppercased; or null
896       * @see #parseXMLDecl
897       * @see #setupDecoding
898       */
899      private String parseTextDecl(String encoding)
900        throws SAXException, IOException
901      {
902        String encodingName = null;
903        int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
904    
905        // Read an optional version.
906        if (tryRead ("version"))
907          {
908            String version;
909            parseEq();
910            checkLegalVersion(version = readLiteral(flags));
911            if (!version.equals("1.0"))
912            {
913              if (version.equals("1.1"))
914                {
915                  fatal("XML 1.1 not supported."); // 2006-04-24 hsivonen
916                }
917              else
918                {
919                  fatal("illegal XML version", version, "1.0"); // removed 1.1 -- 2006-04-24 hsivonen
920                }
921            }
922            requireWhitespace();
923          }
924        
925        // Read the encoding.
926        require("encoding");
927        parseEq();
928        encodingName = readLiteral(flags);
929        checkEncodingLiteral(encodingName); // 2006-04-28 hsivonen
930        if (reader == null)
931          {
932            draconianInputStreamReader(encodingName, is, true);
933          }
934        else 
935          {
936            checkEncodingMatch(encoding, encodingName);
937          }
938        skipWhitespace();
939        require("?>");
940        
941        return encodingName;
942      }
943    
944      /**
945       * Checks the encoding declared in the document (<code>detected</code>) against the one in use:
946       * fatal if it contradicts the sniffed encoding, warning if external information overrides it.
947       * (method added -- 2006-02-03 hsivonen) */
948      private void checkEncodingMatch(String used, String detected) throws SAXException {
949          if (used == null) {
950              if (!characterEncoding.equalsIgnoreCase(detected)) { // stored upper-cased by draconianInputStreamReader; names are case-insensitive
951                  fatal("Declared character encoding was not the one sniffed from the BOM.", detected, characterEncoding);
952              }
953          } else if (!"".equals(used) && !used.equalsIgnoreCase(detected)) {
954              handler.warn("External encoding information specified " + used
955                    + ", but XML declaration specified " + detected
956                    + ". Allowing external to override per RFC 3023. The well-formedness status of this document may change when decoupled from the external character encoding information.");
957          }
958      }
959    
960      private void draconianInputStreamReader(String encoding,
961              InputStream stream, boolean requireAsciiSuperset) 
962        throws SAXException, IOException 
963      { // Convenience overload: the name used for the charset lookup is also passed as actualName.
964          draconianInputStreamReader(encoding, stream, requireAsciiSuperset, encoding);
965      }
966          
967      /**
968       * Installs <code>reader</code> as a strict decoder over <code>stream</code>: malformed or
969       * unmappable input is reported instead of being replaced.  <code>encoding</code> selects the
970       * charset; <code>actualName</code> is what gets recorded in <code>characterEncoding</code>.
971       * Questionable encoding names are reported per Charmod C022-C024.  (method added -- 2005-08-21 hsivonen)
972       */
973      private void draconianInputStreamReader(String encoding,
974              InputStream stream, boolean requireAsciiSuperset, String actualName) 
975        throws SAXException, IOException 
976      {
977        sourceType = INPUT_READER;
978        // Encoding names are ASCII: upper-case with a fixed locale so a Turkish default locale cannot map 'i' to U+0130.
979        characterEncoding = actualName.toUpperCase(java.util.Locale.ENGLISH);
980        encoding = encoding.toUpperCase(java.util.Locale.ENGLISH);
981        try 
982          {
983            Charset cs = Charset.forName(encoding);
984            String canonName = cs.name();
985            if (requireAsciiSuperset && !EncodingInfo.isAsciiSuperset(canonName))
986              {
987                fatal("The encoding \u201C" + encoding + "\u201D is not an ASCII superset and, therefore, cannot be used in an internal encoding declaration.");
988              }
989            if (canonName.startsWith("X-") || canonName.startsWith("x-") || canonName.startsWith("Mac"))
990              {
991                if (encoding.startsWith("X-"))
992                  {
993                    err(encoding + " is not an IANA-registered encoding. (Charmod C022)");                
994                  }
995                else
996                  {
997                    err(encoding + " is not an IANA-registered encoding and didn\u2019t start with \u201CX-\u201D. (Charmod C023)");
998                  }
999              }
1000            else if (!canonName.equalsIgnoreCase(encoding))
1001              {
1002                err(encoding
1003                    + " is not the preferred name of the character encoding in use. The preferred name is "
1004                    + canonName + ". (Charmod C024)");
1005              }
1006            if (!("UTF-8".equals(encoding) || "UTF-16".equals(encoding)
1007                  || "UTF-16BE".equals(encoding) || "UTF-16LE".equals(encoding)
1008                  || "ISO-8859-1".equals(encoding) || "US-ASCII".equals(encoding)))
1009              {
1010                handler.warn(
1011                    "XML processors are required to support the UTF-8 and UTF-16 character encodings. The encoding was " 
1012                    + actualName + " instead, which is an incompatibility risk.");
1013              }
1014            CharsetDecoder decoder = cs.newDecoder();
1015            decoder.onMalformedInput(CodingErrorAction.REPORT);
1016            decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
1017            this.reader = new InputStreamReader(stream, decoder);
1018          } 
1019        catch(IllegalCharsetNameException e) 
1020          {
1021            fatal("Illegal character encoding name: "+ encoding);
1022          } 
1023        catch (UnsupportedCharsetException e) 
1024          {
1025            handler.fatal("Unsupported character encoding: "+ encoding);
1026          }
1027      }
1028      
1029      /**
1030       * Consume any run of comments, processing instructions and whitespace
1031       * found outside the document element and DOCTYPE declaration.
1032       * <pre>
1033       * [27] Misc ::= Comment | PI | S
1034       * </pre>
1035       */
1036      private void parseMisc()
1037        throws Exception
1038      {
1039        for (;;)
1040          {
1041            skipWhitespace();
1042            if (tryRead(startDelimPI))
1043              {
1044                parsePI();
1045                continue;
1046              }
1047            if (tryRead(startDelimComment))
1048              {
1049                parseComment();
1050                continue;
1051              }
1052            // neither a PI nor a comment follows: nothing more to consume here
1053            return;
1054          }
1055      }
1056    
1057      /**
1058       * Parse a document type declaration: root name, external IDs, the internal subset, then the external subset (declared, or supplied by the handler).
1059       * <pre>
1060       * [28] doctypedecl ::= '&lt;!DOCTYPE' S Name (S ExternalID)? S?
1061       *    ('[' (markupdecl | PEReference | S)* ']' S?)? '&gt;'
1062       * </pre>
1063       * <p> (The <code>&lt;!DOCTYPE</code> has already been read.)
1064       */
1065      private void parseDoctypedecl()
1066        throws Exception
1067      {
1068        String rootName;
1069        ExternalIdentifiers ids;
1070    
1071        // Read the document type name.
1072        requireWhitespace();
1073        rootName = readNmtoken(true);
1074    
1075        // Read the External subset's IDs
1076        skipWhitespace();
1077        ids = readExternalIds(false, true);
1078    
1079        // report (a) declaration of name, (b) lexical info (ids)
1080        handler.doctypeDecl(rootName, ids.publicId, ids.systemId);
1081        
1082        // Internal subset is parsed first, if present
1083        skipWhitespace();
1084        if (tryRead('['))
1085          {
1086            
1087            // loop until the subset ends
1088            while (true)
1089              {
1090                doReport = expandPE = true; // both flags are raised only while skipping whitespace between declarations
1091                skipWhitespace();
1092                doReport = expandPE = false;
1093                if (tryRead(']'))
1094                  {
1095                    break;     // end of subset
1096                  }
1097                else
1098                  {
1099                    // WFC, PEs in internal subset (only between decls)
1100                    peIsError = expandPE = true;
1101                    parseMarkupdecl();
1102                    peIsError = expandPE = false;
1103                  }
1104              }
1105          }
1106        skipWhitespace();
1107        require('>');
1108        
1109        // Read the external subset, if any
1110        InputSource subset;
1111        
1112        if (ids.systemId == null) // no system ID declared: ask the handler for a subset
1113          {
1114            subset = handler.getExternalSubset(rootName,
1115                                               handler.getSystemId());
1116          }
1117        else
1118          {
1119            subset = null;
1120          }
1121        if (ids.systemId != null || subset != null)
1122          {
1123            pushString(null, ">"); // sentinel: the loop below stops when it reads this '>' back
1124          
1125            // NOTE:  [dtd] is so we say what SAX2 expects,
1126            // though it's misleading (subset, not entire dtd)
1127            if (ids.systemId != null)
1128              {
1129                pushURL(true, "[dtd]", ids, null, null, null, true);
1130              }
1131            else
1132              {
1133                handler.warn("modifying document by adding external subset");
1134                pushURL(true, "[dtd]",
1135                        new ExternalIdentifiers(subset.getPublicId(),
1136                                                subset.getSystemId(),
1137                                                null),
1138                        subset.getCharacterStream(),
1139                        subset.getByteStream(),
1140                        subset.getEncoding(),
1141                        false);
1142              }
1143            
1144            // Loop until we end up back at '>'
1145            while (true)
1146              {
1147                doReport = expandPE = true;
1148                skipWhitespace();
1149                doReport = expandPE = false;
1150                if (tryRead('>'))
1151                  {
1152                    break;
1153                  }
1154                else
1155                  {
1156                    expandPE = true; // unlike the internal subset, peIsError is not raised here
1157                    parseMarkupdecl();
1158                    expandPE = false;
1159                  }
1160              }
1161            
1162            // the ">" string isn't popped yet
1163            if (inputStack.size() != 1)
1164              {
1165                fatal("external subset has unmatched '>'");
1166              }
1167          }
1168        
1169        // done dtd
1170        handler.endDoctype();
1171        expandPE = false; // leave PE expansion off ...
1172        doReport = true; // ... and reporting on
1173      }
1174      
1175      /**
1176       * Parse a markup declaration in the internal or external DTD subset.
1177       * <pre>
1178       * [29] markupdecl ::= elementdecl | Attlistdecl | EntityDecl
1179       *    | NotationDecl | PI | Comment
1180       * [30] extSubsetDecl ::= (markupdecl | conditionalSect
1181       *    | PEReference | S) *
1182       * </pre>
1183       * <p> Reading toplevel PE references is handled as a lexical issue
1184       * by the caller, as is whitespace.  A validity error is reported if the declaration ends in a different read buffer than it started in.
1185       */
1186      private void parseMarkupdecl()
1187        throws Exception
1188      {
1189        char[] saved = null; // read buffer in effect right after the keyword; compared at the end
1190        boolean savedPE = expandPE; // caller's setting, restored once the keyword has been matched
1191    
1192        // prevent "<%foo;" and ensures saved entity is right
1193        require('<');
1194        unread('<');
1195        expandPE = false;
1196        
1197        if (tryRead("<!ELEMENT"))
1198          {
1199            saved = readBuffer;
1200            expandPE = savedPE;
1201            parseElementDecl();
1202          }
1203        else if (tryRead("<!ATTLIST"))
1204          {
1205            saved = readBuffer;
1206            expandPE = savedPE;
1207            parseAttlistDecl();
1208          }
1209        else if (tryRead("<!ENTITY"))
1210          {
1211            saved = readBuffer;
1212            expandPE = savedPE;
1213            parseEntityDecl();
1214          }
1215        else if (tryRead("<!NOTATION"))
1216          {
1217            saved = readBuffer;
1218            expandPE = savedPE;
1219            parseNotationDecl();
1220          }
1221        else if (tryRead(startDelimPI))
1222          {
1223            saved = readBuffer;
1224            expandPE = savedPE;
1225            parsePI();
1226          }
1227        else if (tryRead(startDelimComment))
1228          {
1229            saved = readBuffer;
1230            expandPE = savedPE;
1231            parseComment();
1232          }
1233        else if (tryRead("<!["))
1234          {
1235            saved = readBuffer;
1236            expandPE = savedPE;
1237            if (inputStack.size() > 0) // a non-empty input stack is treated as "not in the internal subset"
1238              {
1239                parseConditionalSect(saved);
1240              }
1241            else
1242              {
1243                fatal("conditional sections illegal in internal subset");
1244              }
1245          }
1246        else
1247          {
1248            fatal("expected markup declaration");
1249          }
1250    
1251        // VC: Proper Decl/PE Nesting
1252        if (readBuffer != saved)
1253          {
1254            handler.verror("Illegal Declaration/PE nesting");
1255          }
1256      }
1257      
1258      /**
1259       * Parse an element, with its tags.
1260       * <pre>
1261       * [39] element ::= EmptyElementTag | STag content ETag
1262       * [40] STag ::= '&lt;' Name (S Attribute)* S? '&gt;'
1263       * [44] EmptyElementTag ::= '&lt;' Name (S Attribute)* S? '/&gt;'
1264       * </pre>
1265       * <p> (The '&lt;' has already been read.)
1266       * <p>NOTE: this method chains onto parseContent () if necessary, which in turn calls parseETag ().
1267       * @param maybeGetSubset if true, the handler is asked for an external subset before the element is processed
1268       */
1269      private void parseElement(boolean maybeGetSubset)
1270        throws Exception
1271      {
1272        String gi;
1273        char c;
1274        int oldElementContent = currentElementContent;
1275        String oldElement = currentElement;
1276        ElementDecl element;
1277    
1278        // This is the (global) counter for the
1279        // array of specified attributes.
1280        tagAttributePos = 0;
1281        
1282        // Read the element type name.
1283        gi = readNmtoken(true);
1284        
1285        // If we saw no DTD, and this is the document root element,
1286        // let the application modify the input stream by providing one.
1287        if (maybeGetSubset)
1288          {
1289            InputSource subset = handler.getExternalSubset(gi,
1290                                                           handler.getSystemId());
1291            if (subset != null)
1292              {
1293                String publicId = subset.getPublicId();
1294                String systemId = subset.getSystemId();
1295                
1296                handler.warn("modifying document by adding DTD");
1297                handler.doctypeDecl(gi, publicId, systemId);
1298                pushString(null, ">");
1299                
1300                // NOTE:  [dtd] is so we say what SAX2 expects,
1301                // though it's misleading (subset, not entire dtd)
1302                pushURL(true, "[dtd]",
1303                        new ExternalIdentifiers(publicId, systemId, null),
1304                        subset.getCharacterStream(),
1305                        subset.getByteStream(),
1306                        subset.getEncoding(),
1307                        false);
1308                
1309                // Loop until we end up back at '>'
1310                while (true)
1311                  {
1312                    doReport = expandPE = true;
1313                    skipWhitespace();
1314                    doReport = expandPE = false;
1315                    if (tryRead('>'))
1316                      {
1317                        break;
1318                      }
1319                    else
1320                      {
1321                        expandPE = true;
1322                        parseMarkupdecl();
1323                        expandPE = false;
1324                      }
1325                  }
1326                
1327                // the ">" string isn't popped yet
1328                if (inputStack.size() != 1)
1329                  {
1330                    fatal("external subset has unmatched '>'");
1331                  }
1332                
1333                handler.endDoctype();
1334              }
1335          }
1336        
1337        // Determine the current content type.
1338        currentElement = gi;
1339        element = elementInfo.get(gi);
1340        currentElementContent = getContentType(element, CONTENT_ANY);
1341    
1342        // Read the attributes, if any.
1343        // After this loop, "c" is the closing delimiter.
1344        boolean white = tryWhitespace();
1345        c = readCh();
1346        while (c != '/' && c != '>')
1347          {
1348            unread(c);
1349            if (!white)
1350              {
1351                fatal("need whitespace between attributes");
1352              }
1353            parseAttribute(gi);
1354            white = tryWhitespace();
1355            c = readCh();
1356          }
1357        
1358        // Supply any defaulted attributes.
1359        Iterator<String> atts = declaredAttributes(element);
1360        if (atts != null)
1361          {
1362            String aname;
1363    loop:
1364            while (atts.hasNext())
1365              {
1366                aname = atts.next();
1367                // See if it was specified (compared by value, like the duplicate check in parseAttribute).
1368                for (int i = 0; i < tagAttributePos; i++)
1369                  {
1370                    if (tagAttributes[i].equals(aname))
1371                      {
1372                        continue loop;
1373                      }
1374                  }
1375                // ... or has a default
1376                String value = getAttributeDefaultValue(gi, aname);
1377                
1378                if (value == null)
1379                  {
1380                    continue;
1381                  }
1382                handler.attribute(aname, value, false);
1383              }
1384          }
1385    
1386        // Figure out if this is a start tag
1387        // or an empty element, and dispatch an
1388        // event accordingly.
1389        switch (c)
1390          {
1391          case '>':
1392            handler.startElement(gi);
1393            parseContent();
1394            break;
1395          case '/':
1396            require('>');
1397            handler.startElement(gi);
1398            handler.endElement(gi);
1399            break;
1400          }
1401    
1402        // Restore the previous state.
1403        currentElement = oldElement;
1404        currentElementContent = oldElementContent;
1405      }
1406        
1407      /**
1408       * Parse an attribute assignment, report it to the handler and record its name in tagAttributes.
1409       * <pre>
1410       * [41] Attribute ::= Name Eq AttValue
1411       * </pre>
1412       * @param name The name of the attribute's element.
1413       * @see SAXDriver#attribute
1414       */
1415      private void parseAttribute(String name)
1416        throws Exception
1417      {
1418        String aname;
1419        String type;
1420        String value;
1421        int flags = LIT_ATTRIBUTE |  LIT_ENTITY_REF;
1422        
1423        // Read the attribute name.
1424        aname = readNmtoken(true);
1425        type = getAttributeType(name, aname);
1426        
1427        // Parse '='
1428        parseEq();
1429    
1430        // Read the value, normalizing whitespace
1431        // unless it is CDATA.
1432        if (handler.stringInterning)
1433          {
1434            if (type == "CDATA" || type == null)
1435              {
1436                value = readLiteral(flags);
1437              }
1438            else
1439              {
1440                value = readLiteral(flags | LIT_NORMALIZE);
1441              }
1442          }
1443        else
1444          {
1445            if (type == null || type.equals("CDATA")) // test for null first: the type may be unknown
1446              {
1447                value = readLiteral(flags);
1448              }
1449            else
1450              {
1451                value = readLiteral(flags | LIT_NORMALIZE);
1452              }
1453          }
1454    
1455        // WFC: no duplicate attributes
1456        for (int i = 0; i < tagAttributePos; i++)
1457          {
1458            if (aname.equals(tagAttributes [i]))
1459              {
1460                fatal("duplicate attribute", aname, null);
1461              }
1462          }
1463    
1464        // Inform the handler about the
1465        // attribute.
1466        handler.attribute(aname, value, true);
1467        dataBufferPos = 0;
1468        
1469        // Note that the attribute has been
1470        // specified.
1471        if (tagAttributePos == tagAttributes.length)
1472          {
1473            String newAttrib[] = new String[tagAttributes.length * 2];
1474            System.arraycopy(tagAttributes, 0, newAttrib, 0, tagAttributePos);
1475            tagAttributes = newAttrib;
1476          }
1477        tagAttributes[tagAttributePos++] = aname;
1478      }
1479    
1480      /**
1481       * Parse an equals sign surrounded by optional whitespace; a missing '=' is handled by require().
1482       * <pre>
1483       * [25] Eq ::= S? '=' S?
1484       * </pre>
1485       */
1486      private void parseEq()
1487        throws SAXException, IOException
1488      {
1489        skipWhitespace(); // optional S before '='
1490        require('=');
1491        skipWhitespace(); // optional S after '='
1492      }
1493    
1494      /**
1495       * Parse an end tag and report the end of the current element to the handler.
1496       * <pre>
1497       * [42] ETag ::= '&lt;/' Name S? '&gt;'
1498       * </pre>
1499       * <p>NOTE: parseContent () chains to here, we already read the
1500       * "&lt;/".
1501       */
1502      private void parseETag()
1503        throws Exception
1504      {
1505        require(currentElement); // the end tag must spell the name of the element currently open
1506        skipWhitespace();
1507        require('>');
1508        handler.endElement(currentElement);
1509        // not re-reporting any SAXException re bogus end tags,
1510        // even though that diagnostic might be clearer ...
1511      }
1512      
1513      /**
1514       * Parse the content of an element, up to and including its end tag.
1515       * <pre>
1516       * [43] content ::= (element | CharData | Reference
1517       *    | CDSect | PI | Comment)*
1518       * [67] Reference ::= EntityRef | CharRef
1519       * </pre>
1520       * <p> NOTE: consumes ETag.
1521       */
1522      private void parseContent()
1523        throws Exception
1524      {
1525        char c;
1526        
1527        while (true)
1528          {
1529            // consume characters (or ignorable whitespace) until delimiter
1530            parseCharData();
1531    
1532            // Handle delimiters
1533            c = readCh();
1534            switch (c)
1535              {
1536              case '&':       // Found "&"
1537                c = readCh();
1538                if (c == '#')
1539                  {
1540                    parseCharRef();
1541                  }
1542                else
1543                  {
1544                    unread(c);
1545                    parseEntityRef(true);
1546                  }
1547                isDirtyCurrentElement = true; // set only after a reference; every markup branch below clears it
1548                break;
1549                
1550              case '<':       // Found "<"
1551                dataBufferFlush();
1552                c = readCh();
1553                switch (c)
1554                  {
1555                  case '!':       // Found "<!"
1556                    c = readCh();
1557                    switch (c)
1558                      {
1559                      case '-':     // Found "<!-"
1560                        require('-');
1561                        isDirtyCurrentElement = false;
1562                        parseComment();
1563                        break;
1564                      case '[':     // Found "<!["
1565                        isDirtyCurrentElement = false;
1566                        require("CDATA[");
1567                        handler.startCDATA();
1568                        inCDATA = true;
1569                        parseCDSect();
1570                        inCDATA = false;
1571                        handler.endCDATA();
1572                        break;
1573                      default:
1574                        fatal("expected comment or CDATA section", c, null);
1575                        break;
1576                      }
1577                    break;
1578                  
1579                  case '?':     // Found "<?"
1580                    isDirtyCurrentElement = false;
1581                    parsePI();
1582                    break;
1583                    
1584                  case '/':     // Found "</"
1585                    isDirtyCurrentElement = false;
1586                    parseETag();
1587                    return; // the end tag is the only way out of this loop
1588                    
1589                  default:     // Found "<" followed by something else
1590                    isDirtyCurrentElement = false;
1591                    unread(c);
1592                    parseElement(false);
1593                    break;
1594                  }
1595              }
1596          }
1597      }
1598      
1599      /**
1600       * Parse an element type declaration.
1601       * <pre>
1602       * [45] elementdecl ::= '&lt;!ELEMENT' S Name S contentspec S? '&gt;'
1603       * </pre>
1604       * <p> NOTE: the '&lt;!ELEMENT' has already been read.
1605       */
1606      private void parseElementDecl()
1607        throws Exception
1608      {
1609        String name;
1610        
1611        requireWhitespace();
1612        // Read the element type name.
1613        name = readNmtoken(true);
1614    
1615        requireWhitespace();
1616        // Read the content model.
1617        parseContentspec(name);
1618        
1619        skipWhitespace();
1620        require('>');
1621      }
1622    
1623      /**
1624       * Content specification.
1625       * <pre>
1626       * [46] contentspec ::= 'EMPTY' | 'ANY' | Mixed | elements
1627       * </pre>
1628       */
1629      private void parseContentspec(String name)
1630        throws Exception
1631      {
1632        // FIXME: move elementDecl() into setElement(), pass EMTPY/ANY ...
1633        if (tryRead("EMPTY"))
1634          {
1635            setElement(name, CONTENT_EMPTY, null, null);
1636            if (!skippedPE)
1637              {
1638                handler.getDeclHandler().elementDecl(name, "EMPTY");
1639              }
1640            return;
1641          }
1642        else if (tryRead("ANY"))
1643          {
1644            setElement(name, CONTENT_ANY, null, null);
1645            if (!skippedPE)
1646              {
1647                handler.getDeclHandler().elementDecl(name, "ANY");
1648              }
1649            return;
1650          }
1651        else
1652          {
1653            String model;
1654            char[] saved; 
1655            
1656            require('(');
1657            saved = readBuffer;
1658            dataBufferAppend('(');
1659            skipWhitespace();
1660            if (tryRead("#PCDATA"))
1661              {
1662                dataBufferAppend("#PCDATA");
1663                parseMixed(saved);
1664                model = dataBufferToString();
1665                setElement(name, CONTENT_MIXED, model, null);
1666              }
1667            else
1668              {
1669                parseElements(saved);
1670                model = dataBufferToString();
1671                setElement(name, CONTENT_ELEMENTS, model, null);
1672              }
1673            if (!skippedPE)
1674              {
1675                handler.getDeclHandler().elementDecl(name, model);
1676              }
1677          }
1678      }
1679      
1680      /**
1681       * Parse an element-content model.
1682       * <pre>
1683       * [47] elements ::= (choice | seq) ('?' | '*' | '+')?
1684       * [49] choice ::= '(' S? cp (S? '|' S? cp)+ S? ')'
1685       * [50] seq ::= '(' S? cp (S? ',' S? cp)* S? ')'
1686       * </pre>
1687       *
1688       * <p> NOTE: the opening '(' and S have already been read.
1689       *
1690       * @param saved Buffer for entity that should have the terminal ')'
1691       */
1692      private void parseElements(char[] saved)
1693        throws Exception
1694      {
1695        char c;
1696        char sep;
1697        
1698        // Parse the first content particle
1699        skipWhitespace();
1700        parseCp();
1701        
1702        // Check for end or for a separator.
1703        skipWhitespace();
1704        c = readCh();
1705        switch (c)
1706          {
1707          case ')':
1708            // VC: Proper Group/PE Nesting
1709            if (readBuffer != saved)
1710              {
1711                handler.verror("Illegal Group/PE nesting");
1712              }
1713            
1714            dataBufferAppend(')');
1715            c = readCh();
1716            switch (c)
1717              {
1718              case '*':
1719              case '+':
1720              case '?':
1721                dataBufferAppend(c);
1722                break;
1723              default:
1724                unread(c);
1725              }
1726            return;
1727          case ',':       // Register the separator.
1728          case '|':
1729            sep = c;
1730            dataBufferAppend(c);
1731            break;
1732          default:
1733            fatal("bad separator in content model", c, null);
1734            return;
1735          }
1736        
1737        // Parse the rest of the content model.
1738        while (true)
1739          {
1740            skipWhitespace();
1741            parseCp();
1742            skipWhitespace();
1743            c = readCh();
1744            if (c == ')')
1745              {
1746                // VC: Proper Group/PE Nesting
1747                if (readBuffer != saved)
1748                  {
1749                    handler.verror("Illegal Group/PE nesting");
1750                  }
1751                
1752                dataBufferAppend(')');
1753                break;
1754              }
1755            else if (c != sep)
1756              {
1757                fatal("bad separator in content model", c, null);
1758                return;
1759              }
1760            else
1761              {
1762                dataBufferAppend(c);
1763              }
1764          }
1765        
1766        // Check for the occurrence indicator.
1767        c = readCh();
1768        switch (c)
1769          {
1770          case '?':
1771          case '*':
1772          case '+':
1773            dataBufferAppend(c);
1774            return;
1775          default:
1776            unread(c);
1777            return;
1778          }
1779      }
1780       
1781      /**
1782       * Parse a content particle.
1783       * <pre>
1784       * [48] cp ::= (Name | choice | seq) ('?' | '*' | '+')?
1785       * </pre>
1786       */
1787      private void parseCp()
1788        throws Exception
1789      {
1790        if (tryRead('('))
1791          {
1792            dataBufferAppend('(');
1793            parseElements(readBuffer);
1794          }
1795        else
1796          {
1797            dataBufferAppend(readNmtoken(true));
1798            char c = readCh();
1799            switch (c)
1800              {
1801              case '?':
1802              case '*':
1803              case '+':
1804                dataBufferAppend(c);
1805                break;
1806              default:
1807                unread(c);
1808                break;
1809              }
1810          }
1811      }
1812    
1813      /**
1814       * Parse mixed content.
1815       * <pre>
1816       * [51] Mixed ::= '(' S? ( '#PCDATA' (S? '|' S? Name)*) S? ')*'
1817       *        | '(' S? ('#PCDATA') S? ')'
1818       * </pre>
1819       *
1820       * @param saved Buffer for entity that should have the terminal ')'
1821       */
1822      private void parseMixed(char[] saved)
1823        throws Exception
1824      {
1825        // Check for PCDATA alone.
1826        skipWhitespace();
1827        if (tryRead(')'))
1828          {
1829            // VC: Proper Group/PE Nesting
1830            if (readBuffer != saved)
1831              {
1832                handler.verror("Illegal Group/PE nesting");
1833              }
1834            
1835            dataBufferAppend(")*");
1836            tryRead('*');
1837            return;
1838          }
1839        
1840        // Parse mixed content.
1841        skipWhitespace();
1842        while (!tryRead(")"))
1843          {
1844            require('|');
1845            dataBufferAppend('|');
1846            skipWhitespace();
1847            dataBufferAppend(readNmtoken(true));
1848            skipWhitespace();
1849          }
1850        
1851        // VC: Proper Group/PE Nesting
1852        if (readBuffer != saved)
1853          {
1854            handler.verror("Illegal Group/PE nesting");
1855          }
1856        
1857        require('*');
1858        dataBufferAppend(")*");
1859      }
1860      
1861      /**
1862       * Parse an attribute list declaration.
1863       * <pre>
1864       * [52] AttlistDecl ::= '&lt;!ATTLIST' S Name AttDef* S? '&gt;'
1865       * </pre>
1866       * <p>NOTE: the '&lt;!ATTLIST' has already been read.
1867       */
1868      private void parseAttlistDecl()
1869        throws Exception
1870      {
1871        String elementName;
1872        
1873        requireWhitespace();
1874        elementName = readNmtoken(true);
1875        boolean white = tryWhitespace();
1876        while (!tryRead('>'))
1877          {
1878            if (!white)
1879              {
1880                fatal("whitespace required before attribute definition");
1881              }
1882            parseAttDef(elementName);
1883            white = tryWhitespace();
1884          }
1885      }
1886      
1887      /**
1888       * Parse a single attribute definition.
1889       * <pre>
1890       * [53] AttDef ::= S Name S AttType S DefaultDecl
1891       * </pre>
1892       */
1893      private void parseAttDef(String elementName)
1894        throws Exception
1895      {
1896        String name;
1897        String type;
1898        String enumer = null;
1899        
1900        // Read the attribute name.
1901        name = readNmtoken(true);
1902    
1903        // Read the attribute type.
1904        requireWhitespace();
1905        type = readAttType();
1906    
1907        // Get the string of enumerated values if necessary.
1908        if (handler.stringInterning)
1909          {
1910            if ("ENUMERATION" == type || "NOTATION" == type)
1911              {
1912                enumer = dataBufferToString();
1913              }
1914          }
1915        else
1916          {
1917            if ("ENUMERATION".equals(type) || "NOTATION".equals(type))
1918              {
1919                enumer = dataBufferToString();
1920              }
1921          }
1922        
1923        // Read the default value.
1924        requireWhitespace();
1925        parseDefault(elementName, name, type, enumer);
1926      }
1927    
1928      /**
1929       * Parse the attribute type.
1930       * <pre>
1931       * [54] AttType ::= StringType | TokenizedType | EnumeratedType
1932       * [55] StringType ::= 'CDATA'
1933       * [56] TokenizedType ::= 'ID' | 'IDREF' | 'IDREFS' | 'ENTITY'
1934       *    | 'ENTITIES' | 'NMTOKEN' | 'NMTOKENS'
1935       * [57] EnumeratedType ::= NotationType | Enumeration
1936       * </pre>
1937       */
1938      private String readAttType()
1939        throws Exception
1940      {
1941        if (tryRead('('))
1942          {
1943            parseEnumeration(false);
1944            return "ENUMERATION";
1945          }
1946        else
1947          {
1948            String typeString = readNmtoken(true);
1949            if (handler.stringInterning)
1950              {
1951                if ("NOTATION" == typeString)
1952                  {
1953                    parseNotationType();
1954                    return typeString;
1955                  }
1956                else if ("CDATA" == typeString
1957                         || "ID" == typeString
1958                         || "IDREF" == typeString
1959                         || "IDREFS" == typeString
1960                         || "ENTITY" == typeString
1961                         || "ENTITIES" == typeString
1962                         || "NMTOKEN" == typeString
1963                         || "NMTOKENS" == typeString)
1964                  {
1965                    return typeString;
1966                  }
1967              }
1968            else
1969              {
1970                if ("NOTATION".equals(typeString))
1971                  {
1972                    parseNotationType();
1973                    return typeString;
1974                  }
1975                else if ("CDATA".equals(typeString)
1976                         || "ID".equals(typeString)
1977                         || "IDREF".equals(typeString)
1978                         || "IDREFS".equals(typeString)
1979                         || "ENTITY".equals(typeString)
1980                         || "ENTITIES".equals(typeString)
1981                         || "NMTOKEN".equals(typeString)
1982                         || "NMTOKENS".equals(typeString))
1983                  {
1984                    return typeString;
1985                  }
1986              }
1987            fatal("illegal attribute type", typeString, null);
1988            return null;
1989          }
1990      }
1991      
1992      /**
1993       * Parse an enumeration.
1994       * <pre>
1995       * [59] Enumeration ::= '(' S? Nmtoken (S? '|' S? Nmtoken)* S? ')'
1996       * </pre>
1997       * <p>NOTE: the '(' has already been read.
1998       */
1999      private void parseEnumeration(boolean isNames)
2000        throws Exception
2001      {
2002        dataBufferAppend('(');
2003    
2004        // Read the first token.
2005        skipWhitespace();
2006        dataBufferAppend(readNmtoken(isNames));
2007        // Read the remaining tokens.
2008        skipWhitespace();
2009        while (!tryRead(')'))
2010          {
2011            require('|');
2012            dataBufferAppend('|');
2013            skipWhitespace();
2014            dataBufferAppend(readNmtoken (isNames));
2015            skipWhitespace();
2016          }
2017        dataBufferAppend(')');
2018      }
2019    
2020      /**
2021       * Parse a notation type for an attribute.
2022       * <pre>
2023       * [58] NotationType ::= 'NOTATION' S '(' S? NameNtoks
2024       *    (S? '|' S? name)* S? ')'
2025       * </pre>
2026       * <p>NOTE: the 'NOTATION' has already been read
2027       */
2028      private void parseNotationType()
2029        throws Exception
2030      {
2031        requireWhitespace();
2032        require('(');
2033        
2034        parseEnumeration(true);
2035      }
2036    
2037      /**
2038       * Parse the default value for an attribute.
2039       * <pre>
2040       * [60] DefaultDecl ::= '#REQUIRED' | '#IMPLIED'
2041       *    | (('#FIXED' S)? AttValue)
2042       * </pre>
2043       */
2044      private void parseDefault(String elementName, String name,
2045                                String type, String enumer)
2046        throws Exception
2047      {
2048        int valueType = ATTRIBUTE_DEFAULT_SPECIFIED;
2049        String value = null;
2050        int flags = LIT_ATTRIBUTE;
2051        boolean saved = expandPE;
2052        String defaultType = null;
2053        
2054        // LIT_ATTRIBUTE forces '<' checks now (ASAP) and turns whitespace
2055        // chars to spaces (doesn't matter when that's done if it doesn't
2056        // interfere with char refs expanding to whitespace).
2057        
2058        if (!skippedPE)
2059          {
2060            flags |= LIT_ENTITY_REF;
2061            if (handler.stringInterning)
2062              {
2063                if ("CDATA" != type)
2064                  {
2065                    flags |= LIT_NORMALIZE;
2066                  }
2067              }
2068            else
2069              {
2070                if (!"CDATA".equals(type))
2071                  {
2072                    flags |= LIT_NORMALIZE;
2073                  }
2074              }
2075          }
2076        
2077        expandPE = false;
2078        if (tryRead('#'))
2079          {
2080            if (tryRead("FIXED"))
2081              {
2082                defaultType = "#FIXED";
2083                valueType = ATTRIBUTE_DEFAULT_FIXED;
2084                requireWhitespace();
2085                value = readLiteral(flags);
2086              }
2087            else if (tryRead("REQUIRED"))
2088              {
2089                defaultType = "#REQUIRED";
2090                valueType = ATTRIBUTE_DEFAULT_REQUIRED;
2091              }
2092            else if (tryRead("IMPLIED"))
2093              {
2094                defaultType = "#IMPLIED";
2095                valueType = ATTRIBUTE_DEFAULT_IMPLIED;
2096              }
2097            else
2098              {
2099                fatal("illegal keyword for attribute default value");
2100              }
2101          }
2102        else
2103          {
2104            value = readLiteral(flags);
2105          }
2106        expandPE = saved;
2107        setAttribute(elementName, name, type, enumer, value, valueType);
2108        if (handler.stringInterning)
2109          {
2110            if ("ENUMERATION" == type)
2111              {
2112                type = enumer;
2113              }
2114            else if ("NOTATION" == type)
2115              {
2116                type = "NOTATION " + enumer;
2117              }
2118          }
2119        else
2120          {
2121            if ("ENUMERATION".equals(type))
2122              {
2123                type = enumer;
2124              }
2125            else if ("NOTATION".equals(type))
2126              {
2127                type = "NOTATION " + enumer;
2128              }
2129          }
2130        if (!skippedPE)
2131          {
2132            handler.getDeclHandler().attributeDecl(elementName, name, type,
2133                                                   defaultType, value);
2134          }
2135      }
2136      
2137      /**
2138       * Parse a conditional section.
2139       * <pre>
2140       * [61] conditionalSect ::= includeSect || ignoreSect
2141       * [62] includeSect ::= '&lt;![' S? 'INCLUDE' S? '['
2142       *    extSubsetDecl ']]&gt;'
2143       * [63] ignoreSect ::= '&lt;![' S? 'IGNORE' S? '['
2144       *    ignoreSectContents* ']]&gt;'
2145       * [64] ignoreSectContents ::= Ignore
2146       *    ('&lt;![' ignoreSectContents* ']]&gt;' Ignore )*
2147       * [65] Ignore ::= Char* - (Char* ( '&lt;![' | ']]&gt;') Char* )
2148       * </pre>
2149       * <p> NOTE: the '&gt;![' has already been read.
2150       */
2151      private void parseConditionalSect(char[] saved)
2152        throws Exception
2153      {
2154        skipWhitespace();
2155        if (tryRead("INCLUDE"))
2156          {
2157            skipWhitespace();
2158            require('[');
2159            // VC: Proper Conditional Section/PE Nesting
2160            if (readBuffer != saved)
2161              {
2162                handler.verror("Illegal Conditional Section/PE nesting");
2163              }
2164            skipWhitespace();
2165            while (!tryRead("]]>"))
2166              {
2167                parseMarkupdecl();
2168                skipWhitespace();
2169              }
2170          }
2171        else if (tryRead("IGNORE"))
2172          {
2173            skipWhitespace();
2174            require('[');
2175            // VC: Proper Conditional Section/PE Nesting
2176            if (readBuffer != saved)
2177              {
2178                handler.verror("Illegal Conditional Section/PE nesting");
2179              }
2180            char c;
2181            expandPE = false;
2182            for (int nest = 1; nest > 0; )
2183              {
2184                c = readCh();
2185                switch (c)
2186                  {
2187                  case '<':
2188                    if (tryRead("!["))
2189                      {
2190                        nest++;
2191                      }
2192                  case ']':
2193                    if (tryRead("]>"))
2194                      {
2195                        nest--;
2196                      }
2197                  }
2198              }
2199            expandPE = true;
2200          }
2201        else
2202          {
2203            fatal("conditional section must begin with INCLUDE or IGNORE");
2204          }
2205      }
2206      
2207      private void parseCharRef()
2208        throws SAXException, IOException
2209      {
2210        parseCharRef(true /* do flushDataBuffer by default */);
2211      }
2212    
2213      /**
2214       * Try to read a character reference without consuming data from buffer.
2215       * <pre>
2216       * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
2217       * </pre>
2218       * <p>NOTE: the '&#' has already been read.
2219       */
2220      private void tryReadCharRef()
2221        throws SAXException, IOException
2222      {
2223        int value = 0;
2224        char c;
2225        
2226        if (tryRead('x'))
2227          {
2228    loop1:
2229            while (true)
2230              {
2231                c = readCh();
2232                if (c == ';')
2233                  {
2234                    break loop1;
2235                  }
2236                else
2237                  {
2238                    int n = Character.digit(c, 16);
2239                    if (n == -1)
2240                      {
2241                        fatal("illegal character in character reference", c, null);
2242                        break loop1;
2243                      }
2244                    value *= 16;
2245                    value += n;
2246                  }
2247              }
2248          }
2249        else
2250          {
2251    loop2:
2252            while (true)
2253              {
2254                c = readCh();
2255                if (c == ';')
2256                  {
2257                    break loop2;
2258                  }
2259                else
2260                  {
2261                    int n = Character.digit(c, 10);
2262                    if (n == -1)
2263                      {
2264                        fatal("illegal character in character reference", c, null);
2265                        break loop2;
2266                      }
2267                    value *= 10;
2268                    value += n;
2269                  }
2270              }
2271          }
2272        
2273        // check for character refs being legal XML
2274        if ((value < 0x0020
2275             && ! (value == '\n' || value == '\t' || value == '\r'))
2276            || (value >= 0xD800 && value <= 0xDFFF)
2277            || value == 0xFFFE || value == 0xFFFF
2278            || value > 0x0010ffff)
2279          {
2280            fatal("illegal XML character reference U+"
2281                  + Integer.toHexString(value));
2282          }
2283        else if (value >= 0x007F && value <= 0x009F) // 2006-11-13 hsivonen
2284          {
2285            handler.warn("Character reference expands to a control character: U+00" + Integer.toHexString(c) + ".");
2286          }
2287        if (isPrivateUse(value))
2288          {
2289            warnAboutPrivateUseChar();
2290          }
2291        // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
2292        //  (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
2293        if (value > 0x0010ffff)
2294          {
2295            // too big for surrogate
2296            fatal("character reference " + value + " is too large for UTF-16",
2297                  new Integer(value).toString(), null);
2298          }
2299        
2300      }
2301      
2302      /**
2303       * Read and interpret a character reference.
2304       * <pre>
2305       * [66] CharRef ::= '&#' [0-9]+ ';' | '&#x' [0-9a-fA-F]+ ';'
2306       * </pre>
2307       * <p>NOTE: the '&#' has already been read.
2308       */
2309      private void parseCharRef(boolean doFlush)
2310        throws SAXException, IOException
2311      {
2312        int value = 0;
2313        char c;
2314        
2315        if (tryRead('x'))
2316          {
2317    loop1:
2318            while (true)
2319              {
2320                c = readCh();
2321                if (c == ';')
2322                  {
2323                    break loop1;
2324                  }
2325                else
2326                  {
2327                    int n = Character.digit(c, 16);
2328                    if (n == -1)
2329                      {
2330                        fatal("illegal character in character reference", c, null);
2331                        break loop1;
2332                      }
2333                    value *= 16;
2334                    value += n;
2335                  }
2336              }
2337          }
2338        else
2339          {
2340    loop2:
2341            while (true)
2342              {
2343                c = readCh();
2344                if (c == ';')
2345                  {
2346                    break loop2;
2347                  }
2348                else
2349                  {
2350                    int n = Character.digit(c, 10);
2351                    if (n == -1)
2352                      {
2353                        fatal("illegal character in character reference", c, null);
2354                        break loop2;
2355                      }
2356                    value *= 10;
2357                    value += c - '0';
2358                  }
2359              }
2360          }
2361        
2362        // check for character refs being legal XML
2363        if ((value < 0x0020
2364             && ! (value == '\n' || value == '\t' || value == '\r'))
2365            || (value >= 0xD800 && value <= 0xDFFF)
2366            || value == 0xFFFE || value == 0xFFFF
2367            || value > 0x0010ffff)
2368          {
2369            fatal("illegal XML character reference U+"
2370                  + Integer.toHexString(value));
2371          }
2372        else if (value >= 0x007F && value <= 0x009F) // 2006-11-13 hsivonen
2373          {
2374            handler.warn("Character reference expands to a control character: U+00" + Integer.toHexString(c) + ".");
2375          }
2376        if (isPrivateUse(value))
2377          {
2378            warnAboutPrivateUseChar();
2379          }
2380        
2381        // Check for surrogates: 00000000 0000xxxx yyyyyyyy zzzzzzzz
2382        //  (1101|10xx|xxyy|yyyy + 1101|11yy|zzzz|zzzz:
2383        if (value <= 0x0000ffff)
2384          {
2385            // no surrogates needed
2386            dataBufferAppend((char) value);
2387          }
2388        else if (value <= 0x0010ffff)
2389          {
2390            value -= 0x10000;
2391            // > 16 bits, surrogate needed
2392            dataBufferAppend((char) (0xd800 | (value >> 10)));
2393            dataBufferAppend((char) (0xdc00 | (value & 0x0003ff)));
2394          }
2395        else
2396          {
2397            // too big for surrogate
2398            fatal("character reference " + value + " is too large for UTF-16",
2399                  new Integer(value).toString(), null);
2400          }
2401        if (doFlush)
2402          {
2403            dataBufferFlush();
2404          }
2405      }
2406      
  /**
   * Parse and expand a general entity reference.
   * <pre>
   * [68] EntityRef ::= '&' Name ';'
   * </pre>
   * <p>NOTE: the '&amp;' has already been read.
   * <p>What happens next depends on the declared type of the entity:
   * an undeclared entity is a validity error or a fatal error, an
   * internal entity has its value pushed as input and pre-scanned for
   * malformed references, an external text entity is pushed as a URL
   * when allowed, and an unparsed (NDATA) entity is always fatal.
   *
   * @param externalAllowed External entities are allowed here.
   */
  private void parseEntityRef(boolean externalAllowed)
    throws SAXException, IOException
  {
    String name;
    
    name = readNmtoken(true);
    require(';');
    switch (getEntityType(name))
      {
      case ENTITY_UNDECLARED:
        // NOTE:  XML REC describes amazingly convoluted handling for
        // this case.  Nothing as meaningful as being a WFness error
        // unless the processor might _legitimately_ not have seen a
        // declaration ... which is what this implements.
        String message;
        
        message = "reference to undeclared general entity " + name;
        // Only a validity error when a parameter entity was skipped and
        // the document is not standalone; otherwise it is fatal.
        if (skippedPE && !docIsStandalone)
          {
            handler.verror(message);
            // we don't know this entity, and it might be external...
            if (externalAllowed)
              {
                handler.skippedEntity(name);
              }
          }
        else
          {
            fatal(message);
          }
        break;
      case ENTITY_INTERNAL:
          // Make the entity value the current input.
          pushString(name, getEntityValue(name));
          
          //workaround for possible input pop before marking
          //the buffer reading position  
          char t = readCh();
          unread(t);
          // Remember where the pushed text starts so the scan below can
          // be undone.
          int bufferPosMark = readBufferPos;
          
          // Pre-scan the pushed text: every '&' in it must start either a
          // character reference or a name followed by ';'.
          int end = readBufferPos + getEntityValue(name).length();
          for (int k = readBufferPos; k < end; k++)
            {
              t = readCh();
              if (t == '&')
                {
                  t = readCh();   
                  if (t  == '#')
                    { 
                      //try to match a character ref
                      tryReadCharRef();
                
                      //everything has been read
                      if (readBufferPos >= end)
                        {
                          break;
                        }
                      // Resynchronise the loop index with the read position.
                      k = readBufferPos;
                      continue;
                    }
                  // NOTE(review): only letters are accepted as the first
                  // character here; a reference whose name starts with '_'
                  // or ':' reaches the fatal() below -- confirm intended.
                  else if (Character.isLetter(t))
                    {
                      //looks like an entity ref
                      unread(t);
                      readNmtoken(true);
                      require(';');
                      
                      //everything has been read
                      if (readBufferPos >= end)
                        {
                          break;
                        }
                      // Resynchronise the loop index with the read position.
                      k = readBufferPos;
                      continue;
                    }
                  fatal(" malformed entity reference");
                }
              
            }
          // Rewind to the mark; the scan above was for checking only.
          readBufferPos = bufferPosMark;
          break;
      case ENTITY_TEXT:
          // External parsed entity: pushed as a URL only where external
          // entities are allowed.
          if (externalAllowed)
            {
              pushURL(false, name, getEntityIds(name),
                      null, null, null, true);
            }
          else
            {
              fatal("reference to external entity in attribute value.",
                    name, null);
            }
          break;
      case ENTITY_NDATA:
          // Unparsed entity: fatal either way, only the message differs.
          if (externalAllowed)
            {
              fatal("unparsed entity reference in content", name, null);
            }
          else
            {
              fatal("reference to external entity in attribute value.",
                    name, null);
            }
          break;
      default:
          // getEntityType() returned a value not handled above.
          throw new RuntimeException();
      }
  }
2523        
2524      /**
2525       * Parse and expand a parameter entity reference.
2526       * <pre>
2527       * [69] PEReference ::= '%' Name ';'
2528       * </pre>
2529       * <p>NOTE: the '%' has already been read.
2530       */
2531      private void parsePEReference()
2532        throws SAXException, IOException
2533      {
2534        String name;
2535        
2536        name = "%" + readNmtoken(true);
2537        require(';');
2538        switch (getEntityType(name))
2539          {
2540          case ENTITY_UNDECLARED:
2541            // VC: Entity Declared
2542            handler.verror("reference to undeclared parameter entity " + name);
2543            
2544            // we should disable handling of all subsequent declarations
2545            // unless this is a standalone document (info discarded)
2546            break;
2547          case ENTITY_INTERNAL:
2548            if (inLiteral)
2549              {
2550                pushString(name, getEntityValue(name));
2551              }
2552            else
2553              {
2554                pushString(name, ' ' + getEntityValue(name) + ' ');
2555              }
2556            break;
2557          case ENTITY_TEXT:
2558            if (!inLiteral)
2559              {
2560                pushString(null, " ");
2561              }
2562            pushURL(true, name, getEntityIds(name), null, null, null, true);
2563            if (!inLiteral)
2564              {
2565                pushString(null, " ");
2566              }
2567            break;
2568          }
2569      }
2570      
2571      /**
2572       * Parse an entity declaration.
2573       * <pre>
2574       * [70] EntityDecl ::= GEDecl | PEDecl
2575       * [71] GEDecl ::= '&lt;!ENTITY' S Name S EntityDef S? '&gt;'
2576       * [72] PEDecl ::= '&lt;!ENTITY' S '%' S Name S PEDef S? '&gt;'
2577       * [73] EntityDef ::= EntityValue | (ExternalID NDataDecl?)
2578       * [74] PEDef ::= EntityValue | ExternalID
2579       * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
2580       *       | 'PUBLIC' S PubidLiteral S SystemLiteral
2581       * [76] NDataDecl ::= S 'NDATA' S Name
2582       * </pre>
2583       * <p>NOTE: the '&lt;!ENTITY' has already been read.
2584       */
2585      private void parseEntityDecl()
2586        throws Exception
2587      {
2588        boolean peFlag = false;
2589        int flags = 0;
2590        
2591        // Check for a parameter entity.
2592        expandPE = false;
2593        requireWhitespace();
2594        if (tryRead('%'))
2595          {
2596            peFlag = true;
2597            requireWhitespace();
2598          }
2599        expandPE = true;
2600        
2601        // Read the entity name, and prepend
2602        // '%' if necessary.
2603        String name = readNmtoken(true);
2604        //NE08
2605        if (name.indexOf(':') >= 0)
2606          {
2607            fatal("Illegal character(':') in entity name ", name, null);
2608          }
2609        if (peFlag)
2610          {
2611            name = "%" + name;
2612          }
2613    
2614        // Read the entity value.
2615        requireWhitespace();
2616        char c = readCh();
2617        unread (c);
2618        if (c == '"' || c == '\'')
2619          {
2620            // Internal entity ... replacement text has expanded refs
2621            // to characters and PEs, but not to general entities
2622            String value = readLiteral(flags);
2623            setInternalEntity(name, value);
2624          }
2625        else
2626          {
2627            // Read the external IDs
2628            ExternalIdentifiers ids = readExternalIds(false, false);
2629            
2630            // Check for NDATA declaration.
2631            boolean white = tryWhitespace();
2632            if (!peFlag && tryRead("NDATA"))
2633              {
2634                if (!white)
2635                  {
2636                    fatal("whitespace required before NDATA");
2637                  }
2638                requireWhitespace();
2639                String notationName = readNmtoken(true);
2640                if (!skippedPE)
2641                  {
2642                    setExternalEntity(name, ENTITY_NDATA, ids, notationName);
2643                    handler.unparsedEntityDecl(name, ids.publicId, ids.systemId,
2644                                               ids.baseUri, notationName);
2645                  }
2646              }
2647            else if (!skippedPE)
2648              {
2649                setExternalEntity(name, ENTITY_TEXT, ids, null);
2650                handler.getDeclHandler()
2651                  .externalEntityDecl(name, ids.publicId,
2652                                       handler.resolveURIs()
2653                                       // FIXME: ASSUMES not skipped
2654                                       // "false" forces error on bad URI
2655                                       ? handler.absolutize(ids.baseUri,
2656                                                            ids.systemId,
2657                                                            false)
2658                                       : ids.systemId);
2659              }
2660          }
2661        
2662        // Finish the declaration.
2663        skipWhitespace();
2664        require('>');
2665      }
2666    
2667      /**
2668       * Parse a notation declaration.
2669       * <pre>
2670       * [82] NotationDecl ::= '&lt;!NOTATION' S Name S
2671       *    (ExternalID | PublicID) S? '&gt;'
2672       * [83] PublicID ::= 'PUBLIC' S PubidLiteral
2673       * </pre>
2674       * <P>NOTE: the '&lt;!NOTATION' has already been read.
2675       */
2676      private void parseNotationDecl()
2677        throws Exception
2678      {
2679        String nname;
2680        ExternalIdentifiers ids;
2681    
2682        requireWhitespace();
2683        nname = readNmtoken(true);
2684        //NE08
2685        if (nname.indexOf(':') >= 0)
2686          {
2687            fatal("Illegal character(':') in notation name ", nname, null);
2688          }
2689        requireWhitespace();
2690    
2691        // Read the external identifiers.
2692        ids = readExternalIds(true, false);
2693    
2694        // Register the notation.
2695        setNotation(nname, ids);
2696        
2697        skipWhitespace();
2698        require('>');
2699      }
2700      
2701      /**
2702       * Parse character data.
2703       * <pre>
2704       * [14] CharData ::= [^&lt;&amp;]* - ([^&lt;&amp;]* ']]&gt;' [^&lt;&amp;]*)
2705       * </pre>
2706       */
2707      private void parseCharData()
2708        throws Exception
2709      {
2710        char c;
2711        int state = 0;
2712        boolean pureWhite = false;
2713    
2714        // assert (dataBufferPos == 0);
2715        
2716        // are we expecting pure whitespace?  it might be dirty...
2717        if ((currentElementContent == CONTENT_ELEMENTS) && !isDirtyCurrentElement)
2718          {
2719            pureWhite = true;
2720          }
2721    
2722        // always report right out of readBuffer
2723        // to minimize (pointless) buffer copies
2724        while (true)
2725          {
2726            int lineAugment = 0;
2727            int columnAugment = 0;
2728            int i;
2729            
2730    loop:
2731            for (i = readBufferPos; i < readBufferLength; i++)
2732              {
2733                switch (c = readBuffer[i])
2734                  {
2735                  case '\n':
2736                    lineAugment++;
2737                    columnAugment = 0;
2738                    // pureWhite unmodified
2739                    break;
2740                  case '\r':  // should not happen!!
2741                  case '\t':
2742                  case ' ':
2743                    // pureWhite unmodified
2744                    columnAugment++;
2745                    break;
2746                  case '&':
2747                  case '<':
2748                    columnAugment++;
2749                    // pureWhite unmodified
2750                    // CLEAN end of text sequence
2751                    state = 1;
2752                    break loop;
2753                  case ']':
2754                    // that's not a whitespace char, and
2755                    // can not terminate pure whitespace either
2756                    pureWhite = false;
2757                    if ((i + 2) < readBufferLength)
2758                      {
2759                        if (readBuffer [i + 1] == ']'
2760                            && readBuffer [i + 2] == '>')
2761                          {
2762                            // ERROR end of text sequence
2763                            state = 2;
2764                            break loop;
2765                          }
2766                      }
2767                    else
2768                      {
2769                        // FIXME missing two end-of-buffer cases
2770                      }
2771                    columnAugment++;
2772                    break;
2773                  default:
2774                    if ((c < 0x0020 || c > 0xFFFD)
2775                        || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085) 
2776                            && xmlVersion == XML_11)) 
2777                      {
2778                        fatal("illegal XML character U+"
2779                              + Integer.toHexString(c));
2780                      }
2781                    else if (c >= '\u007F' && c <= '\u009F') // 2006-04-25 hsivonen
2782                      {
2783                        handler.warn("Saw a control character: U+00" + Integer.toHexString(c) + ".");
2784                      }
2785                    // that's not a whitespace char
2786                    pureWhite = false;
2787                    columnAugment++;
2788                  }
2789              }
2790            
2791            // report text thus far
2792            if (lineAugment > 0)
2793              {
2794                line += lineAugment;
2795                column = columnAugment;
2796              }
2797            else
2798              {
2799                column += columnAugment;
2800              }
2801            
2802            // report characters/whitspace
2803            int length = i - readBufferPos;
2804            
2805            if (length != 0)
2806              {
2807                if (pureWhite)
2808                  {
2809                    handler.ignorableWhitespace(readBuffer,
2810                                                readBufferPos, length);
2811                  }
2812                else
2813                  {
2814                    handler.charData(readBuffer, readBufferPos, length);
2815                  }
2816                readBufferPos = i;
2817              }
2818            
2819            if (state != 0)
2820              {
2821                break;
2822              }
2823            
2824            // fill next buffer from this entity, or
2825            // pop stack and continue with previous entity
2826            unread(readCh());
2827          }
2828        if (!pureWhite)
2829          {
2830            isDirtyCurrentElement = true;
2831          }
2832        // finish, maybe with error
2833        if (state != 1)  // finish, no error
2834          {
2835            fatal("character data may not contain ']]>'");
2836          }
2837      }
2838      
2839      //////////////////////////////////////////////////////////////////////
2840      // High-level reading and scanning methods.
2841      //////////////////////////////////////////////////////////////////////
2842      
2843      /**
2844       * Require whitespace characters.
2845       */
2846      private void requireWhitespace()
2847        throws SAXException, IOException
2848      {
2849        char c = readCh();
2850        if (isWhitespace(c))
2851          {
2852            skipWhitespace();
2853          }
2854        else
2855          {
2856            fatal("whitespace required", c, null);
2857          }
2858      }
2859    
2860      /**
2861       * Skip whitespace characters.
2862       * <pre>
2863       * [3] S ::= (#x20 | #x9 | #xd | #xa)+
2864       * </pre>
2865       */
2866      private void skipWhitespace()
2867        throws SAXException, IOException
2868      {
2869        // Start with a little cheat.  Most of
2870        // the time, the white space will fall
2871        // within the current read buffer; if
2872        // not, then fall through.
2873        if (USE_CHEATS)
2874          {
2875            int lineAugment = 0;
2876            int columnAugment = 0;
2877            
2878    loop:
2879            for (int i = readBufferPos; i < readBufferLength; i++)
2880              {
2881                switch (readBuffer[i])
2882                  {
2883                  case ' ':
2884                  case '\t':
2885                  case '\r':
2886                    columnAugment++;
2887                    break;
2888                  case '\n':
2889                    lineAugment++;
2890                    columnAugment = 0;
2891                    break;
2892                  case '%':
2893                    if (expandPE)
2894                      {
2895                        break loop;
2896                      }
2897                    // else fall through...
2898                  default:
2899                    readBufferPos = i;
2900                    if (lineAugment > 0)
2901                      {
2902                        line += lineAugment;
2903                        column = columnAugment;
2904                      }
2905                    else
2906                      {
2907                        column += columnAugment;
2908                      }
2909                    return;
2910                  }
2911              }
2912          }
2913        
2914        // OK, do it the slow way.
2915        char c = readCh ();
2916        while (isWhitespace(c))
2917          {
2918            c = readCh();
2919          }
2920        unread(c);
2921      }
2922      
2923      /**
2924       * Read a name or (when parsing an enumeration) name token.
2925       * <pre>
2926       * [5] Name ::= (Letter | '_' | ':') (NameChar)*
2927       * [7] Nmtoken ::= (NameChar)+
2928       * </pre>
2929       */
2930      private String readNmtoken(boolean isName)
2931        throws SAXException, IOException
2932      {
2933        char c;
2934        
2935        if (USE_CHEATS)
2936          {
2937    loop:
2938            for (int i = readBufferPos; i < readBufferLength; i++)
2939              {
2940                c = readBuffer[i];
2941                switch (c)
2942                  {
2943                  case '%':
2944                    if (expandPE)
2945                      {
2946                        break loop;
2947                      }
2948                    // else fall through...
2949                    
2950                    // What may legitimately come AFTER a name/nmtoken?
2951                  case '<': case '>': case '&':
2952                  case ',': case '|': case '*': case '+': case '?':
2953                  case ')':
2954                  case '=':
2955                  case '\'': case '"':
2956                  case '[':
2957                  case ' ': case '\t': case '\r': case '\n':
2958                  case ';':
2959                  case '/':
2960                    int start = readBufferPos;
2961                    if (i == start)
2962                      {
2963                        fatal("name expected", readBuffer[i], null);
2964                      }
2965                    readBufferPos = i;
2966                    return intern(readBuffer, start, i - start);
2967                    
2968                  default:
2969                    // FIXME ... per IBM's OASIS test submission, these:
2970                    //   ?    U+06dd 
2971                    //   Combining  U+309B
2972                    //these switches are kind of ugly but at least we won't
2973                    //have to go over the whole lits for each char
2974                    if (isName && i == readBufferPos)
2975                      {
2976                        char c2 = (char) (c & 0x00f0);
2977                        switch (c & 0xff00)
2978                          {
2979                            //starting with 01
2980                          case 0x0100:
2981                            switch (c2)
2982                              {
2983                              case 0x0030:
2984                                if (c == 0x0132 || c == 0x0133 || c == 0x013f)
2985                                  {
2986                                    fatal("Not a name start character, U+"
2987                                          + Integer.toHexString(c));
2988                                  }
2989                                break;
2990                              case 0x0040:
2991                                if (c == 0x0140 || c == 0x0149)
2992                                  {
2993                                    fatal("Not a name start character, U+"
2994                                          + Integer.toHexString(c));
2995                                  }
2996                                break;
2997                              case 0x00c0:
2998                                if (c == 0x01c4 || c == 0x01cc)
2999                                  {
3000                                    fatal("Not a name start character, U+"
3001                                          + Integer.toHexString(c));
3002                                  }
3003                                break;
3004                              case 0x00f0:
3005                                if (c == 0x01f1 || c == 0x01f3)
3006                                  {
3007                                    fatal("Not a name start character, U+"
3008                                          + Integer.toHexString(c));
3009                                  }
3010                                break;
3011                              case 0x00b0:
3012                                if (c == 0x01f1 || c == 0x01f3)
3013                                  {
3014                                    fatal("Not a name start character, U+"
3015                                          + Integer.toHexString(c));
3016                                  }
3017                                break;
3018                              default:
3019                                if (c == 0x017f)
3020                                  {
3021                                    fatal("Not a name start character, U+"
3022                                          + Integer.toHexString(c));
3023                                  }
3024                              }
3025                            
3026                            break;
3027                            //starting with 11
3028                          case 0x1100:
3029                            switch (c2)
3030                              {
3031                              case 0x0000:
3032                                if (c == 0x1104 || c == 0x1108 ||
3033                                    c == 0x110a || c == 0x110d)
3034                                  {
3035                                    fatal("Not a name start character, U+"
3036                                          + Integer.toHexString(c));
3037                                  }
3038                                break;
3039                              case 0x0030:
3040                                if (c == 0x113b || c == 0x113f)
3041                                  {
3042                                    fatal("Not a name start character, U+"
3043                                          + Integer.toHexString(c));
3044                                  }
3045                                break;
3046                              case 0x0040:
3047                                if (c == 0x1141 || c == 0x114d
3048                                    || c == 0x114f )
3049                                  {
3050                                    fatal("Not a name start character, U+"
3051                                          + Integer.toHexString(c));
3052                                  }
3053                                break;
3054                              case 0x0050:
3055                                if (c == 0x1151 || c == 0x1156)
3056                                  {
3057                                    fatal("Not a name start character, U+"
3058                                          + Integer.toHexString(c));
3059                                  }
3060                                break;
3061                              case 0x0060:
3062                                if (c == 0x1162 || c == 0x1164
3063                                    || c == 0x1166 || c == 0x116b
3064                                    || c == 0x116f)
3065                                  {
3066                                    fatal("Not a name start character, U+"
3067                                          + Integer.toHexString(c));
3068                                  }
3069                                break;
3070                              case 0x00b0:
3071                                if (c == 0x11b6 || c == 0x11b9
3072                                    || c == 0x11bb || c == 0x116f)
3073                                  {
3074                                    fatal("Not a name start character, U+"
3075                                          + Integer.toHexString(c));
3076                                  }
3077                                break;
3078                              default:
3079                                if (c == 0x1174 || c == 0x119f
3080                                    || c == 0x11ac || c == 0x11c3
3081                                    || c == 0x11f1)
3082                                  {
3083                                    fatal("Not a name start character, U+"
3084                                          + Integer.toHexString(c));
3085                                  }
3086                              }
3087                            break;
3088                          default:
3089                            if (c == 0x0e46 || c == 0x1011 
3090                                || c == 0x212f || c == 0x0587
3091                                || c == 0x0230 )
3092                              {
3093                                fatal("Not a name start character, U+"
3094                                      + Integer.toHexString(c));
3095                              }
3096                          }
3097                      }
3098                    // punt on exact tests from Appendix A; approximate
3099                    // them using the Unicode ID start/part rules
3100                    if (i == readBufferPos && isName)
3101                      {
3102                        if (!Character.isUnicodeIdentifierStart(c)
3103                            && c != ':' && c != '_')
3104                          {
3105                            fatal("Not a name start character, U+"
3106                                  + Integer.toHexString(c));
3107                          }
3108                      }
3109                    else if (!Character.isUnicodeIdentifierPart(c)
3110                             && c != '-' && c != ':' && c != '_' && c != '.'
3111                             && !isExtender(c))
3112                      {
3113                        fatal("Not a name character, U+"
3114                              + Integer.toHexString(c));
3115                      }
3116                  }
3117              }
3118          }
3119        
3120        nameBufferPos = 0;
3121    
3122        // Read the first character.
3123    loop:
3124        while (true)
3125          {
3126            c = readCh();
3127            switch (c)
3128              {
3129              case '%':
3130              case '<': case '>': case '&':
3131              case ',': case '|': case '*': case '+': case '?':
3132              case ')':
3133              case '=':
3134              case '\'': case '"':
3135              case '[':
3136              case ' ': case '\t': case '\n': case '\r':
3137              case ';':
3138              case '/':
3139                unread(c);
3140                if (nameBufferPos == 0)
3141                  {
3142                    fatal ("name expected");
3143                  }
3144                // punt on exact tests from Appendix A, but approximate them
3145                if (isName
3146                    && !Character.isUnicodeIdentifierStart(nameBuffer[0])
3147                    && ":_".indexOf(nameBuffer[0]) == -1)
3148                  {
3149                    fatal("Not a name start character, U+"
3150                          + Integer.toHexString(nameBuffer[0]));
3151                  }
3152                String s = intern(nameBuffer, 0, nameBufferPos);
3153                nameBufferPos = 0;
3154                return s;
3155              default:
3156                // punt on exact tests from Appendix A, but approximate them
3157                
3158                if ((nameBufferPos != 0 || !isName)
3159                    && !Character.isUnicodeIdentifierPart(c)
3160                    && ":-_.".indexOf(c) == -1
3161                    && !isExtender(c))
3162                  {
3163                    fatal("Not a name character, U+"
3164                          + Integer.toHexString(c));
3165                  }
3166                if (nameBufferPos >= nameBuffer.length)
3167                  {
3168                    nameBuffer =
3169                      (char[]) extendArray(nameBuffer,
3170                                           nameBuffer.length, nameBufferPos);
3171                  }
3172                nameBuffer[nameBufferPos++] = c;
3173              }
3174          }
3175      }
3176      
3177      private static boolean isExtender(char c)
3178      {
3179        // [88] Extender ::= ...
3180        return c == 0x00b7 || c == 0x02d0 || c == 0x02d1 || c == 0x0387
3181          || c == 0x0640 || c == 0x0e46 || c == 0x0ec6 || c == 0x3005
3182          || (c >= 0x3031 && c <= 0x3035)
3183          || (c >= 0x309d && c <= 0x309e)
3184          || (c >= 0x30fc && c <= 0x30fe);
3185      }
3186    
3187      /**
3188       * Read a literal.  With matching single or double quotes as
3189       * delimiters (and not embedded!) this is used to parse:
3190       * <pre>
3191       *  [9] EntityValue ::= ... ([^%&amp;] | PEReference | Reference)* ...
3192       *  [10] AttValue ::= ... ([^<&] | Reference)* ...
3193       *  [11] SystemLiteral ::= ... (URLchar - "'")* ...
3194       *  [12] PubidLiteral ::= ... (PubidChar - "'")* ...
3195       * </pre>
3196       * as well as the quoted strings in XML and text declarations
3197       * (for version, encoding, and standalone) which have their
3198       * own constraints.
3199       */
3200      private String readLiteral(int flags)
3201        throws SAXException, IOException
3202      {
3203        char delim, c;
3204        int startLine = line;
3205        boolean saved = expandPE;
3206        boolean savedReport = doReport;
3207        
3208        // Find the first delimiter.
3209        delim = readCh();
3210        if (delim != '"' && delim != '\'')
3211          {
3212            fatal("expected '\"' or \"'\"", delim, null);
3213            return null;
3214          }
3215        inLiteral = true;
3216        if ((flags & LIT_DISABLE_PE) != 0)
3217          {
3218            expandPE = false;
3219          }
3220        doReport = false;
3221        
3222        // Each level of input source has its own buffer; remember
3223        // ours, so we won't read the ending delimiter from any
3224        // other input source, regardless of entity processing.
3225        char[] ourBuf = readBuffer;
3226    
3227        // Read the literal.
3228        try
3229          {
3230            c = readCh();
3231    loop:
3232            while (! (c == delim && readBuffer == ourBuf))
3233              {
3234                switch (c)
3235                  {
3236                    // attributes and public ids are normalized
3237                    // in almost the same ways
3238                  case '\n':
3239                  case '\r':
3240                    if ((flags & (LIT_ATTRIBUTE | LIT_PUBID)) != 0)
3241                      {
3242                        c = ' ';
3243                      }
3244                    break;
3245                  case '\t':
3246                    if ((flags & LIT_ATTRIBUTE) != 0)
3247                      {
3248                        c = ' ';
3249                      }
3250                    break;
3251                  case '&':
3252                    c = readCh();
3253                    // Char refs are expanded immediately, except for
3254                    // all the cases where it's deferred.
3255                    if (c == '#')
3256                      {
3257                        if ((flags & LIT_DISABLE_CREF) != 0)
3258                          {
3259                            dataBufferAppend('&');
3260                            break;
3261                          }
3262                        parseCharRef(false /* Do not do flushDataBuffer */);
3263                        
3264                        // exotic WFness risk: this is an entity literal,
3265                        // dataBuffer [dataBufferPos - 1] == '&', and
3266                        // following chars are a _partial_ entity/char ref
3267                        
3268                        // It looks like an entity ref ...
3269                      }
3270                    else
3271                      {
3272                        unread(c);
3273                        // Expand it?
3274                        if ((flags & LIT_ENTITY_REF) > 0)
3275                          {
3276                            parseEntityRef(false);
3277                            //Is it just data?
3278                          }
3279                        else if ((flags & LIT_DISABLE_EREF) != 0)
3280                          {
3281                            dataBufferAppend('&');
3282                            
3283                            // OK, it will be an entity ref -- expanded later.
3284                          }
3285                        else
3286                          {
3287                            String name = readNmtoken(true);
3288                            require(';');
3289                            dataBufferAppend('&');
3290                            dataBufferAppend(name);
3291                            dataBufferAppend(';');
3292                          }
3293                      }
3294                    c = readCh();
3295                    continue loop;
3296                    
3297                  case '<':
3298                    // and why?  Perhaps so "&foo;" expands the same
3299                    // inside and outside an attribute?
3300                    if ((flags & LIT_ATTRIBUTE) != 0)
3301                      {
3302                        fatal("attribute values may not contain '<'");
3303                      }
3304                    break;
3305    
3306                    // We don't worry about case '%' and PE refs, readCh does.
3307                    
3308                  default:
3309                    break;
3310                  }
3311                dataBufferAppend(c);
3312                c = readCh();
3313              }
3314          }
3315        catch (EOFException e)
3316          {
3317            fatal("end of input while looking for delimiter (started on line "
3318                  + startLine + ')', null, new Character(delim).toString());
3319          }
3320        inLiteral = false;
3321        expandPE = saved;
3322        doReport = savedReport;
3323        
3324        // Normalise whitespace if necessary.
3325        if ((flags & LIT_NORMALIZE) > 0)
3326          {
3327            dataBufferNormalize();
3328          }
3329        
3330        // Return the value.
3331        return dataBufferToString();
3332      }
3333      
3334      /**
3335       * Try reading external identifiers.
3336       * A system identifier is not required for notations.
3337       * @param inNotation Are we parsing a notation decl?
3338       * @param isSubset Parsing external subset decl (may be omitted)?
3339       * @return A three-member String array containing the identifiers,
3340       *  or nulls. Order: public, system, baseURI.
3341       */
3342      private ExternalIdentifiers readExternalIds(boolean inNotation,
3343                                                  boolean isSubset)
3344        throws Exception
3345      {
3346        char c;
3347        ExternalIdentifiers ids = new ExternalIdentifiers();
3348        int flags = LIT_DISABLE_CREF | LIT_DISABLE_PE | LIT_DISABLE_EREF;
3349        
3350        if (tryRead("PUBLIC"))
3351          {
3352            requireWhitespace();
3353            ids.publicId = readLiteral(LIT_NORMALIZE | LIT_PUBID | flags);
3354            if (inNotation)
3355              {
3356                skipWhitespace();
3357                c = readCh();
3358                unread(c);
3359                if (c == '"' || c == '\'')
3360                  {
3361                    ids.systemId = readLiteral(flags);
3362                  }
3363              }
3364            else
3365              {
3366                requireWhitespace();
3367                ids.systemId = readLiteral(flags);
3368              }
3369            
3370            for (int i = 0; i < ids.publicId.length(); i++)
3371              {
3372                c = ids.publicId.charAt(i);
3373                if (c >= 'a' && c <= 'z')
3374                  {
3375                    continue;
3376                  }
3377                if (c >= 'A' && c <= 'Z')
3378                  {
3379                    continue;
3380                  }
3381                if (" \r\n0123456789-' ()+,./:=?;!*#@$_%".indexOf(c) != -1)
3382                  {
3383                    continue;
3384                  }
3385                fatal("illegal PUBLIC id character U+"
3386                      + Integer.toHexString(c));
3387              }
3388          }
3389        else if (tryRead("SYSTEM"))
3390          {
3391            requireWhitespace();
3392            ids.systemId = readLiteral(flags);
3393          }
3394        else if (!isSubset)
3395          {
3396            fatal("missing SYSTEM or PUBLIC keyword");
3397          }
3398          
3399        if (ids.systemId != null)
3400          {
3401            if (ids.systemId.indexOf('#') != -1)
3402              {
3403                handler.verror("SYSTEM id has a URI fragment: " + ids.systemId);
3404              }
3405            ids.baseUri = handler.getSystemId();
3406            if (ids.baseUri == null && uriWarnings)
3407              {
3408                handler.warn("No base URI; hope URI is absolute: "
3409                             + ids.systemId);
3410              }
3411          }
3412        
3413        return ids;
3414      }
3415    
3416      /**
3417       * Test if a character is whitespace.
3418       * <pre>
3419       * [3] S ::= (#x20 | #x9 | #xd | #xa)+
3420       * </pre>
3421       * @param c The character to test.
3422       * @return true if the character is whitespace.
3423       */
3424      private final boolean isWhitespace(char c)
3425      {
3426        if (c > 0x20)
3427          {
3428            return false;
3429          }
3430        if (c == 0x20 || c == 0x0a || c == 0x09 || c == 0x0d)
3431          {
3432            return true;
3433          }
3434        return false;  // illegal ...
3435      }
3436    
3437      //////////////////////////////////////////////////////////////////////
3438      // Utility routines.
3439      //////////////////////////////////////////////////////////////////////
3440        
3441      /**
3442       * Add a character to the data buffer.
3443       */
3444      private void dataBufferAppend(char c)
3445      {
3446        // Expand buffer if necessary.
3447        if (dataBufferPos >= dataBuffer.length)
3448          {
3449            dataBuffer = (char[]) extendArray(dataBuffer,
3450                                              dataBuffer.length, dataBufferPos);
3451          }
3452        dataBuffer[dataBufferPos++] = c;
3453      }
3454    
3455      /**
3456       * Add a string to the data buffer.
3457       */
3458      private void dataBufferAppend(String s)
3459      {
3460        dataBufferAppend(s.toCharArray(), 0, s.length());
3461      }
3462    
3463      /**
3464       * Append (part of) a character array to the data buffer.
3465       */
3466      private void dataBufferAppend(char[] ch, int start, int length)
3467      {
3468        dataBuffer = (char[]) extendArray(dataBuffer, dataBuffer.length,
3469                                          dataBufferPos + length);
3470        
3471        System.arraycopy(ch, start, dataBuffer, dataBufferPos, length);
3472        dataBufferPos += length;
3473      }
3474    
3475      /**
3476       * Normalise space characters in the data buffer.
3477       */
3478      private void dataBufferNormalize()
3479      {
3480        int i = 0;
3481        int j = 0;
3482        int end = dataBufferPos;
3483        
3484        // Skip spaces at the start.
3485        while (j < end && dataBuffer[j] == ' ')
3486          {
3487            j++;
3488          }
3489        
3490        // Skip whitespace at the end.
3491        while (end > j && dataBuffer[end - 1] == ' ')
3492          {
3493            end --;
3494          }
3495    
3496        // Start copying to the left.
3497        while (j < end)
3498          {
3499            
3500            char c = dataBuffer[j++];
3501            
3502            // Normalise all other spaces to
3503            // a single space.
3504            if (c == ' ')
3505              {
3506                while (j < end && dataBuffer[j++] == ' ')
3507                  {
3508                    continue;
3509                  }
3510                dataBuffer[i++] = ' ';
3511                dataBuffer[i++] = dataBuffer[j - 1];
3512              }
3513            else
3514              {
3515                dataBuffer[i++] = c;
3516              }
3517          }
3518        
3519        // The new length is <= the old one.
3520        dataBufferPos = i;
3521      }
3522    
3523      /**
3524       * Convert the data buffer to a string.
3525       */
3526      private String dataBufferToString()
3527      {
3528        String s = new String(dataBuffer, 0, dataBufferPos);
3529        dataBufferPos = 0;
3530        return s;
3531      }
3532    
3533      /**
3534       * Flush the contents of the data buffer to the handler, as
3535       * appropriate, and reset the buffer for new input.
3536       */
3537      private void dataBufferFlush()
3538        throws SAXException
3539      {
3540        if (currentElementContent == CONTENT_ELEMENTS
3541            && dataBufferPos > 0
3542            && !inCDATA)
3543          {
3544            // We can't just trust the buffer to be whitespace, there
3545            // are (error) cases when it isn't
3546            for (int i = 0; i < dataBufferPos; i++)
3547              {
3548                if (!isWhitespace(dataBuffer[i]))
3549                  {
3550                    handler.charData(dataBuffer, 0, dataBufferPos);
3551                    dataBufferPos = 0;
3552                  }
3553              }
3554            if (dataBufferPos > 0)
3555              {
3556                handler.ignorableWhitespace(dataBuffer, 0, dataBufferPos);
3557                dataBufferPos = 0;
3558              }
3559          }
3560        else if (dataBufferPos > 0)
3561          {
3562            handler.charData(dataBuffer, 0, dataBufferPos);
3563            dataBufferPos = 0;
3564          }
3565      }
3566    
3567      /**
3568       * Require a string to appear, or throw an exception.
3569       * <p><em>Precondition:</em> Entity expansion is not required.
3570       * <p><em>Precondition:</em> data buffer has no characters that
3571       * will get sent to the application.
3572       */
3573      private void require(String delim)
3574        throws SAXException, IOException
3575      {
3576        int length = delim.length();
3577        char[] ch;
3578        
3579        if (length < dataBuffer.length)
3580          {
3581            ch = dataBuffer;
3582            delim.getChars(0, length, ch, 0);
3583          }
3584        else
3585          {
3586            ch = delim.toCharArray();
3587          }
3588          
3589        if (USE_CHEATS && length <= (readBufferLength - readBufferPos))
3590          {
3591            int offset = readBufferPos;
3592            
3593            for (int i = 0; i < length; i++, offset++)
3594              {
3595                if (ch[i] != readBuffer[offset])
3596                  {
3597                    fatal ("required string", null, delim);
3598                  }
3599              }
3600            readBufferPos = offset;
3601            
3602          }
3603        else
3604          {
3605            for (int i = 0; i < length; i++)
3606              {
3607                require(ch[i]);
3608              }
3609          }
3610      }
3611    
3612      /**
3613       * Require a character to appear, or throw an exception.
3614       */
3615      private void require(char delim)
3616        throws SAXException, IOException
3617      {
3618        char c = readCh();
3619        
3620        if (c != delim)
3621          {
3622            fatal("required character", c, new Character(delim).toString());
3623          }
3624      }
3625      
3626      /**
3627       * Create an interned string from a character array.
3628       * &AElig;lfred uses this method to create an interned version
3629       * of all names and name tokens, so that it can test equality
3630       * with <code>==</code> instead of <code>String.equals ()</code>.
3631       *
3632       * <p>This is much more efficient than constructing a non-interned
3633       * string first, and then interning it.
3634       *
3635       * @param ch an array of characters for building the string.
3636       * @param start the starting position in the array.
3637       * @param length the number of characters to place in the string.
3638       * @return an interned string.
3639       * @see #intern (String)
3640       * @see java.lang.String#intern
3641       */
3642      public String intern(char[] ch, int start, int length)
3643      {
3644        int index = 0;
3645        int hash = 0;
3646        Object[] bucket;
3647    
3648        // Generate a hash code.  This is a widely used string hash,
3649        // often attributed to Brian Kernighan.
3650        for (int i = start; i < start + length; i++)
3651          {
3652            hash = 31 * hash + ch[i];
3653          }
3654        hash = (hash & 0x7fffffff) % SYMBOL_TABLE_LENGTH;
3655        
3656        // Get the bucket -- consists of {array,String} pairs
3657        if ((bucket = symbolTable[hash]) == null)
3658          {
3659            // first string in this bucket
3660            bucket = new Object[8];
3661            
3662            // Search for a matching tuple, and
3663            // return the string if we find one.
3664          }
3665        else
3666          {
3667            while (index < bucket.length)
3668              {
3669                char[] chFound = (char[]) bucket[index];
3670            
3671                // Stop when we hit an empty entry.
3672                if (chFound == null)
3673                  {
3674                    break;
3675                  }
3676                
3677                // If they're the same length, check for a match.
3678                if (chFound.length == length)
3679                  {
3680                    for (int i = 0; i < chFound.length; i++)
3681                      {
3682                        // continue search on failure
3683                        if (ch[start + i] != chFound[i])
3684                          {
3685                            break;
3686                          }
3687                        else if (i == length - 1)
3688                          {
3689                            // That's it, we have a match!
3690                            return (String) bucket[index + 1];
3691                          }
3692                      }
3693                  }
3694                index += 2;
3695              }
3696            // Not found -- we'll have to add it.
3697            
3698            // Do we have to grow the bucket?
3699            bucket = (Object[]) extendArray(bucket, bucket.length, index);
3700          }
3701        symbolTable[hash] = bucket;
3702        
3703        // OK, add it to the end of the bucket -- "local" interning.
3704        // Intern "globally" to let applications share interning benefits.
3705        // That is, "!=" and "==" work on our strings, not just equals().
3706        String s = new String(ch, start, length).intern();
3707        bucket[index] = s.toCharArray();
3708        bucket[index + 1] = s;
3709        return s;
3710      }
3711    
3712      /**
3713       * Ensure the capacity of an array, allocating a new one if
3714       * necessary.  Usually extends only for name hash collisions. 
3715       */
3716      private Object extendArray(Object array, int currentSize, int requiredSize)
3717      {
3718        if (requiredSize < currentSize)
3719          {
3720            return array;
3721          }
3722        else
3723          {
3724            Object newArray = null;
3725            int newSize = currentSize * 2;
3726            
3727            if (newSize <= requiredSize)
3728              {
3729                newSize = requiredSize + 1;
3730              }
3731            
3732            if (array instanceof char[])
3733              {
3734                newArray = new char[newSize];
3735              }
3736            else if (array instanceof Object[])
3737              {
3738                newArray = new Object[newSize];
3739              }
3740            else
3741              {
3742                throw new RuntimeException();
3743              }
3744            
3745            System.arraycopy(array, 0, newArray, 0, currentSize);
3746            return newArray;
3747          }
3748      }
3749    
3750      //////////////////////////////////////////////////////////////////////
3751      // XML query routines.
3752      //////////////////////////////////////////////////////////////////////
3753      
3754      boolean isStandalone()
3755      {
3756        return docIsStandalone;
3757      }
3758        
3759      //
3760      // Elements
3761      //
3762      
3763      private int getContentType(ElementDecl element, int defaultType)
3764      {
3765        int retval;
3766        
3767        if (element == null)
3768          {
3769            return defaultType;
3770          }
3771        retval = element.contentType;
3772        if (retval == CONTENT_UNDECLARED)
3773          {
3774            retval = defaultType;
3775          }
3776        return retval;
3777      }
3778    
3779      /**
3780       * Look up the content type of an element.
3781       * @param name The element type name.
3782       * @return An integer constant representing the content type.
3783       * @see #CONTENT_UNDECLARED
3784       * @see #CONTENT_ANY
3785       * @see #CONTENT_EMPTY
3786       * @see #CONTENT_MIXED
3787       * @see #CONTENT_ELEMENTS
3788       */
3789      public int getElementContentType(String name)
3790      {
3791        ElementDecl element = elementInfo.get(name);
3792        return getContentType(element, CONTENT_UNDECLARED);
3793      }
3794      
3795      /**
3796       * Register an element.
3797       * Array format:
3798       *  [0] element type name
3799       *  [1] content model (mixed, elements only)
3800       *  [2] attribute hash table
3801       */
3802      private void setElement(String name, int contentType,
3803                              String contentModel, HashMap<String, AttributeDecl> attributes)
3804        throws SAXException
3805      {
3806        if (skippedPE)
3807          {
3808            return;
3809          }
3810    
3811        ElementDecl element = elementInfo.get(name);
3812        
3813        // first <!ELEMENT ...> or <!ATTLIST ...> for this type?
3814        if (element == null)
3815          {
3816            element = new ElementDecl();
3817            element.contentType = contentType;
3818            element.contentModel = contentModel;
3819            element.attributes = attributes;
3820            elementInfo.put(name, element);
3821            return;
3822          }
3823        
3824        // <!ELEMENT ...> declaration?
3825        if (contentType != CONTENT_UNDECLARED)
3826          {
3827            // ... following an associated <!ATTLIST ...>
3828            if (element.contentType == CONTENT_UNDECLARED)
3829              {
3830                element.contentType = contentType;
3831                element.contentModel = contentModel;
3832              }
3833            else
3834              {
3835                // VC: Unique Element Type Declaration
3836                handler.verror("multiple declarations for element type: "
3837                               + name);
3838              }
3839          }
3840        
3841        // first <!ATTLIST ...>, before <!ELEMENT ...> ?
3842        else if (attributes != null)
3843          {
3844            element.attributes = attributes;
3845          }
3846      }
3847      
3848      /**
3849       * Look up the attribute hash table for an element.
3850       * The hash table is the second item in the element array.
3851       */
3852      private HashMap<String, AttributeDecl> getElementAttributes(String name)
3853      {
3854        ElementDecl element = elementInfo.get(name);
3855        return (element == null) ? null : element.attributes;
3856      }
3857    
3858      //
3859      // Attributes
3860      //
3861      
3862      /**
3863       * Get the declared attributes for an element type.
3864       * @param elname The name of the element type.
3865       * @return An iterator over all the attributes declared for
3866       *   a specific element type.  The results will be valid only
3867       *   after the DTD (if any) has been parsed.
3868       * @see #getAttributeType
3869       * @see #getAttributeEnumeration
3870       * @see #getAttributeDefaultValueType
3871       * @see #getAttributeDefaultValue
3872       * @see #getAttributeExpandedValue
3873       */
3874      private Iterator<String> declaredAttributes(ElementDecl element)
3875      {
3876        HashMap<String, AttributeDecl> attlist;
3877        
3878        if (element == null)
3879          {
3880            return null;
3881          }
3882        if ((attlist = element.attributes) == null)
3883          {
3884            return null;
3885          }
3886        return attlist.keySet().iterator();
3887      }
3888    
3889      /**
3890       * Get the declared attributes for an element type.
3891       * @param elname The name of the element type.
3892       * @return An iterator over all the attributes declared for
3893       *   a specific element type.  The results will be valid only
3894       *   after the DTD (if any) has been parsed.
3895       * @see #getAttributeType
3896       * @see #getAttributeEnumeration
3897       * @see #getAttributeDefaultValueType
3898       * @see #getAttributeDefaultValue
3899       * @see #getAttributeExpandedValue
3900       */
3901      public Iterator<String> declaredAttributes(String elname)
3902      {
3903        return declaredAttributes(elementInfo.get(elname));
3904      }
3905    
3906      /**
3907       * Retrieve the declared type of an attribute.
3908       * @param name The name of the associated element.
3909       * @param aname The name of the attribute.
3910       * @return An interend string denoting the type, or null
3911       *  indicating an undeclared attribute.
3912       */
3913      public String getAttributeType(String name, String aname)
3914      {
3915        AttributeDecl attribute = getAttribute(name, aname);
3916        return (attribute == null) ? null : attribute.type;
3917      }
3918    
3919      /**
3920       * Retrieve the allowed values for an enumerated attribute type.
3921       * @param name The name of the associated element.
3922       * @param aname The name of the attribute.
3923       * @return A string containing the token list.
3924       */
3925      public String getAttributeEnumeration(String name, String aname)
3926      {
3927        AttributeDecl attribute = getAttribute(name, aname);
3928        // assert:  attribute.enumeration is "ENUMERATION" or "NOTATION"
3929        return (attribute == null) ? null : attribute.enumeration;
3930      }
3931    
3932      /**
3933       * Retrieve the default value of a declared attribute.
3934       * @param name The name of the associated element.
3935       * @param aname The name of the attribute.
3936       * @return The default value, or null if the attribute was
3937       *   #IMPLIED or simply undeclared and unspecified.
3938       * @see #getAttributeExpandedValue
3939       */
3940      public String getAttributeDefaultValue(String name, String aname)
3941      {
3942        AttributeDecl attribute = getAttribute(name, aname);
3943        return (attribute == null) ? null : attribute.value;
3944      }
3945    
3946        /*
3947    
3948    // FIXME:  Leaving this in, until W3C finally resolves the confusion
// between parts of the XML 2nd REC about when entity declarations
3950    // are guaranteed to be known.  Current code matches what section 5.1
3951    // (conformance) describes, but some readings of the self-contradicting
3952    // text in 4.1 (the "Entity Declared" WFC and VC) seem to expect that
3953    // attribute expansion/normalization must be deferred in some cases
3954    // (just TRY to identify them!).
3955    
3956         * Retrieve the expanded value of a declared attribute.
3957         * <p>General entities (and char refs) will be expanded (once).
3958         * @param name The name of the associated element.
3959         * @param aname The name of the attribute.
3960         * @return The expanded default value, or null if the attribute was
3961         *   #IMPLIED or simply undeclared
3962         * @see #getAttributeDefaultValue
3963        public String getAttributeExpandedValue (String name, String aname)
3964        throws Exception
3965        {
3966      AttributeDecl attribute = getAttribute (name, aname);
3967    
3968      if (attribute == null) {
3969          return null;
3970      } else if (attribute.defaultValue == null && attribute.value != null) {
3971          // we MUST use the same buf for both quotes else the literal
3972          // can't be properly terminated
3973          char buf [] = new char [1];
3974          int  flags = LIT_ENTITY_REF | LIT_ATTRIBUTE;
3975          String type = getAttributeType (name, aname);
3976    
3977          if (type != "CDATA" && type != null)
3978        flags |= LIT_NORMALIZE;
3979          buf [0] = '"';
3980          pushCharArray (null, buf, 0, 1);
3981          pushString (null, attribute.value);
3982          pushCharArray (null, buf, 0, 1);
3983          attribute.defaultValue = readLiteral (flags);
3984      }
3985      return attribute.defaultValue;
3986        }
3987         */
3988    
3989      /**
3990       * Retrieve the default value mode of a declared attribute.
3991       * @see #ATTRIBUTE_DEFAULT_SPECIFIED
3992       * @see #ATTRIBUTE_DEFAULT_IMPLIED
3993       * @see #ATTRIBUTE_DEFAULT_REQUIRED
3994       * @see #ATTRIBUTE_DEFAULT_FIXED
3995       */
3996      public int getAttributeDefaultValueType(String name, String aname)
3997      {
3998        AttributeDecl attribute = getAttribute(name, aname);
3999        return (attribute == null) ? ATTRIBUTE_DEFAULT_UNDECLARED :
4000          attribute.valueType;
4001      }  
4002      
4003      /**
4004       * Register an attribute declaration for later retrieval.
4005       * Format:
4006       * - String type
4007       * - String default value
4008       * - int value type
4009       * - enumeration
4010       * - processed default value
4011       */
4012      private void setAttribute(String elName, String name, String type,
4013                                String enumeration, String value, int valueType)
4014        throws Exception
4015      {
4016        HashMap<String, AttributeDecl> attlist;
4017        
4018        if (skippedPE)
4019          {
4020            return;
4021          }
4022        
4023        // Create a new hashtable if necessary.
4024        attlist = getElementAttributes(elName);
4025        if (attlist == null)
4026          {
4027            attlist = new HashMap<String, AttributeDecl>();
4028          }
4029        
4030        // ignore multiple attribute declarations!
4031        if (attlist.get(name) != null)
4032          {
4033            // warn ...
4034            return;
4035          }
4036        else
4037          {
4038            AttributeDecl attribute = new AttributeDecl();
4039            attribute.type = type;
4040            attribute.value = value;
4041            attribute.valueType = valueType;
4042            attribute.enumeration = enumeration;
4043            attlist.put(name, attribute);
4044          
4045            // save; but don't overwrite any existing <!ELEMENT ...>
4046            setElement(elName, CONTENT_UNDECLARED, null, attlist);
4047          }
4048      }
4049    
4050      /**
4051       * Retrieve the attribute declaration for the given element name and name.
4052       */
4053      private AttributeDecl getAttribute(String elName, String name)
4054      {
4055        HashMap<String, AttributeDecl> attlist = getElementAttributes(elName);
4056        return (attlist == null) ? null : attlist.get(name);
4057      }
4058    
4059      //
4060      // Entities
4061      //
4062      
4063      /**
4064       * Find the type of an entity.
4065       * @returns An integer constant representing the entity type.
4066       * @see #ENTITY_UNDECLARED
4067       * @see #ENTITY_INTERNAL
4068       * @see #ENTITY_NDATA
4069       * @see #ENTITY_TEXT
4070       */
4071      public int getEntityType(String ename)
4072      {
4073        EntityInfo entity = entityInfo.get(ename);
4074        return (entity == null) ?  ENTITY_UNDECLARED : entity.type;
4075      }
4076    
4077      /**
4078       * Return an external entity's identifiers.
4079       * @param ename The name of the external entity.
4080       * @return The entity's public identifier, system identifier, and base URI.
4081       *  Null if the entity was not declared as an external entity.
4082       * @see #getEntityType
4083       */
4084      public ExternalIdentifiers getEntityIds(String ename)
4085      {
4086        EntityInfo entity = entityInfo.get(ename);
4087        return (entity == null) ? null : entity.ids;
4088      }
4089    
4090      /**
4091       * Return an internal entity's replacement text.
4092       * @param ename The name of the internal entity.
4093       * @return The entity's replacement text, or null if
4094       *   the entity was not declared as an internal entity.
4095       * @see #getEntityType
4096       */
4097      public String getEntityValue(String ename)
4098      {
4099        EntityInfo entity = entityInfo.get(ename);
4100        return (entity == null) ? null : entity.value;
4101      }
4102    
4103      /**
4104       * Register an entity declaration for later retrieval.
4105       */
4106      private void setInternalEntity(String eName, String value)
4107        throws SAXException
4108      {
4109        if (skippedPE)
4110          {
4111            return;
4112          }
4113    
4114        if (entityInfo.get(eName) == null)
4115          {
4116            EntityInfo entity = new EntityInfo();
4117            entity.type = ENTITY_INTERNAL;
4118            entity.value = value;
4119            entityInfo.put(eName, entity);
4120          }
4121        if (handler.stringInterning)
4122          {
4123            if ("lt" == eName || "gt" == eName || "quot" == eName
4124                || "apos" == eName || "amp" == eName)
4125              {
4126                return;
4127              }
4128          }
4129        else
4130          {
4131            if ("lt".equals(eName) || "gt".equals(eName) || "quot".equals(eName)
4132                || "apos".equals(eName) || "amp".equals(eName))
4133              {
4134                return;
4135              }
4136          }
4137        handler.getDeclHandler().internalEntityDecl(eName, value);
4138      }
4139    
4140      /**
4141       * Register an external entity declaration for later retrieval.
4142       */
4143      private void setExternalEntity(String eName, int eClass,
4144                                     ExternalIdentifiers ids, String nName)
4145      {
4146        if (entityInfo.get(eName) == null)
4147          {
4148            EntityInfo entity = new EntityInfo();
4149            entity.type = eClass;
4150            entity.ids = ids;
4151            entity.notationName = nName;
4152            entityInfo.put(eName, entity);
4153          }
4154      }
4155    
4156      //
4157      // Notations.
4158      //
4159      
4160      /**
4161       * Report a notation declaration, checking for duplicates.
4162       */
4163      private void setNotation(String nname, ExternalIdentifiers ids)
4164        throws SAXException
4165      {
4166        if (skippedPE)
4167          {
4168            return;
4169          }
4170        
4171        handler.notationDecl(nname, ids.publicId, ids.systemId, ids.baseUri);
4172        if (notationInfo.get(nname) == null)
4173          {
4174            notationInfo.put(nname, nname);
4175          }
4176        else
4177          {
4178            // VC: Unique Notation Name
4179            handler.verror("Duplicate notation name decl: " + nname);
4180          }
4181      }
4182      
4183      //
4184      // Location.
4185      //
4186      
4187      /**
4188       * Return the current line number.
4189       */
4190      public int getLineNumber()
4191      {
4192        return line;
4193      }
4194    
4195      /**
4196       * Return the current column number.
4197       */
4198      public int getColumnNumber()
4199      {
4200        return column;
4201      }
4202    
4203      //////////////////////////////////////////////////////////////////////
4204      // High-level I/O.
4205      //////////////////////////////////////////////////////////////////////
4206      
  /**
   * Read a single character from the readBuffer.
   * <p>The readDataChunk () method maintains the buffer.
   * <p>If we hit the end of an entity, try to pop the stack and
   * keep going.
   * <p> (This approach doesn't really enforce XML's rules about
   * entity boundaries, but this is not currently a validating
   * parser).
   * <p>This routine also attempts to keep track of the current
   * position in external entities, but it's not entirely accurate.
   * <p>Besides returning the character, this method checks surrogate
   * pairing against <code>prev</code>, reports characters outside
   * the accepted ranges through fatal (), warns about U+007F..U+009F
   * controls and private-use characters, and, when
   * <code>expandPE</code> is set, expands a parameter-entity
   * reference on '%' and then reads again.
   * @return The next character from the current input source.
   * @throws SAXException if a helper invoked here raises one.
   * @throws IOException if a helper invoked here raises one.
   * @see #unread (char)
   * @see #readDataChunk
   * @see #readBuffer
   * @see #line
   */
  private char readCh()
    throws SAXException, IOException
  {
    // As long as there's nothing in the
    // read buffer, try reading more data
    // (for an external entity) or popping
    // the entity stack (for either).
    while (readBufferPos >= readBufferLength)
      {
        switch (sourceType)
          {
          case INPUT_READER:
            readDataChunk();
            // Keep popping inputs until one of them yields data.
            while (readBufferLength < 1)
              {
                popInput();
                if (readBufferLength < 1)
                  {
                    readDataChunk();
                  }
              }
            break;
            
          default:
            
            popInput();
            break;
          }
      }
    
    char c = readBuffer[readBufferPos++];
    
    // copied from fi.iki.hsivonen.htmlparser
    // NOTE(review): within this method prev is assigned only in the
    // low-surrogate branch below, so a high surrogate read here is
    // never recorded in it.  Unless prev is maintained elsewhere,
    // the "prev was high surrogate" tests cannot succeed -- confirm
    // against the rest of the file (and against unread ()) before
    // changing this.
    if ((c & 0xFC00) == 0xDC00) {
        // Got a low surrogate. See if prev was high surrogate
        if ((prev & 0xFC00) == 0xD800) {
            // Combine the pair into a single code point value.
            int intVal = (prev << 10) + c + SURROGATE_OFFSET;
            if (isNonCharacter(intVal)) {
                handler.warn("Astral non-character.");
            }
            if (isAstralPrivateUse(intVal)) {
                warnAboutPrivateUseChar();
            }
        } else {
            fatal("Unmatched low surrogate.");
        }
        prev = c;
    } else {
        // see if there was a lone high surrogate
        if ((prev & 0xFC00) == 0xD800) {
            fatal("Unmatched high surrogate.");
        }
    }
    
    if (c == '\n')
      {
        line++;
        column = 0;
      }
    else
      {
        if (c == '<')
          {
            /* the most common return to parseContent () ... NOP */
          }
        // Reported as illegal: controls below U+0020 other than tab
        // and CR, anything above U+FFFD, and -- for XML 1.1 only --
        // U+007F..U+009F except U+0085.
        else if (((c < 0x0020 && (c != '\t') && (c != '\r')) || c > 0xFFFD)
                 || ((c >= 0x007f) && (c <= 0x009f) && (c != 0x0085) 
                     && xmlVersion == XML_11))
          {
            fatal("illegal XML character U+" + Integer.toHexString(c));
          }
        // Reached for U+007F..U+009F only when the check above did
        // not report the character: warn instead.
        else if (c >= '\u007F' && c <= '\u009F') // 2006-04-25 hsivonen
          {
            handler.warn("Saw a control character: U+00" + Integer.toHexString(c) + ".");
          }
        
        if (isPrivateUse(c)) 
          {
            warnAboutPrivateUseChar();
          }
        // If we're in the DTD and in a context where PEs get expanded,
        // do so ... 1/14/2000 errata identify those contexts.  There
        // are also spots in the internal subset where PE refs are fatal
        // errors, hence yet another flag.
        else if (c == '%' && expandPE)
          {
            if (peIsError)
              {
                fatal("PE reference within decl in internal subset.");
              }
            parsePEReference();
            // The '%' itself is not returned (and not counted in the
            // column); read again after the reference was handled.
            return readCh();
          }
        column++;
      }

    return c;
  }
4322    
4323      /**
4324       * Push a single character back onto the current input stream.
4325       * <p>This method usually pushes the character back onto
4326       * the readBuffer.
4327       * <p>I don't think that this would ever be called with 
4328       * readBufferPos = 0, because the methods always reads a character
4329       * before unreading it, but just in case, I've added a boundary
4330       * condition.
4331       * @param c The character to push back.
4332       * @see #readCh
4333       * @see #unread (char[])
4334       * @see #readBuffer
4335       */
4336      private void unread(char c)
4337        throws SAXException
4338      {
4339        // Normal condition.
4340        if (c == '\n')
4341          {
4342            line--;
4343            column = -1;
4344          }
4345        if (readBufferPos > 0)
4346          {
4347            readBuffer[--readBufferPos] = c;
4348          }
4349        else
4350          {
4351            pushString(null, new Character(c).toString());
4352          }
4353      }
4354    
4355      /**
4356       * Push a char array back onto the current input stream.
4357       * <p>NOTE: you must <em>never</em> push back characters that you
4358       * haven't actually read: use pushString () instead.
4359       * @see #readCh
4360       * @see #unread (char)
4361       * @see #readBuffer
4362       * @see #pushString
4363       */
4364      private void unread(char[] ch, int length)
4365        throws SAXException
4366      {
4367        for (int i = 0; i < length; i++)
4368          {
4369            if (ch[i] == '\n')
4370              {
4371                line--;
4372                column = -1;
4373              }
4374          }
4375        if (length < readBufferPos)
4376          {
4377            readBufferPos -= length;
4378          }
4379        else
4380          {
4381            pushCharArray(null, ch, 0, length);
4382          }
4383      }
4384    
4385      /**
4386       * Push, or skip, a new external input source.
4387       * The source will be some kind of parsed entity, such as a PE
4388       * (including the external DTD subset) or content for the body.
4389       *
4390       * @param url The java.net.URL object for the entity.
4391       * @see SAXDriver#resolveEntity
4392       * @see #pushString
4393       * @see #sourceType
4394       * @see #pushInput
4395       * @see #detectEncoding
4396       * @see #sourceType
4397       * @see #readBuffer
4398       */
4399      private void pushURL(boolean isPE,
4400                           String ename,
4401                           ExternalIdentifiers ids,
4402                           Reader aReader,
4403                           InputStream aStream,
4404                           String aEncoding,
4405                           boolean doResolve)
4406        throws SAXException, IOException
4407      {
4408        // removed boolean ignoreEncoding -- 2006-02-03 hsivonen
4409        String systemId;
4410        InputSource source;
4411        InputSource scratch = new InputSource();
4412    
4413        if (!isPE)
4414          {
4415            dataBufferFlush();
4416          }
4417    
4418        scratch.setPublicId(ids.publicId);
4419        scratch.setSystemId(ids.systemId);
4420    
4421        // See if we should skip or substitute the entity.
4422        // If we're not skipping, resolving reports startEntity()
4423        // and updates the (handler's) stack of URIs.
4424        if (doResolve)
4425          {
4426            // assert (stream == null && reader == null && encoding == null)
4427            source = handler.resolveEntity(isPE, ename, scratch, ids.baseUri);
4428            if (source == null)
4429              {
4430                handler.warn("skipping entity: " + ename);
4431                handler.skippedEntity(ename);
4432                if (isPE)
4433                  {
4434                    skippedPE = true;
4435                  }
4436                return;
4437              }
4438    
4439            // we might be using alternate IDs/encoding
4440            systemId = source.getSystemId();
4441            // The following warning and setting systemId was deleted bcause
4442            // the application has the option of not setting systemId
4443            // provided that it has set the characte/byte stream.
4444            /*
4445               if (systemId == null) {
4446               handler.warn ("missing system ID, using " + ids.systemId);
4447               systemId = ids.systemId;
4448               }
4449             */
4450          }
4451        else
4452          {
4453            // "[document]", or "[dtd]" via getExternalSubset()
4454            scratch.setCharacterStream(aReader);
4455            scratch.setByteStream(aStream);
4456            scratch.setEncoding(aEncoding);
4457            source = scratch;
4458            systemId = ids.systemId;
4459            if (handler.stringInterning)
4460              {
4461                handler.startExternalEntity(ename, systemId,
4462                                            "[document]" == ename);
4463              }
4464            else
4465              {
4466                handler.startExternalEntity(ename, systemId,
4467                                            "[document]".equals(ename));
4468              }
4469          }
4470        
4471        // Push the existing status.
4472        pushInput(ename);
4473    
4474        // Create a new read buffer.
4475        // (Note the four-character margin)
4476        readBuffer = new char[READ_BUFFER_MAX + 4];
4477        readBufferPos = 0;
4478        readBufferLength = 0;
4479        readBufferOverflow = -1;
4480        is = null;
4481        reader = null;
4482        line = 1;
4483        column = 0;
4484        currentByteCount = 0;
4485    
4486        // If there's an explicit character stream, just
4487        // ignore encoding declarations.
4488        if (source.getCharacterStream() != null)
4489          {
4490            sourceType = INPUT_READER;
4491            this.reader = source.getCharacterStream();
4492            // swallow UTF-8 BOM -- 2006-02-03 hsivonen
4493            if ("UTF-8".equalsIgnoreCase(source.getEncoding()))
4494              {
4495                char bom = readCh();
4496                if (bom != '\uFEFF') {
4497                    unread(bom);
4498                }
4499              }
4500            tryEncodingDecl(source.getEncoding() == null ? "" : source.getEncoding());
4501            return;
4502          }
4503      
4504        // Else we handle the conversion, and need to ensure
4505        // it's done right.
4506        if (source.getByteStream() != null)
4507          {
4508            is = source.getByteStream();
4509          }
4510        else
4511          {
4512            // Stop -- 2006-11-10 hsivonen
4513            fatal("The entity resolver didn't properly resolve the entity.");
4514          }
4515        
4516        // If we get to here, there must be
4517        // an InputStream available.
4518        if (!is.markSupported())
4519          {
4520            is = new BufferedInputStream(is);
4521          }
4522    
4523        // Zapped bogus external encoding label code -- 2006-11-10 hsivonen
4524        
4525        // if we got an external encoding label, use it ...
4526        if (source.getEncoding() != null)
4527          {
4528            draconianInputStreamReader(source.getEncoding(), is, false);
4529            if ("UTF-8".equalsIgnoreCase(source.getEncoding()))
4530            {
4531              char bom = readCh();
4532              if (bom != '\uFEFF') {
4533                  unread(bom);
4534              }
4535            }
4536            tryEncodingDecl(source.getEncoding());
4537            // ... else autodetect from first bytes.
4538          }
4539        else
4540          {
4541            detectEncoding();
4542            // Read any XML or text declaration.
4543            String enc = tryEncodingDecl(null);
4544            if (enc == null && "UTF-32" == characterEncoding) 
4545              {
4546                fatal("UTF-32 was sniffed from the BOM, but there was no matching encoding declaration. The omission of explicit encoding declaration is only allowed with UTF-8 and UTF-16.");
4547              }
4548          }
4549      }
4550    
4551      /**
4552       * Check for an encoding declaration.  This is the second part of the
4553       * XML encoding autodetection algorithm, relying on detectEncoding to
4554       * get to the point that this part can read any encoding declaration
4555       * in the document (using only US-ASCII characters).
4556       *
4557       * <p> Because this part starts to fill parser buffers with this data,
4558       * it's tricky to setup a reader so that Java's built-in decoders can be
4559       * used for the character encodings that aren't built in to this parser
4560       * (such as EUC-JP, KOI8-R, Big5, etc).
4561       *
4562       * @return any encoding in the declaration, uppercased; or null
4563       * @see detectEncoding
4564       */
4565      private String tryEncodingDecl(String encoding)
4566        throws SAXException, IOException
4567      {
4568        // Read the XML/text declaration.
4569        if (tryRead("<?xml"))
4570          {
4571            if (tryWhitespace())
4572              {
4573                if (inputStack.size() > 0)
4574                  {
4575                    return parseTextDecl(encoding);
4576                  }
4577                else
4578                  {
4579                    return parseXMLDecl(encoding);
4580                  }
4581              }
4582            else
4583              {
4584                // <?xml-stylesheet ...?> or similar
4585                unread('l');
4586                unread('m');
4587                unread('x');
4588                unread('?');
4589                unread('<');
4590              }
4591          }
4592        //  2006-02-03 hsivonen
4593        warnAboutLackOfEncodingDecl(encoding);
4594        return null;
4595      }
4596    
4597    /**
4598     * @param characterEncoding
4599     * @throws SAXException
4600     */
4601    private void warnAboutLackOfEncodingDecl(String encoding) throws SAXException {
4602        if (!(encoding == null || "".equals(encoding)
4603                    || "UTF-8".equalsIgnoreCase(encoding) || "UTF-16".equalsIgnoreCase(encoding)))
4604          {
4605            handler.warn(
4606                    "External encoding information specified a non-UTF-8/non-UTF-16 encoding (" + encoding + "), but there was no matching internal encoding declaration. The well-formedness status of this document may change when decoupled from the external encoding information.");
4607          }
4608    }
4609    
4610      /**
4611       * Attempt to detect the encoding of an entity.
4612       * <p>The trick here (as suggested in the XML standard) is that
4613       * any entity not in UTF-8, or in UCS-2 with a byte-order mark, 
4614       * <b>must</b> begin with an XML declaration or an encoding
4615       * declaration; we simply have to look for "&lt;?xml" in various
4616       * encodings.
4617       * <p>This method has no way to distinguish among 8-bit encodings.
4618       * Instead, it sets up for UTF-8, then (possibly) revises its assumption
4619       * later in setupDecoding ().  Any ASCII-derived 8-bit encoding
4620       * should work, but most will be rejected later by setupDecoding ().
4621       * @see #tryEncoding (byte[], byte, byte, byte, byte)
4622       * @see #tryEncoding (byte[], byte, byte)
4623       * @see #setupDecoding
4624       */
4625      private void detectEncoding()
4626        throws SAXException, IOException
4627      {
4628        byte[] signature = new byte[4];
4629    
4630        // Read the first four bytes for
4631        // autodetection.
4632        is.mark(4);
4633        is.read(signature);
4634        is.reset();
4635    
4636        //
4637        // FIRST:  four byte encodings (who uses these?)
4638        //
4639        if (tryEncoding(signature, (byte) 0x00, (byte) 0x00,
4640                        (byte) 0x00, (byte) 0x3c))
4641          {
4642            // UCS-4 must begin with "<?xml"
4643            // 0x00 0x00 0x00 0x3c: UCS-4, big-endian (1234)
4644            // "UTF-32BE"
4645            draconianInputStreamReader("UTF-32BE", is, false);
4646          }
4647        else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00,
4648                             (byte) 0x00, (byte) 0x00))
4649          {
4650            // 0x3c 0x00 0x00 0x00: UCS-4, little-endian (4321)
4651            // "UTF-32LE"
4652            draconianInputStreamReader("UTF-32LE", is, false);
4653          }
4654        else if (tryEncoding(signature, (byte) 0x00, (byte) 0x00,
4655                             (byte) 0x3c, (byte) 0x00))
4656          {
4657            // 0x00 0x00 0x3c 0x00: UCS-4, unusual (2143)
4658            fatal("Unsupported 32-bit encoding. (XML processors are only required to support UTF-8 and UTF-16.)"); // 2006-02-03 hsivonen
4659          }
4660        else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c,
4661                             (byte) 0x00, (byte) 0x00))
4662          {
4663            // 0x00 0x3c 0x00 0x00: UCS-4, unusual (3421)
4664            fatal("Unsupported 32-bit encoding. (XML processors are only required to support UTF-8 and UTF-16.)"); // 2006-02-03 hsivonen
4665          }
4666        else if (tryEncoding(signature, (byte) 0x00, (byte) 0x00,
4667                (byte) 0xfe, (byte) 0xff))
4668          {
4669            // 00 00 fe ff UCS_4_1234 (with BOM)
4670            is.read(); is.read(); is.read(); is.read();
4671            draconianInputStreamReader("UTF-32BE", is, false, "UTF-32");        
4672          }
4673        else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c,
4674                (byte) 0x00, (byte) 0x00))
4675          {
4676            // ff fe 00 00 UCS_4_4321 (with BOM)
4677            is.read(); is.read(); is.read(); is.read();
4678            draconianInputStreamReader("UTF-32LE", is, false, "UTF-32");        
4679          }
4680        // SECOND:  two byte encodings
4681        // note ... with 1/14/2000 errata the XML spec identifies some
4682        // more "broken UTF-16" autodetection cases, with no XML decl,
4683        // which we don't handle here (that's legal too).
4684        //
4685        else if (tryEncoding(signature, (byte) 0xfe, (byte) 0xff))
4686          {
4687            // UCS-2 with a byte-order marker. (UTF-16)
4688            // 0xfe 0xff: UCS-2, big-endian (12)
4689            is.read(); is.read();
4690            draconianInputStreamReader("UTF-16BE", is, false, "UTF-16");        
4691          }
4692        else if (tryEncoding(signature, (byte) 0xff, (byte) 0xfe))
4693          {
4694            // UCS-2 with a byte-order marker. (UTF-16)
4695            // 0xff 0xfe: UCS-2, little-endian (21)
4696            is.read(); is.read();
4697            draconianInputStreamReader("UTF-16LE", is, false, "UTF-16");        
4698          }
4699        else if (tryEncoding(signature, (byte) 0x00, (byte) 0x3c,
4700                             (byte) 0x00, (byte) 0x3f))
4701          {
4702            // UTF-16BE (otherwise, malformed UTF-16)
4703            // 0x00 0x3c 0x00 0x3f: UCS-2, big-endian, no byte-order mark
4704            fatal("no byte-order mark for UTF-16 entity"); // s/UCS-2/UTF-16/ -- 2006-02-03 hsivonen
4705          }
4706        else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x00,
4707                             (byte) 0x3f, (byte) 0x00))
4708          {
4709            // UTF-16LE (otherwise, malformed UTF-16)
4710            // 0x3c 0x00 0x3f 0x00: UCS-2, little-endian, no byte-order mark
4711            fatal("no byte-order mark for UTF-16 entity"); // s/UCS-2/UTF-16/ -- 2006-02-03 hsivonen
4712          }
4713        //
4714        // THIRD: EBCDIC
4715        //
4716        else if (tryEncoding(signature, (byte) 0x4c, (byte) 0x6f,
4717                (byte) 0xa7, (byte) 0x94))
4718         {
4719            // 4c 6f a7 94 ... we don't understand EBCDIC flavors
4720            fatal("Unsupported EBCDIC encoding. (XML processors are only required to support UTF-8 and UTF-16.)");       
4721         }
4722        //
4723        // FOURTH:  ASCII-derived encodings, fixed and variable lengths
4724        //
4725        else if (tryEncoding(signature, (byte) 0x3c, (byte) 0x3f,
4726                             (byte) 0x78, (byte) 0x6d))
4727          {
4728            // ASCII derived
4729            // 0x3c 0x3f 0x78 0x6d: UTF-8 or other 8-bit markup (read ENCODING)
4730            characterEncoding = null;
4731            prefetchASCIIEncodingDecl();
4732          }
4733        else if (signature[0] == (byte) 0xef
4734                 && signature[1] == (byte) 0xbb
4735                 && signature[2] == (byte) 0xbf)
4736          {
4737            // 0xef 0xbb 0xbf: UTF-8 BOM (not part of document text)
4738            // this un-needed notion slipped into XML 2nd ed through a
4739            // "non-normative" erratum; now required by MSFT and UDDI,
4740            // and E22 made it normative.
4741            is.read(); is.read(); is.read();
4742            draconianInputStreamReader("UTF-8", is, false);        
4743          }
4744        else
4745          {        
4746            // (default) UTF-8 without encoding/XML declaration
4747            draconianInputStreamReader("UTF-8", is, false);        
4748          }
4749      }
4750    
4751      /**
4752       * Check for a four-byte signature.
4753       * <p>Utility routine for detectEncoding ().
4754       * <p>Always looks for some part of "<?XML" in a specific encoding.
4755       * @param sig The first four bytes read.
4756       * @param b1 The first byte of the signature
4757       * @param b2 The second byte of the signature
4758       * @param b3 The third byte of the signature
4759       * @param b4 The fourth byte of the signature
4760       * @see #detectEncoding
4761       */
4762      private static boolean tryEncoding(byte[] sig, byte b1, byte b2,
4763                                         byte b3, byte b4)
4764      {
4765        return (sig[0] == b1 && sig[1] == b2
4766                && sig[2] == b3 && sig[3] == b4);
4767      }
4768    
4769      /**
4770       * Check for a two-byte signature.
4771       * <p>Looks for a UCS-2 byte-order mark.
4772       * <p>Utility routine for detectEncoding ().
4773       * @param sig The first four bytes read.
4774       * @param b1 The first byte of the signature
4775       * @param b2 The second byte of the signature
4776       * @see #detectEncoding
4777       */
4778      private static boolean tryEncoding(byte[] sig, byte b1, byte b2)
4779      {
4780        return ((sig[0] == b1) && (sig[1] == b2));
4781      }
4782    
4783      /**
4784       * This method pushes a string back onto input.
4785       * <p>It is useful either as the expansion of an internal entity, 
4786       * or for backtracking during the parse.
4787       * <p>Call pushCharArray () to do the actual work.
4788       * @param s The string to push back onto input.
4789       * @see #pushCharArray
4790       */
4791      private void pushString(String ename, String s)
4792        throws SAXException
4793      {
4794        char[] ch = s.toCharArray();
4795        pushCharArray(ename, ch, 0, ch.length);
4796      }
4797    
4798      /**
4799       * Push a new internal input source.
4800       * <p>This method is useful for expanding an internal entity,
4801       * or for unreading a string of characters.  It creates a new
4802       * readBuffer containing the characters in the array, instead
4803       * of characters converted from an input byte stream.
4804       * @param ch The char array to push.
4805       * @see #pushString
4806       * @see #pushURL
4807       * @see #readBuffer
4808       * @see #sourceType
4809       * @see #pushInput
4810       */
4811      private void pushCharArray(String ename, char[] ch, int start, int length)
4812        throws SAXException
4813      {
4814        // Push the existing status
4815        pushInput(ename);
4816        if (ename != null && doReport)
4817          {
4818            dataBufferFlush();
4819            handler.startInternalEntity(ename);
4820          }
4821        sourceType = INPUT_INTERNAL;
4822        readBuffer = ch;
4823        readBufferPos = start;
4824        readBufferLength = length;
4825        readBufferOverflow = -1;
4826      }
4827    
4828      /**
4829       * Save the current input source onto the stack.
4830       * <p>This method saves all of the global variables associated with
4831       * the current input source, so that they can be restored when a new
4832       * input source has finished.  It also tests for entity recursion.
4833       * <p>The method saves the following global variables onto a stack
4834       * using a fixed-length array:
4835       * <ol>
4836       * <li>sourceType
4837       * <li>externalEntity
4838       * <li>readBuffer
4839       * <li>readBufferPos
4840       * <li>readBufferLength
4841       * <li>line
4842       * <li>characterEncoding
4843       * </ol>
4844       * @param ename The name of the entity (if any) causing the new input.
4845       * @see #popInput
4846       * @see #sourceType
4847       * @see #externalEntity
4848       * @see #readBuffer
4849       * @see #readBufferPos
4850       * @see #readBufferLength
4851       * @see #line
4852       * @see #characterEncoding
4853       */
4854      private void pushInput(String ename)
4855        throws SAXException
4856      {
4857        // Check for entity recursion.
4858        if (ename != null)
4859          {
4860            Iterator<String> entities = entityStack.iterator();
4861            while (entities.hasNext())
4862              {
4863                String e = entities.next();
4864                if (e != null && e == ename)
4865                  {
4866                    fatal("recursive reference to entity", ename, null);
4867                  }
4868              }
4869          }
4870        entityStack.addLast(ename);
4871        
4872        // Don't bother if there is no current input.
4873        if (sourceType == INPUT_NONE)
4874          {
4875            return;
4876          }
4877        
4878        // Set up a snapshot of the current
4879        // input source.
4880        Input input = new Input();
4881    
4882        input.sourceType = sourceType;
4883        input.readBuffer = readBuffer;
4884        input.readBufferPos = readBufferPos;
4885        input.readBufferLength = readBufferLength;
4886        input.line = line;
4887        input.charecterEncoding = characterEncoding;
4888        input.readBufferOverflow = readBufferOverflow;
4889        input.is = is;
4890        input.currentByteCount = currentByteCount;
4891        input.column = column;
4892        input.reader = reader;
4893        input.prev = prev;
4894        input.normalizationChecker = normalizationChecker;
4895        
4896        // Push it onto the stack.
4897        inputStack.addLast(input);
4898      }
4899    
4900      /**
4901       * Restore a previous input source.
4902       * <p>This method restores all of the global variables associated with
4903       * the current input source.
4904       * @exception java.io.EOFException
4905       *    If there are no more entries on the input stack.
4906       * @see #pushInput
4907       * @see #sourceType
4908       * @see #readBuffer
4909       * @see #readBufferPos
4910       * @see #readBufferLength
4911       * @see #line
4912       * @see #characterEncoding
4913       */
4914      private void popInput()
4915        throws SAXException, IOException
4916      {
4917        String ename = entityStack.removeLast();
4918    
4919        if (ename != null && doReport)
4920          {
4921            dataBufferFlush();
4922          }
4923        switch (sourceType)
4924          {
4925          case INPUT_READER:
4926            handler.endExternalEntity(ename);
4927            reader.close();
4928            break;
4929          case INPUT_INTERNAL:
4930            if (ename != null && doReport)
4931              {
4932                handler.endInternalEntity(ename);
4933              }
4934            break;
4935          }
4936    
4937        if (normalizationChecker != null) 
4938          {
4939            normalizationChecker.flush();
4940          }
4941        
4942        // Throw an EOFException if there
4943        // is nothing else to pop.
4944        if (inputStack.isEmpty())
4945          {
4946            throw new EOFException("no more input");
4947          }
4948    
4949        Input input = inputStack.removeLast();
4950    
4951        sourceType = input.sourceType;
4952        readBuffer = input.readBuffer;
4953        readBufferPos = input.readBufferPos;
4954        readBufferLength = input.readBufferLength;
4955        line = input.line;
4956        characterEncoding = input.charecterEncoding;
4957        readBufferOverflow = input.readBufferOverflow;
4958        is = input.is;
4959        currentByteCount = input.currentByteCount;
4960        column = input.column;
4961        reader = input.reader;
4962        prev = input.prev;
4963        normalizationChecker = input.normalizationChecker;
4964      }
4965      
4966      /**
4967       * Return true if we can read the expected character.
4968       * <p>Note that the character will be removed from the input stream
4969       * on success, but will be put back on failure.  Do not attempt to
4970       * read the character again if the method succeeds.
4971       * @param delim The character that should appear next.  For a
4972       *        insensitive match, you must supply this in upper-case.
4973       * @return true if the character was successfully read, or false if
4974       *   it was not.
4975       * @see #tryRead (String)
4976       */
4977      private boolean tryRead(char delim)
4978        throws SAXException, IOException
4979      {
4980        char c;
4981        
4982        // Read the character
4983        c = readCh();
4984    
4985        // Test for a match, and push the character
4986        // back if the match fails.
4987        if (c == delim)
4988          {
4989            return true;
4990          }
4991        else
4992          {
4993            unread(c);
4994            return false;
4995          }
4996      }
4997    
4998      /**
4999       * Return true if we can read the expected string.
5000       * <p>This is simply a convenience method.
5001       * <p>Note that the string will be removed from the input stream
5002       * on success, but will be put back on failure.  Do not attempt to
5003       * read the string again if the method succeeds.
5004       * <p>This method will push back a character rather than an
5005       * array whenever possible (probably the majority of cases).
5006       * @param delim The string that should appear next.
5007       * @return true if the string was successfully read, or false if
5008       *   it was not.
5009       * @see #tryRead (char)
5010       */
5011      private boolean tryRead(String delim)
5012        throws SAXException, IOException
5013      {
5014        return tryRead(delim.toCharArray());
5015      }
5016    
5017      private boolean tryRead(char[] ch)
5018        throws SAXException, IOException
5019      {
5020        char c;
5021    
5022        // Compare the input, character-
5023        // by character.
5024        
5025        for (int i = 0; i < ch.length; i++)
5026          {
5027            c = readCh();
5028            if (c != ch[i])
5029              {
5030                unread(c);
5031                if (i != 0)
5032                  {
5033                    unread(ch, i);
5034                  }
5035                return false;
5036              }
5037          }
5038        return true;
5039      }
5040    
5041      /**
5042       * Return true if we can read some whitespace.
5043       * <p>This is simply a convenience method.
5044       * <p>This method will push back a character rather than an
5045       * array whenever possible (probably the majority of cases).
5046       * @return true if whitespace was found.
5047       */
5048      private boolean tryWhitespace()
5049        throws SAXException, IOException
5050      {
5051        char c;
5052        c = readCh();
5053        if (isWhitespace(c))
5054          {
5055            skipWhitespace();
5056            return true;
5057          }
5058        else
5059          {
5060            unread(c);
5061            return false;
5062          }
5063      }
5064    
5065      private void parseUntil(char[] delim)
5066        throws SAXException, IOException
5067      {
5068        char c;
5069        int startLine = line;
5070        
5071        try
5072          {
5073            while (!tryRead(delim))
5074              {
5075                c = readCh();
5076                dataBufferAppend(c);
5077              }
5078          }
5079        catch (EOFException e)
5080          {
5081            fatal("end of input while looking for delimiter "
5082                  + "(started on line " + startLine
5083                  + ')', null, new String(delim));
5084          }
5085      }
5086    
5087      //////////////////////////////////////////////////////////////////////
5088      // Low-level I/O.
5089      //////////////////////////////////////////////////////////////////////
5090      
5091      /**
5092       * Prefetch US-ASCII XML/text decl from input stream into read buffer.
5093       * Doesn't buffer more than absolutely needed, so that when an encoding
5094       * decl says we need to create an InputStreamReader, we can discard our
5095       * buffer and reset().  Caller knows the first chars of the decl exist
5096       * in the input stream.
5097       */
5098      private void prefetchASCIIEncodingDecl()
5099        throws SAXException, IOException
5100      {
5101        int ch;
5102        readBufferPos = readBufferLength = 0;
5103        
5104        is.mark(readBuffer.length);
5105        while (true)
5106          {
5107            ch = is.read();
5108            readBuffer[readBufferLength++] = (char) ch;
5109            switch (ch)
5110              {
5111              case (int) '>':
5112                return;
5113              case -1:
5114                fatal("file ends before end of XML or encoding declaration.",
5115                      null, "?>");
5116              }
5117            if (readBuffer.length == readBufferLength)
5118              {
5119                fatal("unfinished XML or encoding declaration");
5120              }
5121          }
5122      }
5123    
5124      /**
5125       * Read a chunk of data from an external input source.
5126       * <p>This is simply a front-end that fills the rawReadBuffer
5127       * with bytes, then calls the appropriate encoding handler.
5128       * @see #characterEncoding
5129       * @see #rawReadBuffer
5130       * @see #readBuffer
5131       * @see #filterCR
5132       * @see #copyUtf8ReadBuffer
5133       * @see #copyIso8859_1ReadBuffer
5134       * @see #copyUcs_2ReadBuffer
5135       * @see #copyUcs_4ReadBuffer
5136       */
5137      private void readDataChunk()
5138        throws SAXException, IOException
5139      {
5140        int count;
5141        
5142        // See if we have any overflow (filterCR sets for CR at end)
5143        if (readBufferOverflow > -1)
5144          {
5145            readBuffer[0] = (char) readBufferOverflow;
5146            readBufferOverflow = -1;
5147            readBufferPos = 1;
5148            sawCR = true;
5149          }
5150        else
5151          {
5152            readBufferPos = 0;
5153            sawCR = false;
5154          }
5155    
5156        try
5157          {
5158             count = reader.read(readBuffer,
5159                                 readBufferPos, READ_BUFFER_MAX - readBufferPos);
5160          }
5161        catch(CharacterCodingException cce)
5162          {
5163             // 2006-04-25 hsivonen
5164            fatal("Input data does not conform to the input encoding. The input encoding was " + characterEncoding + ".");
5165            return; // never happens
5166          }
5167        if (normalizationChecker != null && count > 0)
5168          {
5169            normalizationChecker.characters(readBuffer, readBufferPos, count);
5170          }
5171        if (count < 0)
5172          {
5173             readBufferLength = readBufferPos;
5174          }
5175        else
5176          {
5177            readBufferLength = readBufferPos + count;
5178          }
5179        if (readBufferLength > 0)
5180          {
5181            filterCR(count >= 0);
5182          }
5183        sawCR = false;
5184      }
5185      
5186      /**
5187       * Filter carriage returns in the read buffer.
5188       * CRLF becomes LF; CR becomes LF.
5189       * @param moreData true iff more data might come from the same source
5190       * @see #readDataChunk
5191       * @see #readBuffer
5192       * @see #readBufferOverflow
5193       */
5194      private void filterCR(boolean moreData)
5195      {
5196        int i, j;
5197    
5198        readBufferOverflow = -1;
5199        
5200    loop:
5201        for (i = j = readBufferPos; j < readBufferLength; i++, j++)
5202          {
5203            switch (readBuffer[j])
5204              {
5205              case '\r':
5206                if (j == readBufferLength - 1)
5207                  {
5208                    if (moreData)
5209                      {
5210                        readBufferOverflow = '\r';
5211                        readBufferLength--;
5212                      }
5213                    else   // CR at end of buffer
5214                      {
5215                        readBuffer[i++] = '\n';
5216                      }
5217                    break loop;
5218                  }
5219                else if (readBuffer[j + 1] == '\n')
5220                  {
5221                    j++;
5222                  }
5223                readBuffer[i] = '\n';
5224                break;
5225    
5226              case '\n':
5227              default:
5228                readBuffer[i] = readBuffer[j];
5229                break;
5230              }
5231          }
5232        readBufferLength = i;
5233      }
5234    
5235      private void warnAboutPrivateUseChar() throws SAXException {
5236          if (!alreadyWarnedAboutPrivateUseCharacters) {
5237              handler.warn("Document uses the Unicode Private Use Area(s), which should not be used in publicly exchanged documents. (Charmod C073)");
5238              alreadyWarnedAboutPrivateUseCharacters = true;
5239          }
5240      }
5241    
5242      // copied from fi.iki.hsivonen.htmlparser
5243    
5244      private boolean isPrivateUse(char c) {
5245          return c >= '\uE000' && c <= '\uF8FF';
5246      }
5247    
5248      private boolean isPrivateUse(int c) {
5249          return (c >= 0xE000 && c <= 0xF8FF) || (c >= 0xF0000 && c <= 0xFFFFD) || (c >= 0x100000 && c <= 0x10FFFD);
5250      }
5251      
5252      private boolean isAstralPrivateUse(int c) {
5253          return (c >= 0xF0000 && c <= 0xFFFFD) || (c >= 0x100000 && c <= 0x10FFFD);
5254      }
5255    
5256      private boolean isNonCharacter(int c) {
5257          return (c & 0xFFFE) == 0xFFFE;
5258      }
5259      
5260      //////////////////////////////////////////////////////////////////////
5261      // Local Variables.
5262      //////////////////////////////////////////////////////////////////////
5263      
5264      /**
5265       * Re-initialize the variables for each parse.
5266       */
5267      private void initializeVariables()
5268      {
5269        prev = '\u0000';
5270        // First line
5271        line = 1;
5272        column = 0;
5273        
5274        // Set up the buffers for data and names
5275        dataBufferPos = 0;
5276        dataBuffer = new char[DATA_BUFFER_INITIAL];
5277        nameBufferPos = 0;
5278        nameBuffer = new char[NAME_BUFFER_INITIAL];
5279    
5280        // Set up the DTD hash tables
5281        elementInfo = new HashMap<String, ElementDecl>();
5282        entityInfo = new HashMap<String, EntityInfo>();
5283        notationInfo = new HashMap<String, String>();
5284        skippedPE = false;
5285    
5286        // Set up the variables for the current
5287        // element context.
5288        currentElement = null;
5289        currentElementContent = CONTENT_UNDECLARED;
5290        
5291        // Set up the input variables
5292        sourceType = INPUT_NONE;
5293        inputStack = new LinkedList<Input>();
5294        entityStack = new LinkedList<String>();
5295        tagAttributePos = 0;
5296        tagAttributes = new String[100];
5297        rawReadBuffer = new byte[READ_BUFFER_MAX];
5298        readBufferOverflow = -1;
5299    
5300        inLiteral = false;
5301        expandPE = false;
5302        peIsError = false;
5303        
5304        doReport = false;
5305        
5306        inCDATA = false;
5307        
5308        symbolTable = new Object[SYMBOL_TABLE_LENGTH][];
5309        
5310        if (handler.checkNormalization) {
5311            normalizationChecker = new NormalizationChecker(true);
5312            normalizationChecker.setDocumentLocator(handler);
5313            normalizationChecker.setErrorHandler(handler.getErrorHandler());
5314        } else {
5315            normalizationChecker = null;
5316        }
5317      }
5318    
5319      static class ExternalIdentifiers
5320      {
5321    
5322        String publicId;
5323        String systemId;
5324        String baseUri;
5325    
5326        ExternalIdentifiers()
5327        {
5328        }
5329    
5330        ExternalIdentifiers(String publicId, String systemId, String baseUri)
5331        {
5332          this.publicId = publicId;
5333          this.systemId = systemId;
5334          this.baseUri = baseUri;
5335        }
5336        
5337      }
5338    
5339      static class EntityInfo
5340      {
5341    
5342        int type;
5343        ExternalIdentifiers ids;
5344        String value;
5345        String notationName;
5346        
5347      }
5348    
5349      static class AttributeDecl
5350      {
5351        
5352        String type;
5353        String value;
5354        int valueType;
5355        String enumeration;
5356        String defaultValue;
5357    
5358      }
5359    
5360      static class ElementDecl
5361      {
5362        
5363        int contentType;
5364        String contentModel;
5365        HashMap<String, AttributeDecl> attributes;
5366      
5367      }
5368     
5369      static class Input
5370      {
5371        char prev;
5372        int sourceType;
5373        char[] readBuffer;
5374        int readBufferPos;
5375        int readBufferLength;
5376        int line;
5377        String charecterEncoding;
5378        int readBufferOverflow;
5379        InputStream is;
5380        int currentByteCount;
5381        int column;
5382        Reader reader;
5383        NormalizationChecker normalizationChecker;
5384      }
5385      
5386    }
5387