001    /*
002     * Copyright (c) 2005, 2006 Henri Sivonen
003     *
004     * Permission is hereby granted, free of charge, to any person obtaining a 
005     * copy of this software and associated documentation files (the "Software"), 
006     * to deal in the Software without restriction, including without limitation 
007     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
008     * and/or sell copies of the Software, and to permit persons to whom the 
009     * Software is furnished to do so, subject to the following conditions:
010     *
011     * The above copyright notice and this permission notice shall be included in 
012     * all copies or substantial portions of the Software.
013     *
014     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
015     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
016     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
017     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
018     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
019     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
020     * DEALINGS IN THE SOFTWARE.
021     */
022    
023    package fi.iki.hsivonen.htmlparser;
024    
025    import java.io.BufferedInputStream;
026    import java.io.BufferedReader;
027    import java.io.IOException;
028    import java.io.InputStream;
029    import java.io.InputStreamReader;
030    import java.io.Reader;
031    import java.nio.charset.CharacterCodingException;
032    import java.nio.charset.Charset;
033    import java.nio.charset.CharsetDecoder;
034    import java.nio.charset.CodingErrorAction;
035    import java.nio.charset.IllegalCharsetNameException;
036    import java.nio.charset.UnsupportedCharsetException;
037    
038    import org.xml.sax.ContentHandler;
039    import org.xml.sax.DTDHandler;
040    import org.xml.sax.EntityResolver;
041    import org.xml.sax.ErrorHandler;
042    import org.xml.sax.InputSource;
043    import org.xml.sax.Locator;
044    import org.xml.sax.SAXException;
045    import org.xml.sax.SAXNotRecognizedException;
046    import org.xml.sax.SAXNotSupportedException;
047    import org.xml.sax.SAXParseException;
048    import org.xml.sax.XMLReader;
049    import org.xml.sax.helpers.DefaultHandler;
050    
051    import fi.iki.hsivonen.io.EncodingInfo;
052    import fi.iki.hsivonen.io.NonBufferingAsciiInputStreamReader;
053    import fi.iki.hsivonen.xml.AttributesImpl;
054    import fi.iki.hsivonen.xml.ContentHandlerFilter;
055    import fi.iki.hsivonen.xml.EmptyAttributes;
056    import fi.iki.hsivonen.xml.SilentDraconianErrorHandler;
057    import fi.iki.hsivonen.xml.XhtmlSaxEmitter;
058    import fi.iki.hsivonen.xml.checker.NormalizationChecker;
059    
060    /**
 * WARNING: This parser is incomplete. It does not perform tag inference yet. It does not yet perform
 * case folding for attribute values like method="POST".
063     * 
064     * @version $Id: HtmlParser.java,v 1.20 2006/11/21 10:13:24 hsivonen Exp $
065     * @author hsivonen
066     */
067    public final class HtmlParser implements XMLReader, Locator {
068            
069        private static final int CASE_MASK = (1 << 5);
070    
071        private static final int LEAD_OFFSET = 0xD800 - (0x10000 >> 10);
072    
073        private static final int SURROGATE_OFFSET = 0x10000 - (0xD800 << 10) - 0xDC00;
074    
075        private static final char[] LT = { '<' };
076    
077        private static final char[] APOS = { '\'' };
078    
079        private static final char[] OCTYPE = "octype".toCharArray();
080    
081        private static final char[] TML = "tml".toCharArray();
082    
083        private static final char[] UBLIC = "ublic".toCharArray();
084    
085        private static final int PCDATA = 0;
086    
087        private static final int SCRIPT = 1;
088    
089        private static final int STYLE = 2;
090    
091        private String publicId;
092    
093        private String systemId;
094    
095        private boolean nonWhiteSpaceAllowed;
096    
097        private int cdataState;
098    
099        private ErrorHandler eh;
100    
101        private ContentHandler ch;
102        
103        private DoctypeHandler doctypeHandler;
104    
105        private XhtmlSaxEmitter emitter;
106    
107        private Reader reader;
108    
109        private int pos;
110    
111        private int cstart;
112    
113        private char[] buf = new char[2048];
114    
115        private int bufLen;
116    
117        private int line;
118    
119        private int col;
120    
121        private boolean doctypeSeen;
122        
123        private int doctypeMode;
124        
125        private boolean html5;
126    
127        private char prev;
128    
129        private boolean wasLt;
130    
131        private char[] strBuf = new char[64];
132    
133        private int strBufLen = 0;
134    
135        private char[] attrBuf = new char[1024];
136    
137        private int attrBufLen = 0;
138    
139        private AttributesImpl attrs = new AttributesImpl();
140    
141        private char[] bmpChar = { '\u0000' };
142    
143        private char[] astralChar = { '\u0000', '\u0000' };
144    
145        private DTDHandler dtdHandler;
146    
147        private EmptyElementFilter eef;
148    
149        private TagInferenceFilter tif;
150    
151        private CharacterEncodingDeclarationFilter cedf;
152    
153        private ContentHandlerFilter pipelineLast;
154    
155        private EntityResolver entityResolver = null;
156    
157        private String encoding = null;
158    
159        private InputStream stream;
160    
161        private boolean foldedAttributeValue;
162    
163        private boolean alreadyWarnedAboutPrivateUseCharacters;
164        
165        private NormalizationChecker normalizationChecker = null;
166    
167        public HtmlParser() {
168            eef = new EmptyElementFilter();
169            tif = new TagInferenceFilter(this);
170            cedf = new CharacterEncodingDeclarationFilter(this);
171            ch = eef;
172            emitter = new XhtmlSaxEmitter(ch);
173            eef.setContentHandler(tif);
174            tif.setContentHandler(cedf);
175            pipelineLast = cedf;
176            setErrorHandler(new SilentDraconianErrorHandler());
177            setContentHandler(new DefaultHandler());
178        }
179    
180        private void clearStrBuf() {
181            strBufLen = 0;
182        }
183    
184        private void appendStrBufAsciiLowerCase(char c) throws SAXException,
185                IOException {
186            if (c >= 'A' && c <= 'Z') {
187                appendStrBuf((char) (c | CASE_MASK));
188            } else {
189                appendStrBuf(c);
190            }
191        }
192    
193        private void appendStrBuf(char c) throws SAXException, IOException {
194            if (strBufLen == strBuf.length) {
195                fatal("Identifier too long.");
196            } else {
197                strBuf[strBufLen] = c;
198                strBufLen++;
199            }
200        }
201    
202        private String strBufToString() {
203            return new String(strBuf, 0, strBufLen);
204        }
205    
206        private void clearAttrBuf() {
207            attrBufLen = 0;
208        }
209    
210        private void appendAttrBuf(char c) throws SAXException, IOException {
211            if (attrBufLen == attrBuf.length) {
212                fatal("Attribute value or other quoted string too long.");
213            } else {
214                attrBuf[attrBufLen] = c;
215                attrBufLen++;
216            }
217        }
218    
219        private void appendAttrBufAsciiLowerCase(char c) throws SAXException,
220                IOException {
221            if (c >= 'A' && c <= 'Z') {
222                appendAttrBuf((char) (c | CASE_MASK));
223            } else if (c <= '\u007F') {
224                appendAttrBuf(c);
225            } else {
226                fatal("Non-ASCII character in an attribute value that is subject to case folding.");
227            }
228        }
229    
230        /**
231         * @param cs
232         * @throws SAXException
233         */
234        private void appendAttrBuf(char[] cs) throws SAXException, IOException {
235            for (int i = 0; i < cs.length; i++) {
236                appendAttrBuf(cs[i]);
237            }
238        }
239    
240        /**
241         * @param cs
242         * @throws SAXException
243         */
244        private void appendAttrBufAsciiLowerCase(char[] cs) throws SAXException,
245                IOException {
246            for (int i = 0; i < cs.length; i++) {
247                appendAttrBufAsciiLowerCase(cs[i]);
248            }
249        }
250    
251        private String attrBufToString() {
252            return new String(attrBuf, 0, attrBufLen);
253        }
254    
255        private void parse() throws SAXException, IOException {
256            pos = -1;
257            cstart = -1;
258            line = 1;
259            col = 0;
260            doctypeSeen = false;
261            prev = '\u0000';
262            cdataState = PCDATA;
263            nonWhiteSpaceAllowed = false;
264            wasLt = false;
265            bufLen = 0;
266            html5 = false;
267            char c;
268            for (;;) {
269                if (cdataState == PCDATA) {
270                    c = nextMayEnd();
271                    if (c == '\u0000') {
272                        flushChars();
273                        if (!doctypeSeen) {
274                            err("The document did not have a doctype.");
275                        }
276                        return;
277                    } else if (c == '<') {
278                        flushChars();
279                        consumeMarkup();
280                    } else if (c == '&') {
281                        flushChars();
282                        emitter.characters(consumeCharRef());
283                    } else if (isWhiteSpace(c)) {
284                        if (nonWhiteSpaceAllowed) {
285                            if (cstart == -1) {
286                                cstart = pos;
287                            }
288                        }
289                    } else {
290                        doctypeNotOk();
291                        if (nonWhiteSpaceAllowed) {
292                            if (cstart == -1) {
293                                cstart = pos;
294                            }
295                        } else {
296                            fatal("Character data not allowed at this point.");
297                        }
298                    }
299                } else {
300                    c = next();
301                    if (c == '<') {
302                        wasLt = true;
303                        flushChars();
304                    } else if (c == '/') {
305                        if (wasLt) {
306                            consumeEndTag();
307                        } else if (cstart == -1) {
308                            cstart = pos;
309                        }
310                        wasLt = false;
311                    } else {
312                        if (wasLt) {
313                            emitter.characters(LT);
314                        }
315                        if (cstart == -1) {
316                            cstart = pos;
317                        }
318                        wasLt = false;
319                    }
320                }
321            }
322        }
323    
324        /**
325         * @throws SAXException
326         *  
327         */
328        private void doctypeNotOk() throws SAXException, IOException {
329            if (!doctypeSeen) {
330                // there was no doctype
331                err("There was no doctype.");
332                doctypeSeen = true;
333            }
334        }
335    
336        /**
337         * @param c
338         * @return
339         */
340        private boolean isWhiteSpace(char c) {
341            return (c == ' ') || (c == '\t') || (c == '\n');
342        }
343    
344        /**
345         * @throws SAXException
346         *  
347         */
348        private char[] consumeCharRef() throws SAXException, IOException {
349            char c = next();
350            if (c == '#') {
351                return consumeNCR();
352            } else if (isNameStart(c)) {
353                return consumeEntityRef(c);
354            } else {
355                // XXX should we err or continue here
356                fatal("& not followed by # or name start.");
357            }
358            throw new RuntimeException("Unreachable");
359        }
360    
361        /**
362         * @param c
363         * @throws SAXException
364         */
365        private char[] consumeEntityRef(char c) throws SAXException, IOException {
366            clearStrBuf();
367            appendStrBuf(c);
368            for (;;) {
369                c = next();
370                if (isNameChar(c)) {
371                    appendStrBuf(c);
372                } else if (c == ';') {
373                    String name = strBufToString();
374                    char[] rv = html5 ? Entities.resolve5(name) : Entities.resolve(name);
375                    if (rv == null) {
376                        if ("apos".equals(name)) {
377                            if (html5) {
378                                warn("&apos; is not supported by IE6.");
379                            } else {
380                                err("Even though there is a predefined entity called \u201Capos\u201D is XML, there is no such thing in HTML 4.01. Continuing parsing pretending that such an entity exists.");
381                            }
382                            return APOS;
383                        } else {
384                            fatal("Unknown entity \u201C" + name + "\u201D.");
385                        }
386                    }
387                    return rv;
388                } else {
389                    // XXX should we allow implicit close as in HTML4?
390                    fatal("Found a non-name character in entity reference / unterminated entity reference.");
391                }
392            }
393        }
394    
395        /**
396         * @throws SAXException
397         *  
398         */
399        private char[] consumeNCR() throws SAXException, IOException {
400            clearStrBuf();
401            int intVal = 0;
402            char c = next();
403            if (c == 'x' || c == 'X') {
404                for (int i = 0;; i++) {
405                    if (i == 6) {
406                        fatal("Hexadecimal character reference too long.");
407                    }
408                    c = next();
409                    if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f')
410                            || (c >= 'A' && c <= 'F')) {
411                        appendStrBuf(c);
412                    } else if (c == ';') {
413                        if (i == 0) {
414                            fatal("No digits in hexadecimal character reference.");
415                        }
416                        intVal = Integer.parseInt(strBufToString(), 16);
417                        break;
418                    } else {
419                        fatal("Bad character in hexadecimal character reference.");
420                    }
421                }
422            } else if (c >= '0' && c <= '9') {
423                appendStrBuf(c);
424                for (int i = 0;; i++) {
425                    if (i == 6) {
426                        fatal("Decimal character reference too long.");
427                    }
428                    c = next();
429                    if (c >= '0' && c <= '9') {
430                        appendStrBuf(c);
431                    } else if (c == ';') {
432                        intVal = Integer.parseInt(strBufToString());
433                        break;
434                    } else {
435                        fatal("Bad character in decimal character reference.");
436                    }
437                }
438            } else {
439                fatal("Bad character in numeric character reference.");
440            }
441            if ((intVal & 0xF800) == 0xD800) {
442                fatal("Character reference expands to a surrogate.");
443            } else if (intVal <= 0xFFFF) {
444                c = (char) intVal;
445                if (isForbidden(c)) {
446                    fatal("Character reference expands to a forbidden character.");
447                }
448                if (isPrivateUse(c)) {
449                    warnAboutPrivateUseChar();
450                }
451                bmpChar[0] = c;
452                return bmpChar;
453            } else if (intVal <= 0x10FFFF) {
454                // XXX astral non-characters are not banned
455                if (isNonCharacter(intVal)) {
456                    warn("Character reference expands to an astral non-character.");
457                }
458                if (isAstralPrivateUse(intVal)) {
459                    warnAboutPrivateUseChar();
460                }
461                astralChar[0] = (char) (LEAD_OFFSET + (intVal >> 10));
462                astralChar[1] = (char) (0xDC00 + (intVal & 0x3FF));
463                return astralChar;
464            } else {
465                fatal("Character reference outside the permissible Unicode range.");
466            }
467            throw new RuntimeException("Unreachable");
468        }
469    
470        /**
471         * @throws SAXException
472         *  
473         */
474        private void consumeMarkup() throws SAXException, IOException {
475            char c = next();
476            if (c == '!') {
477                consumeMarkupDecl();
478            } else if (c == '?') {
479                consumePI();
480            } else if (c == '/') {
481                consumeEndTag();
482            } else if (isNameStart(c)) {
483                consumeStartTag(c);
484            } else {
485                fatal("Found illegal character after <.");
486            }
487        }
488    
489        /**
490         * @return
491         * @throws SAXException
492         */
493        private char next() throws SAXException, IOException {
494            char c = nextMayEnd();
495            if (c == '\u0000') {
496                fatal("Unexpected end of file.");
497            }
498            return c;
499        }
500    
501        /**
502         * @param c
503         * @throws SAXException
504         */
505        private void consumeStartTag(char c) throws SAXException, IOException {
506            doctypeNotOk();
507            clearStrBuf();
508            appendStrBufAsciiLowerCase(c);
509            for (;;) {
510                c = next();
511                if (c == '>') {
512                    String gi = strBufToString();
513                    maybeBeginCdata(gi);
514                    emitter.startElement(gi,
515                            EmptyAttributes.EMPTY_ATTRIBUTES);
516                    return;
517                } else if (c == '/') {
518                    c = next();
519                    if (c == '>') {
520                        err("XML-style empty element syntax (<foo/>) is not legal in HTML. Skipping the slash.");
521                        String gi = strBufToString();
522                        maybeBeginCdata(gi);
523                        emitter.startElement(gi,
524                                EmptyAttributes.EMPTY_ATTRIBUTES);
525                        return;
526                    } else {
527                        fatal("Stray slash in start tag.");
528                    }                
529                } else if (isNameChar(c)) {
530                    appendStrBufAsciiLowerCase(c);
531                } else if (isWhiteSpace(c)) {
532                    attrs.clear();
533                    String gi = strBufToString();
534                    maybeBeginCdata(gi);
535                    c = nextAfterZeroOrMoreWhiteSpace();
536                    for (;;) {
537                        if (isNameStart(c)) {
538                            c = consumeAttribute(c);
539                        } else if (c == '/') {
540                            c = next();
541                            if (c == '>') {
542                                err("XML-style empty element syntax (<foo />) is not legal in HTML. Skipping the slash.");
543                                emitter.startElement(gi, attrs);
544                                return;
545                            } else {
546                                fatal("Stray slash in start tag.");
547                            }
548                        } else if (c == '>') {
549                            emitter.startElement(gi, attrs);
550                            return;
551                        } else {
552                            fatal("Garbage in start tag.");
553                        }
554                    }
555                } else {
556                    fatal("Illegal character in element name.");
557                }
558            }
559        }
560    
561        /**
562         * @param gi
563         */
564        private void maybeBeginCdata(String gi) {
565            if ("style".equals(gi)) {
566                cdataState = STYLE;
567                wasLt = false;
568            } else if ("script".equals(gi)) {
569                cdataState = SCRIPT;
570                wasLt = false;
571            }
572        }
573    
574        /**
575         * @param c
576         * @return
577         * @throws SAXException
578         */
579        private char consumeAttribute(char c) throws SAXException, IOException {
580            clearStrBuf();
581            appendStrBufAsciiLowerCase(c);
582            for (;;) {
583                c = next();
584                if (isNameChar(c)) {
585                    appendStrBufAsciiLowerCase(c);
586                } else {
587                    break;
588                }
589            }
590            String name = strBufToString();
591            if (name.startsWith("xml")) {
592                fatal("Attribute name must not start with \u201Cxml\u201D.");
593            }
594            if (isWhiteSpace(c)) {
595                c = nextAfterZeroOrMoreWhiteSpace();
596            }
597            if (c == '=') {
598                foldedAttributeValue = AttributeInfo.isCaseFolded(name);
599                c = nextAfterZeroOrMoreWhiteSpace();
600                clearAttrBuf();
601                if (c == '\"') {
602                    consumeQuotedAttributeValue('\"');
603                    c = next();
604                } else if (c == '\'') {
605                    consumeQuotedAttributeValue('\'');
606                    c = next();
607                } else if (isUnquotedAttributeChar(c)) {
608                    // XXX should the real definition of Name Start and Name Char be
609                    // used here?
610                    if (foldedAttributeValue) {
611                        appendAttrBufAsciiLowerCase(c);
612                    } else {
613                        appendAttrBuf(c);
614                    }
615                    for (;;) {
616                        c = next();
617                        if (isUnquotedAttributeChar(c)) {
618                            if (foldedAttributeValue) {
619                                appendAttrBufAsciiLowerCase(c);
620                            } else {
621                                appendAttrBuf(c);
622                            }
623                        } else {
624                            break;
625                        }
626                    }
627                } else {
628                    fatal("Garbage in place of attribute value. Possibly quotes missing.");
629                }
630                if (isWhiteSpace(c)) {
631                    c = nextAfterZeroOrMoreWhiteSpace();
632                }
633                if ("lang".equals(name)) {
634                    fatalIfAttributeExists("xml:lang");
635                    attrs.addAttribute("http://www.w3.org/XML/1998/namespace",
636                            "lang", "xml:lang", "CDATA", attrBufToString());
637                } else {
638                    fatalIfAttributeExists(name);
639                    attrs.addAttribute(name, attrBufToString());
640                }
641                return c;
642            } else {
643                if (!AttributeInfo.isBoolean(name)) {
644                    fatal("Cannot minimize non-boolean attributes.");
645                }
646                fatalIfAttributeExists(name);
647                attrs.addAttribute(name, name);
648                return c;
649            }
650        }
651    
652        /**
653         * @param c
654         * @return
655         */
656        private boolean isUnquotedAttributeChar(char c) {
657            return (c == '.' || c == '-' || c == '_' || (c >= '0' && c <= ':')
658                    || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'));
659        }
660    
661        /**
662         * @param c
663         * @return
664         * @throws SAXException
665         * @throws IOException
666         */
667        private void consumeQuotedAttributeValue(char delim) throws SAXException,
668                IOException {
669            char c;
670            for (;;) {
671                c = next();
672                if (c == delim) {
673                    return;
674                } else if (c == '&') {
675                    if (foldedAttributeValue) {
676                        appendAttrBufAsciiLowerCase(consumeCharRef());
677                    } else {
678                        appendAttrBuf(consumeCharRef());
679                    }
680                } else if (isWhiteSpace(c)) {
681                    appendAttrBuf(' ');
682                } else {
683                    if (foldedAttributeValue) {
684                        appendAttrBufAsciiLowerCase(c);
685                    } else {
686                        appendAttrBuf(c);
687                    }
688                }
689                // XXX is a warning called for when there is < or >?
690            }
691        }
692    
693        /**
694         * @param name
695         * @throws SAXException
696         */
697        private void fatalIfAttributeExists(String name) throws SAXException,
698                IOException {
699            if (attrs.getIndex(name) != -1) {
700                fatal("Duplicate attribute.");
701            }
702        }
703    
704        /**
705         * @param c
706         * @return
707         */
708        private boolean isNameStart(char c) {
709            return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
710        }
711    
712        /**
713         * @param c
714         * @return
715         */
716        private boolean isNameChar(char c) {
717            return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
718                    || (c >= '0' && c <= '9') || (c == '-');
719        }
720    
721        /**
722         * @throws SAXException
723         *  
724         */
725        private void consumeEndTag() throws SAXException, IOException {
726            doctypeNotOk();
727            clearStrBuf();
728            char c = next();
729            if (isNameStart(c)) {
730                appendStrBufAsciiLowerCase(c);
731            } else {
732                fatal("Element name in end tag did not start with a legal name character.");
733            }
734            for (;;) {
735                c = next();
736                if (isNameChar(c)) {
737                    appendStrBufAsciiLowerCase(c);
738                } else if (c == '>') {
739                    String gi = strBufToString();
740                    cdataStateEnd(gi);
741                    emitter.endElement(gi);
742                    return;
743                } else if (isWhiteSpace(c)) {
744                    if (nextAfterZeroOrMoreWhiteSpace() == '>') {
745                        String gi = strBufToString();
746                        cdataStateEnd(gi);
747                        emitter.endElement(gi);
748                        return;
749                    } else {
750                        fatal("Garbage in end tag.");
751                    }
752                } else {
753                    fatal("Element name in end tag contained an illegal character.");
754                }
755            }
756        }
757    
758        /**
759         * @param gi
760         * @throws SAXException
761         */
762        private void cdataStateEnd(String gi) throws SAXException, IOException {
763            if (cdataState == STYLE) {
764                if ("style".equals(gi)) {
765                    cdataState = PCDATA;
766                } else {
767                    fatal("\u201C</\u201D seen inside the style element, but the string did not constitute the start of the end tag of the element.");
768                }
769            } else if (cdataState == SCRIPT) {
770                if ("script".equals(gi)) {
771                    cdataState = PCDATA;
772                } else {
773                    fatal("\u201C</\u201D seen inside the script element, but the string did not constitute the start of the end tag of the element.");
774                }
775            }
776        }
777    
778        /**
779         * @return
780         * @throws SAXException
781         */
782        private char nextAfterZeroOrMoreWhiteSpace() throws SAXException,
783                IOException {
784            for (;;) {
785                char c = next();
786                if (!isWhiteSpace(c)) {
787                    return c;
788                }
789            }
790        }
791    
792        /**
793         * @throws SAXException
794         *  
795         */
796        private void consumePI() throws SAXException, IOException {
797            // XXX should PIs be allowed?
798            fatal("Processing instructions are not allowed.");
799        }
800    
801        /**
802         * @throws SAXException
803         *  
804         */
805        private void consumeMarkupDecl() throws SAXException, IOException {
806            char c = next();
807            if (c == '-') {
808                for (;;) {
809                    c = consumeComment();
810                    if (c == '>') {
811                        return;
812                    } else if (c != '-') {
813                        fatal("Garbage after comment.");
814                    }
815                }
816            } else if (c == 'd' || c == 'D') {
817                consumeDoctype();
818            } else if (c == '>') {
819                return;
820            } else if (c == '[') {
821                fatal("Marked sections not allowed.");
822            } else {
823                fatal("Bad character in markup declaration.");
824            }
825        }
826    
827        /**
828         * @throws SAXException
829         *  
830         */
831        private void consumeDoctype() throws SAXException, IOException {
832            if (doctypeSeen) {
833                fatal("Doctype not allowed at this point.");
834            }
835            doctypeSeen = true;
836            if (!consumeCaseInsensitiveAsciiLetterString(OCTYPE)) {
837                fatal("Expected string \u201CDOCTYPE\u201D.");
838            }
839            char c = next();
840            if (!isWhiteSpace(c)) {
841                fatal("Expected white space after \u201CDOCTYPE\u201D.");
842            }
843            c = nextAfterZeroOrMoreWhiteSpace();
844            if (!(c == 'h' || c == 'H')) {
845                fatal("Expected string \u201Chtml\u201D.");
846            }
847            if (!consumeCaseInsensitiveAsciiLetterString(TML)) {
848                fatal("Expected string \u201Chtml\u201D.");
849            }
850            c = next();
851            if (c == '>') {
852                sawHtml5Doctype();
853                return;
854            } else if (!isWhiteSpace(c)) {
855                fatal("Garbage in doctype");
856            }
857            c = nextAfterZeroOrMoreWhiteSpace();
858            if (c == '>') {
859                sawHtml5Doctype();
860                return;
861            } else if (c == 's' || c == 'S') {
862                fatal("Doctype with possibly a SYSTEM id only.");
863            } else if (c == '[') {
864                fatal("Doctype with internal subset.");
865            } else if (!(c == 'p' || c == 'P')) {
866                fatal("Expected string \u201CPUBLIC\u201D.");
867            }
868            if (!consumeCaseInsensitiveAsciiLetterString(UBLIC)) {
869                fatal("Expected string \u201CPUBLIC\u201D.");
870            }
871            c = next();
872            if (!isWhiteSpace(c)) {
873                fatal("Expected white space after \u201CPUBLIC\u201D.");
874            }
875            c = nextAfterZeroOrMoreWhiteSpace();
876            String publicId = null;
877            String systemId = null;
878            if (c == '\"') {
879                publicId = unescapedStringUntil('\"');
880            } else if (c == '\'') {
881                publicId = unescapedStringUntil('\'');
882            } else {
883                fatal("Garbage in doctype. Expected a quoted string.");
884            }
885            c = next();
886            if (c == '>') {
887                checkPublicAndSystemIds(publicId, systemId);
888                return;
889            } else if (!isWhiteSpace(c)) {
890                fatal("Expected white space or \u201C>\u201D after the public id.");
891            }
892            c = nextAfterZeroOrMoreWhiteSpace();
893            if (c == '>') {
894                checkPublicAndSystemIds(publicId, systemId);
895                return;
896            } else if (c == '\"') {
897                systemId = unescapedStringUntil('\"');
898            } else if (c == '\'') {
899                systemId = unescapedStringUntil('\'');
900            } else {
901                fatal("Garbage in doctype. Expected a quoted string or \u201C>\u201D.");
902            }
903            checkPublicAndSystemIds(publicId, systemId);
904            c = next();
905            if (c == '>') {
906                return;
907            } else if (!isWhiteSpace(c)) {
908                fatal("Expected white space or \u201C>\u201D after the system id.");
909            }
910            c = nextAfterZeroOrMoreWhiteSpace();
911            if (c == '>') {
912                return;
913            } else {
914                fatal("Garbage in doctype. Expected \u201C>\u201D.");
915            }
916        }
917    
918        /**
919         * @throws SAXException 
920         * 
921         */
922        private void sawHtml5Doctype() throws SAXException {
923            html5 = true;
924            switch (doctypeMode) {
925                case DoctypeHandler.ANY_DOCTYPE:
926                    if (doctypeHandler != null) {
927                        doctypeHandler.doctype(DoctypeHandler.DOCTYPE_HTML5);
928                    }
929                    break;
930                case DoctypeHandler.DOCTYPE_HTML5:
931                    return;
932                case DoctypeHandler.DOCTYPE_HTML401_STRICT:
933                    err("Expected an HTML 4.01 Strict document but saw the HTML5 doctype.");
934                    break;
935                case DoctypeHandler.DOCTYPE_HTML401_TRANSITIONAL:
936                    err("Expected an HTML 4.01 Transitional document but saw the HTML5 doctype.");
937                    break;
938                default:
939                    throw new RuntimeException("Bug in HtmlParser: doctypeMode out of range.");
940            }
941        }
942    
943        /**
944         * @param publicId
945         * @param systemId
946         * @throws SAXException
947         */
948        private void checkPublicAndSystemIds(String publicId, String systemId)
949                throws SAXException, IOException {
950            if ("-//W3C//DTD HTML 4.01//EN".equals(publicId)) {
951                switch (doctypeMode) {
952                    case DoctypeHandler.ANY_DOCTYPE:
953                        if (doctypeHandler != null) {
954                            doctypeHandler.doctype(DoctypeHandler.DOCTYPE_HTML401_STRICT);
955                        }
956                        break;
957                    case DoctypeHandler.DOCTYPE_HTML401_STRICT:
958                        return;
959                    case DoctypeHandler.DOCTYPE_HTML5:
960                        err("Expected an HTML5 document but saw an HTML 4.01 Strict doctype.");
961                        break;
962                    case DoctypeHandler.DOCTYPE_HTML401_TRANSITIONAL:
963                        err("Expected HTML 4.01 Transitional document but saw an HTML 4.01 Strict doctype.");
964                        break;
965                    default:
966                        throw new RuntimeException("Bug in HtmlParser: doctypeMode out of range.");
967                }
968                if (systemId == null) {
969                    // XXX err, because HTML 4.01 says "must"?
970                    warn("The Strict doctype lacks the system id (URI). This kind of Strict doctype is considered quirky by Mac IE 5. The preferred non-quirky form (also required by the HTML 4.01 specification) is \u201C<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">\u201D.");
971                } else if (!"http://www.w3.org/TR/html4/strict.dtd".equals(systemId)) {
972                    // XXX err, because HTML 4.01 says "must"?
973                    warn("The Strict doctype has a non-canonical system id (URI). The form required by the HTML 4.01 specification is \u201C<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">\u201D.");
974                }
975            } else if ("-//W3C//DTD HTML 4.01 Transitional//EN".equals(publicId)) {
976                switch (doctypeMode) {
977                    case DoctypeHandler.ANY_DOCTYPE:
978                        if (doctypeHandler != null) {
979                            doctypeHandler.doctype(DoctypeHandler.DOCTYPE_HTML401_TRANSITIONAL);
980                        }
981                        break;
982                    case DoctypeHandler.DOCTYPE_HTML401_TRANSITIONAL:
983                        return;
984                    case DoctypeHandler.DOCTYPE_HTML401_STRICT:
985                        err("Expected an HTML 4.01 Strict document but saw an HTML 4.01 Transitional doctype.");
986                        break;
987                    case DoctypeHandler.DOCTYPE_HTML5:
988                        err("Expected an HTML5 document but saw an HTML 4.01 Transitional doctype.");
989                        break;
990                    default:
991                        throw new RuntimeException("Bug in HtmlParser: doctypeMode out of range.");
992                }
993                if (systemId == null) {
994                    err("The Transitional doctype lacks the system id (URI). This kind of Transitional doctype is considered quirky by browsers. The preferred non-quirky form (also required by the HTML 4.01 specification) is \u201C<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">\u201D.");
995                } else if (!"http://www.w3.org/TR/html4/loose.dtd".equals(systemId)) {
996                    // XXX err, because HTML 4.01 says "must"?
997                    warn("The Transitional doctype has a non-canonical system id (URI). This kind of Transitional doctype may be considered quirky by some legacy browsers. The preferred non-quirky form (also required by the HTML 4.01 specification) is \u201C<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">\u201D.");
998                }
999            } else if (publicId.startsWith("-//W3C//DTD XHTML ")) {
1000                fatal("XHTML public id seen. XHTML documents are not conforming HTML5 or HTML 4.01 documents.");
1001            } else {
1002                err("Legacy doctype or inappropriate doctype. This parser is designed for HTML5 and also supports the HTML5-like subset of HTML 4.01.");
1003            }
1004        }
1005    
1006        /**
1007         * @param c
1008         * @return
1009         * @throws SAXException
1010         */
1011        private String unescapedStringUntil(char delim) throws SAXException,
1012                IOException {
1013            clearAttrBuf();
1014            for (;;) {
1015                char c = next();
1016                if (c == delim) {
1017                    return attrBufToString();
1018                } else {
1019                    appendAttrBuf(c);
1020                }
1021            }
1022        }
1023    
1024        private boolean consumeCaseInsensitiveAsciiLetterString(char[] str)
1025                throws SAXException, IOException {
1026            for (int i = 0; i < str.length; i++) {
1027                if (!((next() | CASE_MASK) == str[i])) {
1028                    return false;
1029                }
1030            }
1031            return true;
1032        }
1033    
1034        /**
1035         * @throws SAXException
1036         *  
1037         */
1038        private char consumeComment() throws SAXException, IOException {
1039            doctypeNotOk();
1040            char c = next();
1041            if (c != '-') {
1042                fatal("Malformed comment.");
1043            }
1044            boolean prevWasHyphen = false;
1045            for (;;) {
1046                c = next();
1047                if (c == '-') {
1048                    if (prevWasHyphen) {
1049                        return nextAfterZeroOrMoreWhiteSpace();
1050                    } else {
1051                        prevWasHyphen = true;
1052                    }
1053                } else {
1054                    prevWasHyphen = false;
1055                }
1056            }
1057        }
1058    
1059        private char nextMayEnd() throws SAXException, IOException {
1060            pos++;
1061            col++;
1062            if (pos == bufLen) {
1063                boolean charDataContinuation = false;
1064                if (cstart > -1) {
1065                    flushChars();
1066                    charDataContinuation = false;
1067                }
1068                try {
1069                    bufLen = reader.read(buf);
1070                } catch (CharacterCodingException cce) {
1071                    fatal("Input data does not conform to the input encoding.");
1072                }
1073                if (bufLen == -1) {
1074                    return '\u0000';
1075                } else if (normalizationChecker != null) {
1076                    normalizationChecker.characters(buf, 0, bufLen);
1077                }
1078                if (charDataContinuation) {
1079                    cstart = 0;                
1080                }
1081                pos = 0;
1082            }
1083            char c = buf[pos];
1084            if ((c & 0xFC00) == 0xDC00) {
1085                // Got a low surrogate. See if prev was high surrogate
1086                if ((prev & 0xFC00) == 0xD800) {
1087                    int intVal = (prev << 10) + c + SURROGATE_OFFSET;
1088                    if (isNonCharacter(intVal)) {
1089                        warn("Astral non-character.");
1090                    }
1091                    if (isAstralPrivateUse(intVal)) {
1092                        warnAboutPrivateUseChar();
1093                    }
1094                } else {
1095                    fatal("Unmatched low surrogate.");
1096                }
1097                prev = c;
1098            } else {
1099                // see if there was a lone high surrogate
1100                if ((prev & 0xFC00) == 0xD800) {
1101                    fatal("Unmatched high surrogate.");
1102                }
1103                if (isForbidden(c)) {
1104                    fatal("Forbidden character.");
1105                } else if (c == '\r') {
1106                    prev = '\r';
1107                    c = buf[pos] = '\n';
1108                    line++;
1109                    col = 0;
1110                } else if (c == '\n') {
1111                    if (prev != '\r') {
1112                        prev = c;
1113                        line++;
1114                        col = 0;
1115                    } else {
1116                        prev = c;
1117                        // swallow the LF
1118                        col = 0;
1119                        int tmpCstart = cstart;
1120                        flushChars();
1121                        if (tmpCstart != -1) {
1122                            cstart = pos + 1;
1123                        }
1124                        return nextMayEnd();
1125                    }
1126                } else if (isPrivateUse(c)) {
1127                    warnAboutPrivateUseChar();
1128                }
1129            }
1130            return c;
1131        }
1132    
1133        private void warnAboutPrivateUseChar() throws SAXException {
1134            if (!alreadyWarnedAboutPrivateUseCharacters) {
1135                warn("Document uses the Unicode Private Use Area(s), which should not be used in publicly exchanged documents. (Charmod C073)");
1136                alreadyWarnedAboutPrivateUseCharacters = true;
1137            }
1138        }
1139    
1140        private boolean isPrivateUse(char c) {
1141            return c >= '\uE000' && c <= '\uF8FF';
1142        }
1143    
1144        private boolean isAstralPrivateUse(int c) {
1145            return (c >= 0xF0000 && c <= 0xFFFFD) || (c >= 0x100000 && c <= 0x10FFFD);
1146        }
1147        
1148        /**
1149         * @param intVal
1150         * @return
1151         */
1152        private boolean isNonCharacter(int c) {
1153            return (c & 0xFFFE) == 0xFFFE;
1154        }
1155    
1156        /**
1157         * @param c
1158         * @return
1159         */
1160        private boolean isForbidden(char c) {
1161            return !(c == '\t' || c == '\n' || c == '\r'
1162                    || (c >= '\u0020' && c < '\u007F')
1163                    || (c >= '\u00A0' && c < '\uFDD0') || (c > '\uFDDF' && c <= '\uFFFD'));
1164        }
1165    
1166        /**
1167         * @throws SAXException
1168         *  
1169         */
1170        private void flushChars() throws SAXException, IOException {
1171            if (nonWhiteSpaceAllowed) {
1172                if (cstart > -1) {
1173                    if (pos > cstart) {
1174                        ch.characters(buf, cstart, pos - cstart);
1175                    }
1176                }
1177                cstart = -1;
1178            }
1179        }
1180    
1181        /**
1182         * @throws SAXException
1183         * @throws SAXParseException
1184         */
1185        private void fatal(String message) throws SAXException {
1186            SAXParseException spe = new SAXParseException(message, this);
1187            eh.fatalError(spe);
1188            throw spe;
1189        }
1190    
1191        /**
1192         * @param string
1193         * @throws SAXException
1194         */
1195        private void err(String message) throws SAXException {
1196            SAXParseException spe = new SAXParseException(message, this);
1197            eh.error(spe);
1198        }
1199    
1200        /**
1201         * @param string
1202         * @throws SAXException
1203         */
1204        private void warn(String message) throws SAXException {
1205            SAXParseException spe = new SAXParseException(message, this);
1206            eh.warning(spe);
1207        }
1208    
1209        /**
1210         * @see org.xml.sax.Locator#getPublicId()
1211         */
1212        public String getPublicId() {
1213            return publicId;
1214        }
1215    
1216        /**
1217         * @see org.xml.sax.Locator#getSystemId()
1218         */
1219        public String getSystemId() {
1220            return systemId;
1221        }
1222    
1223        /**
1224         * @see org.xml.sax.Locator#getLineNumber()
1225         */
1226        public int getLineNumber() {
1227            return line;
1228        }
1229    
1230        /**
1231         * @see org.xml.sax.Locator#getColumnNumber()
1232         */
1233        public int getColumnNumber() {
1234            return col;
1235        }
1236    
1237        /**
1238         * @see org.xml.sax.XMLReader#getFeature(java.lang.String)
1239         */
1240        public boolean getFeature(String key) throws SAXNotRecognizedException,
1241                SAXNotSupportedException {
1242            if ("http://xml.org/sax/features/namespaces".equals(key)) {
1243                return true;
1244            } else if ("http://xml.org/sax/features/namespace-prefixes".equals(key)) {
1245                return false;
1246            } else if ("http://hsivonen.iki.fi/checkers/nfc/".equals(key)) {
1247                return normalizationChecker != null;
1248            } else {
1249                throw new SAXNotRecognizedException(key);
1250            }
1251        }
1252    
1253        /**
1254         * @see org.xml.sax.XMLReader#setFeature(java.lang.String, boolean)
1255         */
1256        public void setFeature(String key, boolean value)
1257                throws SAXNotRecognizedException, SAXNotSupportedException {
1258            if ("http://xml.org/sax/features/namespaces".equals(key)) {
1259                if (!value) {
1260                    throw new SAXNotSupportedException(
1261                            "Cannot turn off namespace support.");
1262                }
1263            } else if ("http://xml.org/sax/features/namespace-prefixes".equals(key)) {
1264                if (value) {
1265                    throw new SAXNotSupportedException("Cannot turn on prefixing.");
1266                }
1267            } else if ("http://hsivonen.iki.fi/checkers/nfc/".equals(key)) {
1268                if (value) {
1269                    if (normalizationChecker == null) {
1270                        normalizationChecker = new NormalizationChecker(true);
1271                        normalizationChecker.setDocumentLocator(this);
1272                        normalizationChecker.setErrorHandler(getErrorHandler());
1273                    }
1274                } else {
1275                    normalizationChecker = null;
1276                }
1277            } else {
1278                throw new SAXNotRecognizedException(key);
1279            }
1280        }
1281    
1282        /**
1283         * @see org.xml.sax.XMLReader#getProperty(java.lang.String)
1284         */
1285        public Object getProperty(String key) throws SAXNotRecognizedException,
1286                SAXNotSupportedException {
1287            throw new SAXNotRecognizedException(key);
1288        }
1289    
1290        /**
1291         * @see org.xml.sax.XMLReader#setProperty(java.lang.String,
1292         *      java.lang.Object)
1293         */
1294        public void setProperty(String key, Object value)
1295                throws SAXNotRecognizedException, SAXNotSupportedException {
1296            throw new SAXNotRecognizedException(key);
1297        }
1298    
1299        /**
1300         * @see org.xml.sax.XMLReader#setEntityResolver(org.xml.sax.EntityResolver)
1301         */
1302        public void setEntityResolver(EntityResolver entityResolver) {
1303            this.entityResolver = entityResolver;
1304        }
1305    
1306        /**
1307         * @see org.xml.sax.XMLReader#getEntityResolver()
1308         */
1309        public EntityResolver getEntityResolver() {
1310            return entityResolver;
1311        }
1312    
1313        /**
1314         * @see org.xml.sax.XMLReader#setDTDHandler(org.xml.sax.DTDHandler)
1315         */
1316        public void setDTDHandler(DTDHandler handler) {
1317            dtdHandler = handler;
1318        }
1319    
1320        /**
1321         * @see org.xml.sax.XMLReader#getDTDHandler()
1322         */
1323        public DTDHandler getDTDHandler() {
1324            return dtdHandler;
1325        }
1326    
1327        /**
1328         * @see org.xml.sax.XMLReader#setContentHandler(org.xml.sax.ContentHandler)
1329         */
1330        public void setContentHandler(ContentHandler ch) {
1331            pipelineLast.setContentHandler(ch);
1332        }
1333    
1334        /**
1335         * @see org.xml.sax.XMLReader#getContentHandler()
1336         */
1337        public ContentHandler getContentHandler() {
1338            return pipelineLast.getContentHandler();
1339        }
1340    
1341        /**
1342         * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
1343         */
1344        public void setErrorHandler(ErrorHandler eh) {
1345            this.eh = eh;
1346            eef.setErrorHandler(eh);
1347            tif.setErrorHandler(eh);
1348            cedf.setErrorHandler(eh);
1349        }
1350    
1351        /**
1352         * @see org.xml.sax.XMLReader#getErrorHandler()
1353         */
1354        public ErrorHandler getErrorHandler() {
1355            return eh;
1356        }
1357    
1358        /**
1359         * @see org.xml.sax.XMLReader#parse(org.xml.sax.InputSource)
1360         */
1361        public void parse(InputSource is) throws IOException, SAXException {
1362            alreadyWarnedAboutPrivateUseCharacters = false;
1363            reader = null;
1364            stream = null;
1365            systemId = null;
1366            publicId = null;
1367            encoding = null;
1368            reader = is.getCharacterStream();
1369            systemId = is.getSystemId();
1370            publicId = is.getPublicId();
1371            encoding = is.getEncoding();
1372            try {
1373                streamSetup(is);
1374                ch.setDocumentLocator(this);
1375                try {
1376                    ch.startDocument();
1377                    parse();
1378                    tif.flushStack();
1379                } finally {
1380                    ch.endDocument();
1381                }
1382            } finally {
1383                if (stream != null) {
1384                    stream.close();
1385                }
1386                if (reader != null) {
1387                    reader.close();
1388                }
1389                reader = null;
1390                stream = null;
1391            }
1392        }
1393    
1394        /**
1395         * @param is
1396         * @param swallowBom
1397         * @throws IOException
1398         * @throws SAXException
1399         */
1400        private void streamSetup(InputSource is) throws SAXException, IOException {
1401            boolean swallowBom = true;
1402            if (reader == null) {
1403                stream = is.getByteStream();
1404                if (stream == null) {
1405                    throw new IllegalArgumentException(
1406                            "InputSource had neither a character stream nor a byt stream.");
1407                }
1408                if (encoding == null) {
1409                    // sniff BOM
1410                    if (!stream.markSupported()) {
1411                        stream = new BufferedInputStream(stream);
1412                    }
1413                    stream.mark(1);
1414                    int b = stream.read();
1415                    if (b == -1) {
1416                        throw new IOException("Premature end of file.");
1417                    } else if (b == 0xFE) {
1418                        // first byte big endian
1419                        b = stream.read();
1420                        if (b == -1) {
1421                            throw new IOException("Premature end of file.");
1422                        } else if (b == 0xFF) {
1423                            swallowBom = false;
1424                            encoding = "UTF-16";
1425                            reader = draconianInputStreamReader("UTF-16BE", stream, false);
1426                        } else {
1427                            cannotDetermineEncoding();
1428                        }
1429                    } else if (b == 0xFF) {
1430                        // first byte little endian
1431                        b = stream.read();
1432                        if (b == -1) {
1433                            throw new IOException("Premature end of file.");
1434                        } else if (b == 0xFE) {
1435                            swallowBom = false;
1436                            encoding = "UTF-16";
1437                            reader = draconianInputStreamReader("UTF-16LE", stream, false);
1438                        } else {
1439                            cannotDetermineEncoding();
1440                        }
1441                    } else if (b == 0xEF) {
1442                        // first byte UTF-8
1443                        b = stream.read();
1444                        if (b == -1) {
1445                            throw new IOException("Premature end of file.");
1446                        } else if (b == 0xBB) {
1447                            b = stream.read();
1448                            if (b == -1) {
1449                                throw new IOException("Premature end of file.");
1450                            } else if (b == 0xBF) {
1451                                swallowBom = false;
1452                                encoding = "UTF-8";
1453                                reader = draconianInputStreamReader("UTF-8", stream, false);
1454                            } else {
1455                                cannotDetermineEncoding();
1456                            }
1457                        } else {
1458                            cannotDetermineEncoding();
1459                        }
1460                    } else if (b < 0x80) {
1461                        // no BOM
1462                        swallowBom = false;
1463                        stream.reset();
1464                        reader = new NonBufferingAsciiInputStreamReader(stream);
1465                    } else {
1466                        cannotDetermineEncoding();
1467                    }
1468                } else {
1469                    reader = draconianInputStreamReader(encoding, stream, false);
1470                    if ("UTF-16BE".equalsIgnoreCase(encoding)
1471                            || "UTF-16LE".equalsIgnoreCase(encoding)
1472                            || "UTF-32BE".equalsIgnoreCase(encoding)
1473                            || "UTF-32LE".equalsIgnoreCase(encoding)) {
1474                        swallowBom = false;
1475                    }
1476                }
1477            }
1478            if (swallowBom) {
1479                // won't happen if charecter encoding not determined yet
1480                if (!reader.markSupported()) {
1481                    reader = new BufferedReader(reader);
1482                }
1483                reader.mark(1);
1484                int c = reader.read();
1485                if (c != 0xFEFF) {
1486                    reader.reset();
1487                }
1488            }
1489        }
1490    
1491        /**
1492         * @throws IOException
1493         *  
1494         */
1495        private void cannotDetermineEncoding() throws IOException {
1496            throw new IOException(
1497                    "Unable to determine the character encoding of the document. No external encoding information was provided and the first byte was not an ASCII byte but did not constitute a part of the Byte Order Mark.");
1498        }
1499    
1500        /**
1501         *  
1502         */
1503        private Reader draconianInputStreamReader(String encoding,
1504                InputStream stream, boolean requireAsciiSuperset) throws SAXException {
1505            encoding = encoding.toUpperCase();
1506            try {
1507                Charset cs = Charset.forName(encoding);
1508                String canonName = cs.name();
1509                if (requireAsciiSuperset) {
1510                    if (!EncodingInfo.isAsciiSuperset(canonName)) {
1511                        fatal("The encoding \u201C"
1512                                + encoding
1513                                + "\u201D is not an ASCII superset and, therefore, cannot be used in an internal encoding declaration.");
1514                    }
1515                }
1516                if (canonName.startsWith("X-") || canonName.startsWith("x-")
1517                        || canonName.startsWith("Mac")) {
1518                    if (encoding.startsWith("X-")) {
1519                        err(encoding
1520                                + " is not an IANA-registered encoding. (Charmod C022)");
1521                    } else {
1522                        err(encoding
1523                                + "is not an IANA-registered encoding and did\u2019t start with \u201CX-\u201D. (Charmod C023)");
1524                    }
1525                } else if (!canonName.equalsIgnoreCase(encoding)) {
1526                    err(encoding
1527                            + " is not the preferred name of the character encoding in use. The preferred name is "
1528                            + canonName + ". (Charmod C024)");
1529                }
1530                if (EncodingInfo.isObscure(canonName)) {
1531                    warn("Character encoding " + encoding + " is not widely supported. Better interoperability may be achieved by using UTF-8.");
1532                }
1533                CharsetDecoder decoder = cs.newDecoder();
1534                decoder.onMalformedInput(CodingErrorAction.REPORT);
1535                decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
1536                return new InputStreamReader(stream, decoder);
1537            } catch (IllegalCharsetNameException e) {
1538                fatal("Illegal character encoding name: " + encoding);
1539            } catch (UnsupportedCharsetException e) {
1540                fatal("Unsupported character encoding: " + encoding);
1541            }
1542            return null; // keep the compiler happy
1543        }
1544    
1545        /**
1546         * @see org.xml.sax.XMLReader#parse(java.lang.String)
1547         */
1548        public void parse(String url) throws IOException, SAXException {
1549            // FIXME b0rked if no resolver
1550            parse(entityResolver.resolveEntity(url, null));
1551        }
1552    
1553        /**
1554         * @param string
1555         * @throws
1556         * @throws SAXException
1557         */
1558        void setEncoding(String enc) throws SAXException {
1559            if (enc == null) {
1560                if (encoding == null) {
1561                    if (stream != null) {
1562                        // XXX should the parser default to US-ASCII instead?
1563                        fatal("Character encoding information not available.");
1564                    }
1565                }
1566            } else {
1567                if (encoding == null) {
1568                    encoding = enc;
1569                    if (stream != null) {
1570                        reader = draconianInputStreamReader(encoding, stream, true);
1571                    }
1572                } else {
1573                    if (!encoding.equalsIgnoreCase(enc)) {
1574                        err("Internal character encoding information is inconsistent with external information or the BOM.");
1575                    }
1576                }
1577            }
1578        }
1579    
1580        void setNonWhiteSpaceAllowed(boolean allow) {
1581            nonWhiteSpaceAllowed = allow;
1582        }
1583    
1584        /**
1585         * Returns the doctypeMode.
1586         * 
1587         * @return the doctypeMode
1588         */
1589        public int getDoctypeMode() {
1590            return doctypeMode;
1591        }
1592    
1593        /**
1594         * Sets the doctypeMode.
1595         * 
1596         * @param doctypeMode the doctypeMode to set
1597         */
1598        public void setDoctypeMode(int doctypeMode) {
1599            this.doctypeMode = doctypeMode;
1600        }
1601    
1602        /**
1603         * Returns the doctypeHandler.
1604         * 
1605         * @return the doctypeHandler
1606         */
1607        public DoctypeHandler getDoctypeHandler() {
1608            return doctypeHandler;
1609        }
1610    
1611        /**
1612         * Sets the doctypeHandler.
1613         * 
1614         * @param doctypeHandler the doctypeHandler to set
1615         */
1616        public void setDoctypeHandler(DoctypeHandler doctypeHandler) {
1617            this.doctypeHandler = doctypeHandler;
1618        }
1619    
1620        public void refireStart() throws SAXException {
1621            ch.setDocumentLocator(this);
1622            ch.startDocument();
1623        }
1624    }