001    /*
002     * Copyright (c) 2005, 2006 Henri Sivonen
003     *
004     * Permission is hereby granted, free of charge, to any person obtaining a 
005     * copy of this software and associated documentation files (the "Software"), 
006     * to deal in the Software without restriction, including without limitation 
007     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
008     * and/or sell copies of the Software, and to permit persons to whom the 
009     * Software is furnished to do so, subject to the following conditions:
010     *
011     * The above copyright notice and this permission notice shall be included in 
012     * all copies or substantial portions of the Software.
013     *
014     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
015     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
016     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
017     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
018     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
019     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
020     * DEALINGS IN THE SOFTWARE.
021     */
022    
023    package fi.iki.hsivonen.verifierservlet;
024    
025    import java.io.BufferedReader;
026    import java.io.File;
027    import java.io.FileInputStream;
028    import java.io.IOException;
029    import java.io.InputStreamReader;
030    import java.io.OutputStream;
031    import java.net.MalformedURLException;
032    import java.util.Arrays;
033    import java.util.HashMap;
034    import java.util.HashSet;
035    import java.util.Iterator;
036    import java.util.LinkedList;
037    import java.util.List;
038    import java.util.Map;
039    import java.util.Set;
040    import java.util.SortedMap;
041    import java.util.TreeMap;
042    import java.util.regex.Pattern;
043    
044    import javax.servlet.ServletException;
045    import javax.servlet.http.HttpServletRequest;
046    import javax.servlet.http.HttpServletResponse;
047    
048    import net.java.dev.xmlidfilter.XMLIdFilter;
049    
050    import org.apache.log4j.Logger;
051    import org.xml.sax.ContentHandler;
052    import org.xml.sax.EntityResolver;
053    import org.xml.sax.ErrorHandler;
054    import org.xml.sax.Locator;
055    import org.xml.sax.SAXException;
056    import org.xml.sax.SAXNotRecognizedException;
057    import org.xml.sax.SAXNotSupportedException;
058    import org.xml.sax.SAXParseException;
059    import org.xml.sax.XMLReader;
060    
061    import com.hp.hpl.jena.iri.IRI;
062    import com.hp.hpl.jena.iri.IRIException;
063    import com.hp.hpl.jena.iri.IRIFactory;
064    import com.ibm.icu.text.Normalizer;
065    import com.thaiopensource.relaxng.impl.CombineValidator;
066    import com.thaiopensource.util.PropertyMap;
067    import com.thaiopensource.util.PropertyMapBuilder;
068    import com.thaiopensource.validate.IncorrectSchemaException;
069    import com.thaiopensource.validate.Schema;
070    import com.thaiopensource.validate.SchemaReader;
071    import com.thaiopensource.validate.ValidateProperty;
072    import com.thaiopensource.validate.Validator;
073    import com.thaiopensource.validate.auto.AutoSchemaReader;
074    import com.thaiopensource.validate.rng.CompactSchemaReader;
075    import com.thaiopensource.validate.rng.RngProperty;
076    
077    import fi.iki.hsivonen.gnu.xml.aelfred2.SAXDriver;
078    import fi.iki.hsivonen.htmlparser.DoctypeHandler;
079    import fi.iki.hsivonen.htmlparser.HtmlParser;
080    import fi.iki.hsivonen.xml.AttributesImpl;
081    import fi.iki.hsivonen.xml.HtmlSerializer;
082    import fi.iki.hsivonen.xml.LocalCacheEntityResolver;
083    import fi.iki.hsivonen.xml.NullEntityResolver;
084    import fi.iki.hsivonen.xml.PrudentHttpEntityResolver;
085    import fi.iki.hsivonen.xml.SystemErrErrorHandler;
086    import fi.iki.hsivonen.xml.TypedInputSource;
087    import fi.iki.hsivonen.xml.XhtmlIdFilter;
088    import fi.iki.hsivonen.xml.XhtmlSaxEmitter;
089    import fi.iki.hsivonen.xml.checker.DebugChecker;
090    import fi.iki.hsivonen.xml.checker.NormalizationChecker;
091    import fi.iki.hsivonen.xml.checker.SignificantInlineChecker;
092    import fi.iki.hsivonen.xml.checker.TextContentChecker;
093    import fi.iki.hsivonen.xml.checker.jing.CheckerValidator;
094    import fi.iki.hsivonen.xml.checker.table.TableChecker;
095    import fi.karppinen.xml.CharacterUtil;
096    
097    /**
098     * @version $Id: VerifierServletTransaction.java,v 1.10 2005/07/24 07:32:48
099     *          hsivonen Exp $
100     * @author hsivonen
101     */
102    class VerifierServletTransaction implements DoctypeHandler {
    /** Logger for this class. */
    private static final Logger log4j = Logger.getLogger(VerifierServletTransaction.class);

    /** Matches a run of whitespace; used to split space-separated schema URL lists. */
    private static final Pattern SPACE = Pattern.compile("\\s+");

    /** Parser mode chosen by <code>parser=xml</code>: XML parser with external entities disabled. */
    private static final int NO_EXTERNAL_ENTITIES = 4;

    /** Parser mode chosen by <code>parser=xmldtd</code>: XML parser with external entities enabled. */
    private static final int EXTERNAL_ENTITIES_NO_VALIDATION = 5;

    // The HTML parser modes reuse the DoctypeHandler constants so that the
    // value of "parser" can be handed to HtmlParser.setDoctypeMode() and to
    // validatorByDoctype() as is.

    /** Parser mode chosen by <code>parser=html</code>. */
    private static final int HTML_PARSER = DoctypeHandler.ANY_DOCTYPE;

    /** Parser mode chosen by <code>parser=html5</code>. */
    private static final int HTML_PARSER_5 = DoctypeHandler.DOCTYPE_HTML5;

    /** Parser mode chosen by <code>parser=html4</code>. */
    private static final int HTML_PARSER_4_STRICT = DoctypeHandler.DOCTYPE_HTML401_STRICT;

    /** Parser mode chosen by <code>parser=html4tr</code>. */
    private static final int HTML_PARSER_4_TRANSITIONAL = DoctypeHandler.DOCTYPE_HTML401_TRANSITIONAL;

    /** Default parser mode: the parser is picked from the document's Content-Type. */
    private static final int AUTOMATIC_PARSER = 6;

    // NOTE(review): not referenced in the visible part of this file;
    // presumably used by subclasses -- confirm.
    protected static final int XHTML5_SCHEMA = 7;

    // Fixed page texts, kept as char[] so they can be passed straight to
    // character-emitting SAX calls. Only SUCCESS and FAILURE are used in the
    // visible part of this file.
    private static final char[] SERVICE_TITLE = "Validation Service for RELAX NG ".toCharArray();

    private static final char[] TWO_POINT_OH_BETA = "2.0 Beta".toCharArray();

    private static final char[] RESULTS_TITLE = "Validation results for ".toCharArray();

    private static final char[] SUCCESS = "The document validates according to the specified schema(s).".toCharArray();

    private static final char[] FAILURE = "There were errors.".toCharArray();

    /**
     * Maps the first column of the cache config file to the cache path prefix
     * plus the second column. Filled by the static initializer and handed to
     * every LocalCacheEntityResolver.
     */
    private static final Map pathMap = new HashMap();

    // The four preset arrays are parallel: index i of each comes from the
    // same line of the preset config file (columns: doctype, namespace,
    // label, schema URL list).

    /** Doctype number of each preset (first config column, parsed as int). */
    private static int[] presetDoctypes;

    /** Label of each preset (third config column). */
    private static String[] presetLabels;

    /** Whitespace-separated schema URL list of each preset (fourth config column). */
    private static String[] presetUrls;

    /** Interned namespace of each preset (second config column); null where the config has "-". */
    private static String[] presetNamespaces;

    // NOTE(review): the next two arrays look like parallel tables mapping a
    // content type to a root namespace; they are not used in the visible part
    // of this file -- confirm against the remainder.
    private static final String[] KNOWN_CONTENT_TYPES = {
            "application/atom+xml", "application/docbook+xml",
            "application/xhtml+xml", "application/xv+xml" };

    private static final String[] NAMESPACES_FOR_KNOWN_CONTENT_TYPES = {
            "http://www.w3.org/2005/Atom", "http://docbook.org/ns/docbook",
            "http://www.w3.org/1999/xhtml", "http://www.w3.org/1999/xhtml" };

    /** Checker URLs that the pseudo-URL http://hsivonen.iki.fi/checkers/all/ expands to. */
    private static final String[] ALL_CHECKERS = {
        "http://hsivonen.iki.fi/checkers/table/",
        "http://hsivonen.iki.fi/checkers/nfc/",
        "http://hsivonen.iki.fi/checkers/significant-inline/",
        "http://hsivonen.iki.fi/checkers/text-content/"};

    /** Checker URLs that the pseudo-URL http://hsivonen.iki.fi/checkers/all-html4/ expands to. */
    private static final String[] ALL_CHECKERS_HTML4 = {
        "http://hsivonen.iki.fi/checkers/table/",
        "http://hsivonen.iki.fi/checkers/nfc/" };

    /** Creation time of this transaction in milliseconds. */
    private long start = System.currentTimeMillis();

    private HttpServletRequest request;

    private HttpServletResponse response;

    private IRIFactory iriFactory;

    /** Scrubbed value of the "doc" request parameter; null when absent or empty. */
    protected String document;

    /** One of the parser mode constants above; set from the "parser" request parameter. */
    private int parser = AUTOMATIC_PARSER;

    /** True when the "laxtype" request parameter is present; passed to the HTTP entity resolver. */
    private boolean laxType = false;

    /** Serializer that the result page is written through. */
    protected ContentHandler contentHandler;

    /** Convenience emitter wrapped around {@link #contentHandler}. */
    protected XhtmlSaxEmitter emitter;

    /** Error handler that reports into the result page; created in validate(). */
    protected XhtmlEmittingErrorHandler errorHandler;

    /** Scratch attribute list reused when emitting elements. */
    private AttributesImpl attrs = new AttributesImpl();

    /** The servlet response output stream. */
    private OutputStream out;

    /** Jing properties (error handler, entity resolver, reader creator) built in validate(). */
    private PropertyMap jingPropertyMap;

    /** Resolver that consults {@link #pathMap} and is constructed around {@link #httpRes}. */
    protected LocalCacheEntityResolver entityResolver;

    /** lastModified() of the preset config file; sent as Last-Modified when not validating. */
    private static long lastModified;

    // Parallel arrays filled by the static initializer: the schema URLs found
    // in the presets (interned, in sorted order) and their compiled schemas.
    private static String[] preloadedSchemaUrls;

    private static Schema[] preloadedSchemas;

    /** Whitespace-separated schema URLs requested via "preset" or "schema"; "" when none. */
    private String schemaUrls = null;

    /** Validator assembled from the schema URLs; may stay null when no schema applies. */
    protected Validator validator = null;

    /** Content handler installed by setupXmlParser() when no validator is known yet. */
    private BufferingRootNamespaceSniffer bufferingRootNamespaceSniffer = null;

    /** Type reported by the resolved document input. */
    private String contentType = null;

    /** The HTML parser, when one is in use. */
    protected HtmlParser htmlParser = null;

    /** The reader that parses the document (HTML parser or XML filter chain). */
    protected XMLReader reader;

    /** The resolved input for {@link #document}. */
    protected TypedInputSource documentInput;

    /** HTTP resolver whose accepted content types are configured per parser mode. */
    protected PrudentHttpEntityResolver httpRes;

    /** URLs for which validatorByUrl() has already been asked, so each is loaded once. */
    private Set loadedValidatorUrls = new HashSet();
    
    /** When true, validate() turns on the reader's NFC checking feature. */
    private boolean checkNormalization = false;

    // NOTE(review): not touched in the visible part of this file; presumably
    // maintained by the root namespace sniffers -- confirm.
    private boolean rootNamespaceSeen = false;
216    
217        static {
218            try {
219                log4j.debug("Starting static initializer.");
220    
221                String presetPath = System.getProperty("fi.iki.hsivonen.verifierservlet.presetconfpath");
222                File presetFile = new File(presetPath);
223                lastModified = presetFile.lastModified();
224                BufferedReader r = new BufferedReader(new InputStreamReader(
225                        new FileInputStream(presetFile), "UTF-8"));
226                String line;
227                List doctypes = new LinkedList();
228                List namespaces = new LinkedList();
229                List labels = new LinkedList();
230                List urls = new LinkedList();
231    
232                log4j.debug("Starting to loop over config file lines.");
233    
234                while ((line = r.readLine()) != null) {
235                    if ("".equals(line.trim())) {
236                        break;
237                    }
238                    String s[] = line.split("\t");
239                    doctypes.add(s[0]);
240                    namespaces.add(s[1]);
241                    labels.add(s[2]);
242                    urls.add(s[3]);
243                }
244    
245                log4j.debug("Finished reading config.");
246    
247                String[] presetDoctypesAsStrings = (String[]) doctypes.toArray(new String[0]);
248                presetNamespaces = (String[]) namespaces.toArray(new String[0]);
249                presetLabels = (String[]) labels.toArray(new String[0]);
250                presetUrls = (String[]) urls.toArray(new String[0]);
251    
252                log4j.debug("Converted config to arrays.");
253    
254                for (int i = 0; i < presetNamespaces.length; i++) {
255                    String str = presetNamespaces[i];
256                    if ("-".equals(str)) {
257                        presetNamespaces[i] = null;
258                    } else {
259                        presetNamespaces[i] = presetNamespaces[i].intern();
260                    }
261                }
262    
263                log4j.debug("Prepared namespace array.");
264    
265                presetDoctypes = new int[presetDoctypesAsStrings.length];
266                for (int i = 0; i < presetDoctypesAsStrings.length; i++) {
267                    presetDoctypes[i] = Integer.parseInt(presetDoctypesAsStrings[i]);
268                }
269    
270                log4j.debug("Parsed doctype numbers into ints.");
271    
272                String prefix = System.getProperty("fi.iki.hsivonen.verifierservlet.cachepathprefix");
273    
274                log4j.debug("The cache path prefix is: " + prefix);
275    
276                String cacheConfPath = System.getProperty("fi.iki.hsivonen.verifierservlet.cacheconfpath");
277    
278                log4j.debug("The cache config path is: " + cacheConfPath);
279    
280                r = new BufferedReader(new InputStreamReader(new FileInputStream(
281                        cacheConfPath), "UTF-8"));
282                while ((line = r.readLine()) != null) {
283                    if ("".equals(line.trim())) {
284                        break;
285                    }
286                    String s[] = line.split("\t");
287                    pathMap.put(s[0], prefix + s[1]);
288                }
289    
290                log4j.debug("Cache config read.");
291    
292                ErrorHandler eh = new SystemErrErrorHandler();
293                LocalCacheEntityResolver er = new LocalCacheEntityResolver(pathMap,
294                        new NullEntityResolver());
295                er.setAllowRnc(true);
296                PropertyMapBuilder pmb = new PropertyMapBuilder();
297                pmb.put(ValidateProperty.ERROR_HANDLER, eh);
298                pmb.put(ValidateProperty.ENTITY_RESOLVER, er);
299                pmb.put(ValidateProperty.XML_READER_CREATOR,
300                        new VerifierServletXMLReaderCreator(eh, er));
301                RngProperty.CHECK_ID_IDREF.add(pmb);
302                PropertyMap pMap = pmb.toPropertyMap();
303    
304                log4j.debug("Parsing set up. Starting to read schemas.");
305    
306                SortedMap schemaMap = new TreeMap();
307                for (int i = 0; i < presetUrls.length; i++) {
308                    String[] urls1 = SPACE.split(presetUrls[i]);
309                    for (int j = 0; j < urls1.length; j++) {
310                        String url = urls1[j];
311                        if (schemaMap.get(url) == null && !isCheckerUrl(url)) {
312                            Schema sch = schemaByUrl(url, er, pMap);
313                            schemaMap.put(url, sch);
314                        }
315                    }
316                }
317    
318                log4j.debug("Schemas read.");
319    
320                preloadedSchemaUrls = new String[schemaMap.size()];
321                preloadedSchemas = new Schema[schemaMap.size()];
322                int i = 0;
323                for (Iterator iter = schemaMap.entrySet().iterator(); iter.hasNext();) {
324                    Map.Entry entry = (Map.Entry) iter.next();
325                    preloadedSchemaUrls[i] = entry.getKey().toString().intern();
326                    preloadedSchemas[i] = (Schema) entry.getValue();
327                    i++;
328                }
329    
330                log4j.debug("Initialization complete.");
331            } catch (Exception e) {
332                throw new RuntimeException(e);
333            }
334        }
335    
336        protected static String scrub(String s) {
337            return Normalizer.normalize(
338                    CharacterUtil.prudentlyScrubCharacterData(s), Normalizer.NFC);
339        }
340    
341        private static boolean isCheckerUrl(String url) {
342            if ("http://hsivonen.iki.fi/checkers/all/".equals(url)) {
343                return true;
344            } else if ("http://hsivonen.iki.fi/checkers/all-html4/".equals(url)) {
345                return true;
346            }
347            for (int i = 0; i < ALL_CHECKERS.length; i++) {
348                if (ALL_CHECKERS[i].equals(url)) {
349                    return true;
350                }
351            }
352            return false;
353        }
354    
355        /**
356         * @param request
357         * @param response
358         */
359        VerifierServletTransaction(HttpServletRequest request,
360                HttpServletResponse response) {
361            this.request = request;
362            this.response = response;
363            this.iriFactory = IRIFactory.iriImplementation();
364        }
365    
366        protected boolean willValidate() {
367            return document != null;
368        }
369    
370        void doGet() throws ServletException, IOException {
371            response.setContentType("text/html; charset=utf-8");
372    
373            this.out = response.getOutputStream();
374    
375            request.setCharacterEncoding("utf-8");
376    
377            if (willValidate()) {
378                response.setDateHeader("Expires", 0);
379                response.setHeader("Cache-Control", "no-cache");
380            } else {
381                response.setDateHeader("Last-Modified", lastModified);
382            }
383    
384            contentHandler = new HtmlSerializer(out, HtmlSerializer.DOCTYPE_HTML5,
385                    false, "UTF-8");
386            emitter = new XhtmlSaxEmitter(contentHandler);
387    
388            document = scrubUrl(request.getParameter("doc"));
389    
390            document = ("".equals(document)) ? null : document;
391    
392            setup();
393    
394            try {
395                PageEmitter.emit(contentHandler, this);
396            } catch (SAXException e) {
397                throw new ServletException(e);
398            }
399        }
400    
401        /**
402         * @throws ServletException
403         */
404        protected void setup() throws ServletException {
405            String preset = request.getParameter("preset");
406    
407            if (preset != null && !"".equals(preset)) {
408                schemaUrls = preset;
409            } else {
410                schemaUrls = request.getParameter("schema");
411            }
412            if (schemaUrls == null) {
413                schemaUrls = "";
414            }
415    
416            String parserStr = request.getParameter("parser");
417    
418            if ("html".equals(parserStr)) {
419                parser = HTML_PARSER;
420            } else if ("xmldtd".equals(parserStr)) {
421                parser = EXTERNAL_ENTITIES_NO_VALIDATION;
422            } else if ("xml".equals(parserStr)) {
423                parser = NO_EXTERNAL_ENTITIES;
424            } else if ("html5".equals(parserStr)) {
425                parser = HTML_PARSER_5;
426            } else if ("html4".equals(parserStr)) {
427                parser = HTML_PARSER_4_STRICT;
428            } else if ("html4tr".equals(parserStr)) {
429                parser = HTML_PARSER_4_TRANSITIONAL;
430            } // else auto
431    
432            laxType = (request.getParameter("laxtype") != null);
433        }
434    
435        private boolean isHtmlUnsafePreset() {
436            if ("".equals(schemaUrls)) {
437                return false;
438            }
439            boolean preset = false;
440            for (int i = 0; i < presetUrls.length; i++) {
441                if (presetUrls[i].equals(schemaUrls)) {
442                    preset = true;
443                    break;
444                }
445            }
446            if (!preset) {
447                return false;
448            }
449            return !(schemaUrls.startsWith("http://hsivonen.iki.fi/xhtml-schema/xhtml-basic.rng")
450                    || schemaUrls.startsWith("http://hsivonen.iki.fi/xhtml-schema/xhtml-strict.rng")
451                    || schemaUrls.startsWith("http://hsivonen.iki.fi/xhtml-schema/xhtml-strict-wcag.rng")
452                    || schemaUrls.startsWith("http://hsivonen.iki.fi/xhtml-schema/xhtml-transitional.rng")
453                    || schemaUrls.startsWith("http://hsivonen.iki.fi/xhtml-schema/xhtml-transitional-wcag.rng") || schemaUrls.startsWith("http://syntax.whattf.org/relaxng/xhtml5full-html.rnc"));
454    
455        }
456    
457        /**
458         * @throws SAXException
459         */
460        void validate() throws SAXException {
461            if (!willValidate()) {
462                return;
463            }
464            try {
465                out.flush();
466            } catch (IOException e1) {
467                throw new SAXException(e1);
468            }
469            errorHandler = new XhtmlEmittingErrorHandler(contentHandler);
470            httpRes = new PrudentHttpEntityResolver(600 * 1024, laxType,
471                    errorHandler);
472            entityResolver = new LocalCacheEntityResolver(pathMap, httpRes);
473            httpRes.setAllowRnc(true);
474            entityResolver.setAllowRnc(true);
475            boolean isValid = false;
476            boolean stats = true;
477            try {
478                this.errorHandler.start();
479                PropertyMapBuilder pmb = new PropertyMapBuilder();
480                pmb.put(ValidateProperty.ERROR_HANDLER, errorHandler);
481                pmb.put(ValidateProperty.ENTITY_RESOLVER, entityResolver);
482                pmb.put(ValidateProperty.XML_READER_CREATOR,
483                        new VerifierServletXMLReaderCreator(errorHandler,
484                                entityResolver));
485                RngProperty.CHECK_ID_IDREF.add(pmb);
486                jingPropertyMap = pmb.toPropertyMap();
487    
488                tryToSetupValidator();
489    
490                httpRes.setAllowRnc(false);
491                entityResolver.setAllowRnc(false);
492    
493                loadDocAndSetupParser();
494    
495                reader.setErrorHandler(errorHandler);
496                contentType = documentInput.getType();
497                if (validator == null) {
498                    checkNormalization = true;
499                }
500                if (checkNormalization) {
501                    reader.setFeature("http://hsivonen.iki.fi/checkers/nfc/", true);
502                }
503                reader.parse(documentInput);
504                isValid = !errorHandler.isErrors();
505            } catch (SAXException e) {
506                log4j.debug("SAXException", e);
507            } catch (IOException e) {
508                stats = false;
509                log4j.info("IOException", e);
510                errorHandler.ioError(e);
511            } catch (IncorrectSchemaException e) {
512                log4j.debug("IncorrectSchemaException", e);
513                errorHandler.schemaError(e);
514            } catch (RuntimeException e) {
515                stats = false;
516                log4j.error("RuntimeException, doc: " + document + " schema: "
517                        + schemaUrls + " lax: " + laxType, e);
518                errorHandler.internalError(e, "Oops. That was not supposed to happen. A bug manifested itself in the application internals. Unable to continue. Sorry. The admin was notified.");
519            } catch (Error e) {
520                stats = false;
521                log4j.error("Error, doc: " + document + " schema: " + schemaUrls
522                        + " lax: " + laxType, e);
523                errorHandler.internalError(e, "Oops. That was not supposed to happen. A bug manifested itself in the application internals. Unable to continue. Sorry. The admin was notified.");
524            } finally {
525                errorHandler.end();
526            }
527            if (isValid) {
528                attrs.clear();
529                attrs.addAttribute("class", "success");
530                emitter.startElement("p", attrs);
531                emitSuccess();
532                emitter.endElement("p");
533            } else {
534                attrs.clear();
535                attrs.addAttribute("class", "failure");
536                emitter.startElement("p", attrs);
537                emitFailure();
538                emitter.endElement("p");
539            }
540            if (stats) {
541                StatsEmitter.emit(contentHandler, this);
542            }
543        }
544    
545        /**
546         * @throws SAXException
547         */
548        protected void emitSuccess() throws SAXException {
549            emitter.characters(SUCCESS);
550        }
551    
552        protected void emitFailure() throws SAXException {
553            emitter.characters(FAILURE);
554        }
555    
556        /**
557         * @throws SAXException
558         * @throws IOException
559         * @throws IncorrectSchemaException
560         */
561        protected void tryToSetupValidator() throws SAXException, IOException,
562                IncorrectSchemaException {
563            validator = validatorByUrls(schemaUrls);
564        }
565    
566        /**
567         * @throws SAXException
568         * @throws IOException
569         * @throws IncorrectSchemaException
570         * @throws SAXNotRecognizedException
571         * @throws SAXNotSupportedException
572         */
573        protected void loadDocAndSetupParser() throws SAXException, IOException,
574                IncorrectSchemaException, SAXNotRecognizedException,
575                SAXNotSupportedException {
576            switch (parser) {
577                case HTML_PARSER:
578                case HTML_PARSER_5:
579                case HTML_PARSER_4_STRICT:
580                case HTML_PARSER_4_TRANSITIONAL:
581                    if (isHtmlUnsafePreset()) {
582                        String message = "The chosen preset schema is not appropriate for HTML.";
583                        SAXException se = new SAXException(message);
584                        errorHandler.schemaError(se);
585                        throw se;
586                    }
587                    httpRes.setAllowGenericXml(false);
588                    httpRes.setAllowHtml(true);
589                    httpRes.setAcceptAllKnownXmlTypes(false);
590                    httpRes.setAllowXhtml(false);
591                    documentInput = (TypedInputSource) entityResolver.resolveEntity(
592                            null, document);
593                    htmlParser = new HtmlParser();
594                    htmlParser.setDoctypeMode(parser); // magic numbers!
595                    htmlParser.setDoctypeHandler(this);
596                    reader = htmlParser;
597                    if (validator == null) {
598                        validator = validatorByDoctype(parser); // magic
599                        // numbers!
600                        // can still be null
601                    }
602                    if (validator != null) {
603                        reader.setContentHandler(validator.getContentHandler());
604                    }
605                    break;
606                case NO_EXTERNAL_ENTITIES:
607                case EXTERNAL_ENTITIES_NO_VALIDATION:
608                    httpRes.setAllowGenericXml(true);
609                    httpRes.setAllowHtml(false);
610                    httpRes.setAcceptAllKnownXmlTypes(true);
611                    httpRes.setAllowXhtml(true);
612                    documentInput = (TypedInputSource) entityResolver.resolveEntity(
613                            null, document);
614                    reader = setupXmlParser();
615                    break;
616                default:
617                    httpRes.setAllowGenericXml(true);
618                    httpRes.setAllowHtml(true);
619                    httpRes.setAcceptAllKnownXmlTypes(true);
620                    httpRes.setAllowXhtml(true);
621                    documentInput = (TypedInputSource) entityResolver.resolveEntity(
622                            null, document);
623                    if ("text/html".equals(documentInput.getType())) {
624                        if (isHtmlUnsafePreset()) {
625                            String message = "The Content-Type was \u201Ctext/html\u201D, but the chosen preset schema is not appropriate for HTML.";
626                            SAXException se = new SAXException(message);
627                            errorHandler.schemaError(se);
628                            throw se;
629                        }
630                        errorHandler.info("The Content-Type was \u201Ctext/html\u201D. Using the HTML parser.");
631                        htmlParser = new HtmlParser();
632                        htmlParser.setDoctypeMode(DoctypeHandler.ANY_DOCTYPE);
633                        htmlParser.setDoctypeHandler(this);
634                        reader = htmlParser;
635                        if (validator != null) {
636                            reader.setContentHandler(validator.getContentHandler());
637                        }
638                    } else {
639                        errorHandler.info("The Content-Type was \u201C"
640                                + documentInput.getType()
641                                + "\u201D. Using the XML parser (not resolving external entities).");
642                        reader = setupXmlParser();
643                    }
644                    break;
645            }
646        }
647    
648        protected Validator validatorByDoctype(int doctype) throws SAXException,
649                IOException, IncorrectSchemaException {
650            if (doctype == ANY_DOCTYPE) {
651                return null;
652            }
653            for (int i = 0; i < presetDoctypes.length; i++) {
654                if (presetDoctypes[i] == doctype) {
655                    return validatorByUrls(presetUrls[i]);
656                }
657            }
658            throw new RuntimeException("Doctype mappings not initialized properly.");
659        }
660    
661        /**
662         * @param entityResolver2
663         * @return
664         * @throws SAXNotRecognizedException
665         * @throws SAXNotSupportedException
666         */
667        protected XMLReader setupXmlParser() throws SAXNotRecognizedException,
668                SAXNotSupportedException {
669            XMLReader reader;
670            reader = new SAXDriver();
671            reader = new XhtmlIdFilter(new XMLIdFilter(reader));
672            reader.setFeature(
673                    "http://xml.org/sax/features/external-general-entities",
674                    parser == EXTERNAL_ENTITIES_NO_VALIDATION);
675            reader.setFeature(
676                    "http://xml.org/sax/features/external-parameter-entities",
677                    parser == EXTERNAL_ENTITIES_NO_VALIDATION);
678            if (parser == EXTERNAL_ENTITIES_NO_VALIDATION) {
679                reader.setEntityResolver(entityResolver);
680            } else {
681                reader.setEntityResolver(new NullEntityResolver());
682            }
683            if (validator == null) {
684                bufferingRootNamespaceSniffer = new BufferingRootNamespaceSniffer(
685                        this);
686                reader.setContentHandler(bufferingRootNamespaceSniffer);
687            } else {
688                reader.setContentHandler(new RootNamespaceSniffer(this,
689                        validator.getContentHandler()));
690                reader.setDTDHandler(validator.getDTDHandler());
691            }
692            return reader;
693        }
694    
695        /**
696         * @param validator
697         * @return
698         * @throws SAXException
699         * @throws IOException
700         * @throws IncorrectSchemaException
701         */
702        private Validator validatorByUrls(String schemaList) throws SAXException,
703                IOException, IncorrectSchemaException {
704            Validator validator = null;
705            String[] schemas = SPACE.split(schemaList);
706            for (int i = schemas.length - 1; i > -1; i--) {
707                String url = schemas[i];
708                if ("http://hsivonen.iki.fi/checkers/all/".equals(url)) {
709                    for (int j = 0; j < ALL_CHECKERS.length; j++) {
710                        validator = combineValidatorByUrl(validator,
711                                ALL_CHECKERS[j]);
712                    }
713                } else if ("http://hsivonen.iki.fi/checkers/all-html4/".equals(url)) {
714                        for (int j = 0; j < ALL_CHECKERS_HTML4.length; j++) {
715                            validator = combineValidatorByUrl(validator,
716                                    ALL_CHECKERS_HTML4[j]);
717                        }
718                } else {
719                    validator = combineValidatorByUrl(validator, url);
720                }
721            }
722            return validator;
723        }
724    
725        /**
726         * @param validator
727         * @param url
728         * @return
729         * @throws SAXException
730         * @throws IOException
731         * @throws IncorrectSchemaException
732         */
733        private Validator combineValidatorByUrl(Validator validator, String url)
734                throws SAXException, IOException, IncorrectSchemaException {
735            if (!"".equals(url)) {
736                Validator v = validatorByUrl(url);
737                if (validator == null) {
738                    validator = v;
739                } else {
740                    validator = new CombineValidator(v, validator);
741                }
742            }
743            return validator;
744        }
745    
746        /**
747         * @param url
748         * @return
749         * @throws SAXException
750         * @throws IOException
751         * @throws IncorrectSchemaException
752         */
753        private Validator validatorByUrl(String url) throws SAXException,
754                IOException, IncorrectSchemaException {
755            if (loadedValidatorUrls.contains(url)) {
756                return null;
757            }
758            loadedValidatorUrls.add(url);
759            if ("http://hsivonen.iki.fi/checkers/table/".equals(url)) {
760                return new CheckerValidator(new TableChecker(), jingPropertyMap);
761            } else if ("http://hsivonen.iki.fi/checkers/nfc/".equals(url)) {
762                this.checkNormalization = true;
763                return new CheckerValidator(new NormalizationChecker(),
764                        jingPropertyMap);
765            } else if ("http://hsivonen.iki.fi/checkers/significant-inline/".equals(url)) {
766                return new CheckerValidator(new SignificantInlineChecker(),
767                        jingPropertyMap);
768            } else if ("http://hsivonen.iki.fi/checkers/debug/".equals(url)) {
769                return new CheckerValidator(new DebugChecker(),
770                        jingPropertyMap);
771            } else if ("http://hsivonen.iki.fi/checkers/text-content/".equals(url)) {
772                return new CheckerValidator(new TextContentChecker(),
773                        jingPropertyMap);
774            }
775            Schema sch = schemaByUrl(url);
776            Validator validator = sch.createValidator(jingPropertyMap);
777            return validator;
778        }
779    
780        /**
781         * @param url
782         * @return
783         * @throws SAXException
784         * @throws IOException
785         * @throws IncorrectSchemaException
786         */
787        private Schema schemaByUrl(String url) throws SAXException, IOException,
788                IncorrectSchemaException {
789            int i = Arrays.binarySearch(preloadedSchemaUrls, url);
790            if (i > -1) {
791                return preloadedSchemas[i];
792            }
793    
794            TypedInputSource schemaInput = (TypedInputSource) entityResolver.resolveEntity(
795                    null, url);
796            SchemaReader sr = null;
797            if ("application/relax-ng-compact-syntax".equals(schemaInput.getType())) {
798                sr = CompactSchemaReader.getInstance();
799            } else {
800                sr = new AutoSchemaReader();
801            }
802            Schema sch = sr.createSchema(schemaInput, jingPropertyMap);
803            return sch;
804        }
805    
806        /**
807         * @param url
808         * @return
809         * @throws SAXException
810         * @throws IOException
811         * @throws IncorrectSchemaException
812         */
813        private static Schema schemaByUrl(String url, EntityResolver resolver,
814                PropertyMap pMap) throws SAXException, IOException,
815                IncorrectSchemaException {
816            log4j.debug("Will load schema: " + url);
817            TypedInputSource schemaInput = (TypedInputSource) resolver.resolveEntity(
818                    null, url);
819            SchemaReader sr = null;
820            if ("application/relax-ng-compact-syntax".equals(schemaInput.getType())) {
821                sr = CompactSchemaReader.getInstance();
822            } else {
823                sr = new AutoSchemaReader();
824            }
825            Schema sch = sr.createSchema(schemaInput, pMap);
826            return sch;
827        }
828    
829        /**
830         * @throws SAXException
831         */
832        void emitTitle(boolean markupAllowed) throws SAXException {
833            if (willValidate()) {
834                emitter.characters(RESULTS_TITLE);
835                emitter.characters(scrub(document));
836            } else {
837                emitter.characters(SERVICE_TITLE);
838                if (markupAllowed) {
839                    emitter.startElement("span");
840                    emitter.characters(TWO_POINT_OH_BETA);
841                    emitter.endElement("span");
842                }
843            }
844        }
845    
846        void emitForm() throws SAXException {
847            attrs.clear();
848            attrs.addAttribute("method", "get");
849            attrs.addAttribute("action", request.getRequestURL().toString());
850            attrs.addAttribute("onsubmit", "formSubmission()");
851            emitter.startElement("form", attrs);
852            emitFormContent();
853            emitter.endElement("form");
854        }
855    
856        /**
857         * @throws SAXException
858         */
859        protected void emitFormContent() throws SAXException {
860            FormEmitter.emit(contentHandler, this);
861        }
862    
863        void emitSchemaField() throws SAXException {
864            attrs.clear();
865            attrs.addAttribute("name", "schema");
866            attrs.addAttribute("id", "schema");
867            attrs.addAttribute("onchange", "schemaChanged();");
868            attrs.addAttribute("pattern", "(?:https?://.+(?:\\s+https?://.+)*)?");
869            attrs.addAttribute(
870                    "title",
871                    "The schema field takes zero or more space-separated absolute IRIs (http or https only) of the schemas that the document is to be validated against. (When left blank, the service will attempt to pick schemas automatically.)");
872            if (schemaUrls != null) {
873                attrs.addAttribute("value", scrub(schemaUrls));
874            }
875            emitter.startElement("input", attrs);
876            emitter.endElement("input");
877        }
878    
879        void emitDocField() throws SAXException {
880            attrs.clear();
881            attrs.addAttribute("type", "url");
882            attrs.addAttribute("name", "doc");
883            attrs.addAttribute("id", "doc");
884            attrs.addAttribute("pattern", "(?:https?://.+)?");
885            attrs.addAttribute(
886                    "title",
887                    "The document field takes the absolute IRI (http or https only) of the document to be checked. (The document field can also be left blank in order to bookmark settings.)");
888            if (document != null) {
889                attrs.addAttribute("value", scrub(document));
890            }
891            emitter.startElement("input", attrs);
892            emitter.endElement("input");
893        }
894    
895        private String scrubUrl(String urlStr) {
896            if (urlStr == null) {
897                return null;
898            }
899    
900            try {
901                IRI iri = iriFactory.construct(urlStr);
902                return iri.toASCIIString();
903            } catch (IRIException e) {
904                return null;
905            } catch (MalformedURLException e) {
906                return null;
907            }
908        }
909    
910        /**
911         * @throws SAXException
912         * 
913         */
914        void emitSchemaDuration() throws SAXException {
915        }
916    
917        /**
918         * @throws SAXException
919         * 
920         */
921        void emitDocDuration() throws SAXException {
922        }
923    
924        /**
925         * @throws SAXException
926         * 
927         */
928        void emitTotalDuration() throws SAXException {
929            emitter.characters("" + (System.currentTimeMillis() - start));
930        }
931    
932        /**
933         * @throws SAXException
934         * 
935         */
936        void emitPresetOptions() throws SAXException {
937            for (int i = 0; i < presetUrls.length; i++) {
938                emitter.option(presetLabels[i], presetUrls[i], false);
939            }
940        }
941    
942        /**
943         * @throws SAXException
944         * 
945         */
946        void emitParserOptions() throws SAXException {
947            emitter.option("Automatically from Content-Type", "",
948                    (parser == AUTOMATIC_PARSER));
949            emitter.option("XML; don\u2019t load external entities", "xml",
950                    (parser == NO_EXTERNAL_ENTITIES));
951            emitter.option("XML; load external entities", "xmldtd",
952                    (parser == EXTERNAL_ENTITIES_NO_VALIDATION));
953            emitter.option("HTML; flavor from doctype", "html",
954                    (parser == HTML_PARSER));
955            emitter.option("HTML5", "html5", (parser == HTML_PARSER_5));
956            emitter.option("HTML 4.01 Strict", "html4",
957                    (parser == HTML_PARSER_4_STRICT));
958            emitter.option("HTML 4.01 Transitional", "html4tr",
959                    (parser == HTML_PARSER_4_TRANSITIONAL));
960        }
961    
962        /**
963         * @throws SAXException
964         * 
965         */
966        void emitLaxTypeField() throws SAXException {
967            emitter.checkbox("laxtype", "yes", laxType);
968        }
969    
970        void rootNamespace(String namespace, Locator locator) throws SAXException {
971            if (validator == null) {
972                int index = -1;
973                for (int i = 0; i < presetNamespaces.length; i++) {
974                    if (namespace.equals(presetNamespaces[i])) {
975                        index = i;
976                        break;
977                    }
978                }
979                if (index == -1) {
980                    String message = "Cannot find preset schema for namespace: \u201C"
981                            + namespace + "\u201D.";
982                    SAXException se = new SAXException(message);
983                    errorHandler.schemaError(se);
984                    throw se;
985                }
986                String label = presetLabels[index];
987                String urls = presetUrls[index];
988                errorHandler.info("Using the preset for " + label
989                        + " based on the root namespace.");
990                try {
991                    validator = validatorByUrls(urls);
992                } catch (IOException ioe) {
993                    // At this point the schema comes from memory.
994                    throw new RuntimeException(ioe);
995                } catch (IncorrectSchemaException e) {
996                    // At this point the schema comes from memory.
997                    throw new RuntimeException(e);
998                }
999                if (bufferingRootNamespaceSniffer == null) {
1000                    throw new RuntimeException(
1001                            "Bug! bufferingRootNamespaceSniffer was null.");
1002                }
1003                bufferingRootNamespaceSniffer.setContentHandler(validator.getContentHandler());
1004            }
1005    
1006            if (!rootNamespaceSeen) {
1007                rootNamespaceSeen = true;
1008                if (contentType != null) {
1009                    int i;
1010                    if ((i = Arrays.binarySearch(KNOWN_CONTENT_TYPES, contentType)) > -1) {
1011                        if (!NAMESPACES_FOR_KNOWN_CONTENT_TYPES[i].equals(namespace)) {
1012                            String message = "\u201C"
1013                                    + contentType
1014                                    + "\u201D is not an appropriate Content-Type for a document whose root namespace is \u201C"
1015                                    + namespace + "\u201D.";
1016                            SAXParseException spe = new SAXParseException(message,
1017                                    locator);
1018                            errorHandler.warning(spe);
1019                        }
1020                    }
1021                }
1022            }
1023        }
1024    
1025        public void doctype(int doctype) throws SAXException {
1026            if (validator == null) {
1027                try {
1028                    validator = validatorByDoctype(doctype);
1029                } catch (IOException ioe) {
1030                    // At this point the schema comes from memory.
1031                    throw new RuntimeException(ioe);
1032                } catch (IncorrectSchemaException e) {
1033                    // At this point the schema comes from memory.
1034                    throw new RuntimeException(e);
1035                }
1036                switch (doctype) {
1037                    case DoctypeHandler.DOCTYPE_HTML5:
1038                        errorHandler.info("HTML5 doctype seen. Running the HTML parser in the HTML5 mode and using the preset for "
1039                                + schemaLabelFromDoctype(doctype) + ".");
1040                        break;
1041                    case DoctypeHandler.DOCTYPE_HTML401_STRICT:
1042                        errorHandler.info("HTML 4.01 Strict doctype seen. Running the HTML parser in the HTML 4.01 mode and using the preset for "
1043                                + schemaLabelFromDoctype(doctype) + ".");
1044                        break;
1045                    case DoctypeHandler.DOCTYPE_HTML401_TRANSITIONAL:
1046                        errorHandler.info("HTML 4.01 Transitional doctype seen. Running the HTML parser in the HTML 4.01 mode and using the preset for "
1047                                + schemaLabelFromDoctype(doctype) + ".");
1048                        break;
1049                }
1050                htmlParser.setContentHandler(validator.getContentHandler());
1051                htmlParser.refireStart();
1052            } else {
1053                switch (doctype) {
1054                    case DoctypeHandler.DOCTYPE_HTML5:
1055                        errorHandler.info("HTML5 doctype seen. Running the HTML parser in the HTML5 mode.");
1056                        break;
1057                    case DoctypeHandler.DOCTYPE_HTML401_STRICT:
1058                        errorHandler.info("HTML 4.01 Strict doctype seen. Running the HTML parser in the HTML 4.01 mode.");
1059                        break;
1060                    case DoctypeHandler.DOCTYPE_HTML401_TRANSITIONAL:
1061                        errorHandler.info("HTML 4.01 Transitional doctype seen. Running the HTML parser in the HTML 4.01 mode.");
1062                        break;
1063                }
1064            }
1065        }
1066    
1067        private String schemaLabelFromDoctype(int doctype) {
1068            for (int i = 0; i < presetDoctypes.length; i++) {
1069                if (doctype == presetDoctypes[i]) {
1070                    return presetLabels[i];
1071                }
1072            }
1073            throw new RuntimeException("Bug: Bad magic number.");
1074        }
1075    }