001 /*
002 * Copyright (c) 2005, 2006 Henri Sivonen
003 *
004 * Permission is hereby granted, free of charge, to any person obtaining a
005 * copy of this software and associated documentation files (the "Software"),
006 * to deal in the Software without restriction, including without limitation
007 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
008 * and/or sell copies of the Software, and to permit persons to whom the
009 * Software is furnished to do so, subject to the following conditions:
010 *
011 * The above copyright notice and this permission notice shall be included in
012 * all copies or substantial portions of the Software.
013 *
014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
020 * DEALINGS IN THE SOFTWARE.
021 */
022
023 package fi.iki.hsivonen.verifierservlet;
024
025 import java.io.BufferedReader;
026 import java.io.File;
027 import java.io.FileInputStream;
028 import java.io.IOException;
029 import java.io.InputStreamReader;
030 import java.io.OutputStream;
031 import java.net.MalformedURLException;
032 import java.util.Arrays;
033 import java.util.HashMap;
034 import java.util.HashSet;
035 import java.util.Iterator;
036 import java.util.LinkedList;
037 import java.util.List;
038 import java.util.Map;
039 import java.util.Set;
040 import java.util.SortedMap;
041 import java.util.TreeMap;
042 import java.util.regex.Pattern;
043
044 import javax.servlet.ServletException;
045 import javax.servlet.http.HttpServletRequest;
046 import javax.servlet.http.HttpServletResponse;
047
048 import net.java.dev.xmlidfilter.XMLIdFilter;
049
050 import org.apache.log4j.Logger;
051 import org.xml.sax.ContentHandler;
052 import org.xml.sax.EntityResolver;
053 import org.xml.sax.ErrorHandler;
054 import org.xml.sax.Locator;
055 import org.xml.sax.SAXException;
056 import org.xml.sax.SAXNotRecognizedException;
057 import org.xml.sax.SAXNotSupportedException;
058 import org.xml.sax.SAXParseException;
059 import org.xml.sax.XMLReader;
060
061 import com.hp.hpl.jena.iri.IRI;
062 import com.hp.hpl.jena.iri.IRIException;
063 import com.hp.hpl.jena.iri.IRIFactory;
064 import com.ibm.icu.text.Normalizer;
065 import com.thaiopensource.relaxng.impl.CombineValidator;
066 import com.thaiopensource.util.PropertyMap;
067 import com.thaiopensource.util.PropertyMapBuilder;
068 import com.thaiopensource.validate.IncorrectSchemaException;
069 import com.thaiopensource.validate.Schema;
070 import com.thaiopensource.validate.SchemaReader;
071 import com.thaiopensource.validate.ValidateProperty;
072 import com.thaiopensource.validate.Validator;
073 import com.thaiopensource.validate.auto.AutoSchemaReader;
074 import com.thaiopensource.validate.rng.CompactSchemaReader;
075 import com.thaiopensource.validate.rng.RngProperty;
076
077 import fi.iki.hsivonen.gnu.xml.aelfred2.SAXDriver;
078 import fi.iki.hsivonen.htmlparser.DoctypeHandler;
079 import fi.iki.hsivonen.htmlparser.HtmlParser;
080 import fi.iki.hsivonen.xml.AttributesImpl;
081 import fi.iki.hsivonen.xml.HtmlSerializer;
082 import fi.iki.hsivonen.xml.LocalCacheEntityResolver;
083 import fi.iki.hsivonen.xml.NullEntityResolver;
084 import fi.iki.hsivonen.xml.PrudentHttpEntityResolver;
085 import fi.iki.hsivonen.xml.SystemErrErrorHandler;
086 import fi.iki.hsivonen.xml.TypedInputSource;
087 import fi.iki.hsivonen.xml.XhtmlIdFilter;
088 import fi.iki.hsivonen.xml.XhtmlSaxEmitter;
089 import fi.iki.hsivonen.xml.checker.DebugChecker;
090 import fi.iki.hsivonen.xml.checker.NormalizationChecker;
091 import fi.iki.hsivonen.xml.checker.SignificantInlineChecker;
092 import fi.iki.hsivonen.xml.checker.TextContentChecker;
093 import fi.iki.hsivonen.xml.checker.jing.CheckerValidator;
094 import fi.iki.hsivonen.xml.checker.table.TableChecker;
095 import fi.karppinen.xml.CharacterUtil;
096
097 /**
098 * @version $Id: VerifierServletTransaction.java,v 1.10 2005/07/24 07:32:48
099 * hsivonen Exp $
100 * @author hsivonen
101 */
102 class VerifierServletTransaction implements DoctypeHandler {
// Logger for this class.
103 private static final Logger log4j = Logger.getLogger(VerifierServletTransaction.class);
104
// Matches runs of whitespace; used to split space-separated schema URL lists.
105 private static final Pattern SPACE = Pattern.compile("\\s+");
106
// Values for the "parser" field. The HTML_PARSER* values alias the
// DoctypeHandler doctype constants and are passed straight to
// HtmlParser.setDoctypeMode() in loadDocAndSetupParser().
// "xml" request parameter: XML parser that does not load external entities.
107 private static final int NO_EXTERNAL_ENTITIES = 4;
108
// "xmldtd" request parameter: XML parser that loads external entities.
109 private static final int EXTERNAL_ENTITIES_NO_VALIDATION = 5;
110
// "html" request parameter.
111 private static final int HTML_PARSER = DoctypeHandler.ANY_DOCTYPE;
112
// "html5" request parameter.
113 private static final int HTML_PARSER_5 = DoctypeHandler.DOCTYPE_HTML5;
114
// "html4" request parameter.
115 private static final int HTML_PARSER_4_STRICT = DoctypeHandler.DOCTYPE_HTML401_STRICT;
116
// "html4tr" request parameter.
117 private static final int HTML_PARSER_4_TRANSITIONAL = DoctypeHandler.DOCTYPE_HTML401_TRANSITIONAL;
118
// Default mode: the parser is chosen from the document's Content-Type.
119 private static final int AUTOMATIC_PARSER = 6;
120
// Not referenced anywhere else in this file; presumably used by subclasses
// (TODO confirm).
121 protected static final int XHTML5_SCHEMA = 7;
122
// Fixed texts emitted by emitTitle(), emitSuccess() and emitFailure().
123 private static final char[] SERVICE_TITLE = "Validation Service for RELAX NG ".toCharArray();
124
125 private static final char[] TWO_POINT_OH_BETA = "2.0 Beta".toCharArray();
126
127 private static final char[] RESULTS_TITLE = "Validation results for ".toCharArray();
128
129 private static final char[] SUCCESS = "The document validates according to the specified schema(s).".toCharArray();
130
131 private static final char[] FAILURE = "There were errors.".toCharArray();
132
// Filled by the static initializer from the cache configuration file (first
// column -> cache path prefix + second column) and handed to every
// LocalCacheEntityResolver created by this class.
133 private static final Map pathMap = new HashMap();
134
// Parallel arrays with one entry per line of the preset configuration file;
// populated by the static initializer. Entries of presetNamespaces are null
// where the configuration file contains "-".
135 private static int[] presetDoctypes;
136
137 private static String[] presetLabels;
138
139 private static String[] presetUrls;
140
141 private static String[] presetNamespaces;
142
// Must stay sorted: rootNamespace() looks content types up with
// Arrays.binarySearch().
143 private static final String[] KNOWN_CONTENT_TYPES = {
144 "application/atom+xml", "application/docbook+xml",
145 "application/xhtml+xml", "application/xv+xml" };
146
// Parallel to KNOWN_CONTENT_TYPES: the root namespace expected for each type.
147 private static final String[] NAMESPACES_FOR_KNOWN_CONTENT_TYPES = {
148 "http://www.w3.org/2005/Atom", "http://docbook.org/ns/docbook",
149 "http://www.w3.org/1999/xhtml", "http://www.w3.org/1999/xhtml" };
150
// Checker URLs that "http://hsivonen.iki.fi/checkers/all/" expands to in
// validatorByUrls().
151 private static final String[] ALL_CHECKERS = {
152 "http://hsivonen.iki.fi/checkers/table/",
153 "http://hsivonen.iki.fi/checkers/nfc/",
154 "http://hsivonen.iki.fi/checkers/significant-inline/",
155 "http://hsivonen.iki.fi/checkers/text-content/"};
156
// Checker URLs that "http://hsivonen.iki.fi/checkers/all-html4/" expands to.
157 private static final String[] ALL_CHECKERS_HTML4 = {
158 "http://hsivonen.iki.fi/checkers/table/",
159 "http://hsivonen.iki.fi/checkers/nfc/" };
160
// Creation time of this transaction; emitTotalDuration() reports the
// milliseconds elapsed since then.
161 private long start = System.currentTimeMillis();
162
163 private HttpServletRequest request;
164
165 private HttpServletResponse response;
166
// Used by scrubUrl() to parse the "doc" parameter.
167 private IRIFactory iriFactory;
168
// ASCII form of the document IRI to check, or null when no document was given.
169 protected String document;
170
// One of the parser mode constants above; set from the "parser" parameter.
171 private int parser = AUTOMATIC_PARSER;
172
// True when the "laxtype" request parameter is present.
173 private boolean laxType = false;
174
// Serializer that writes the response page.
175 protected ContentHandler contentHandler;
176
177 protected XhtmlSaxEmitter emitter;
178
179 protected XhtmlEmittingErrorHandler errorHandler;
180
// Scratch attribute list; cleared and refilled before each element that
// needs attributes.
181 private AttributesImpl attrs = new AttributesImpl();
182
183 private OutputStream out;
184
// Jing properties (error handler, entity resolver, XML reader creator)
// built in validate().
185 private PropertyMap jingPropertyMap;
186
187 protected LocalCacheEntityResolver entityResolver;
188
// Modification time of the preset configuration file; sent as Last-Modified
// by doGet().
189 private static long lastModified;
190
// Sorted schema URLs (filled from a TreeMap) and, at the same indices, the
// schemas preloaded by the static initializer; searched with
// Arrays.binarySearch() in schemaByUrl().
191 private static String[] preloadedSchemaUrls;
192
193 private static Schema[] preloadedSchemas;
194
// Space-separated schema/checker URLs chosen in setup().
195 private String schemaUrls = null;
196
197 protected Validator validator = null;
198
// Only set when the XML parser is started without a validator; see
// setupXmlParser() and rootNamespace().
199 private BufferingRootNamespaceSniffer bufferingRootNamespaceSniffer = null;
200
// Content type reported for the fetched document; set in validate().
201 private String contentType = null;
202
203 protected HtmlParser htmlParser = null;
204
205 protected XMLReader reader;
206
207 protected TypedInputSource documentInput;
208
209 protected PrudentHttpEntityResolver httpRes;
210
// URLs for which validatorByUrl() has already been asked for a validator in
// this transaction.
211 private Set loadedValidatorUrls = new HashSet();
212
// Set when the nfc checker is requested or when validate() finds no
// validator; makes validate() enable the nfc feature on the reader.
213 private boolean checkNormalization = false;
214
// Ensures the Content-Type versus root namespace check in rootNamespace()
// runs only once.
215 private boolean rootNamespaceSeen = false;
216
217 static {
218 try {
219 log4j.debug("Starting static initializer.");
220
221 String presetPath = System.getProperty("fi.iki.hsivonen.verifierservlet.presetconfpath");
222 File presetFile = new File(presetPath);
223 lastModified = presetFile.lastModified();
224 BufferedReader r = new BufferedReader(new InputStreamReader(
225 new FileInputStream(presetFile), "UTF-8"));
226 String line;
227 List doctypes = new LinkedList();
228 List namespaces = new LinkedList();
229 List labels = new LinkedList();
230 List urls = new LinkedList();
231
232 log4j.debug("Starting to loop over config file lines.");
233
234 while ((line = r.readLine()) != null) {
235 if ("".equals(line.trim())) {
236 break;
237 }
238 String s[] = line.split("\t");
239 doctypes.add(s[0]);
240 namespaces.add(s[1]);
241 labels.add(s[2]);
242 urls.add(s[3]);
243 }
244
245 log4j.debug("Finished reading config.");
246
247 String[] presetDoctypesAsStrings = (String[]) doctypes.toArray(new String[0]);
248 presetNamespaces = (String[]) namespaces.toArray(new String[0]);
249 presetLabels = (String[]) labels.toArray(new String[0]);
250 presetUrls = (String[]) urls.toArray(new String[0]);
251
252 log4j.debug("Converted config to arrays.");
253
254 for (int i = 0; i < presetNamespaces.length; i++) {
255 String str = presetNamespaces[i];
256 if ("-".equals(str)) {
257 presetNamespaces[i] = null;
258 } else {
259 presetNamespaces[i] = presetNamespaces[i].intern();
260 }
261 }
262
263 log4j.debug("Prepared namespace array.");
264
265 presetDoctypes = new int[presetDoctypesAsStrings.length];
266 for (int i = 0; i < presetDoctypesAsStrings.length; i++) {
267 presetDoctypes[i] = Integer.parseInt(presetDoctypesAsStrings[i]);
268 }
269
270 log4j.debug("Parsed doctype numbers into ints.");
271
272 String prefix = System.getProperty("fi.iki.hsivonen.verifierservlet.cachepathprefix");
273
274 log4j.debug("The cache path prefix is: " + prefix);
275
276 String cacheConfPath = System.getProperty("fi.iki.hsivonen.verifierservlet.cacheconfpath");
277
278 log4j.debug("The cache config path is: " + cacheConfPath);
279
280 r = new BufferedReader(new InputStreamReader(new FileInputStream(
281 cacheConfPath), "UTF-8"));
282 while ((line = r.readLine()) != null) {
283 if ("".equals(line.trim())) {
284 break;
285 }
286 String s[] = line.split("\t");
287 pathMap.put(s[0], prefix + s[1]);
288 }
289
290 log4j.debug("Cache config read.");
291
292 ErrorHandler eh = new SystemErrErrorHandler();
293 LocalCacheEntityResolver er = new LocalCacheEntityResolver(pathMap,
294 new NullEntityResolver());
295 er.setAllowRnc(true);
296 PropertyMapBuilder pmb = new PropertyMapBuilder();
297 pmb.put(ValidateProperty.ERROR_HANDLER, eh);
298 pmb.put(ValidateProperty.ENTITY_RESOLVER, er);
299 pmb.put(ValidateProperty.XML_READER_CREATOR,
300 new VerifierServletXMLReaderCreator(eh, er));
301 RngProperty.CHECK_ID_IDREF.add(pmb);
302 PropertyMap pMap = pmb.toPropertyMap();
303
304 log4j.debug("Parsing set up. Starting to read schemas.");
305
306 SortedMap schemaMap = new TreeMap();
307 for (int i = 0; i < presetUrls.length; i++) {
308 String[] urls1 = SPACE.split(presetUrls[i]);
309 for (int j = 0; j < urls1.length; j++) {
310 String url = urls1[j];
311 if (schemaMap.get(url) == null && !isCheckerUrl(url)) {
312 Schema sch = schemaByUrl(url, er, pMap);
313 schemaMap.put(url, sch);
314 }
315 }
316 }
317
318 log4j.debug("Schemas read.");
319
320 preloadedSchemaUrls = new String[schemaMap.size()];
321 preloadedSchemas = new Schema[schemaMap.size()];
322 int i = 0;
323 for (Iterator iter = schemaMap.entrySet().iterator(); iter.hasNext();) {
324 Map.Entry entry = (Map.Entry) iter.next();
325 preloadedSchemaUrls[i] = entry.getKey().toString().intern();
326 preloadedSchemas[i] = (Schema) entry.getValue();
327 i++;
328 }
329
330 log4j.debug("Initialization complete.");
331 } catch (Exception e) {
332 throw new RuntimeException(e);
333 }
334 }
335
336 protected static String scrub(String s) {
337 return Normalizer.normalize(
338 CharacterUtil.prudentlyScrubCharacterData(s), Normalizer.NFC);
339 }
340
341 private static boolean isCheckerUrl(String url) {
342 if ("http://hsivonen.iki.fi/checkers/all/".equals(url)) {
343 return true;
344 } else if ("http://hsivonen.iki.fi/checkers/all-html4/".equals(url)) {
345 return true;
346 }
347 for (int i = 0; i < ALL_CHECKERS.length; i++) {
348 if (ALL_CHECKERS[i].equals(url)) {
349 return true;
350 }
351 }
352 return false;
353 }
354
/**
 * Creates a transaction bound to one request/response pair.
 *
 * @param request the servlet request whose parameters drive the transaction
 * @param response the servlet response the result page is written to
 */
VerifierServletTransaction(HttpServletRequest request,
        HttpServletResponse response) {
    this.iriFactory = IRIFactory.iriImplementation();
    this.response = response;
    this.request = request;
}
365
366 protected boolean willValidate() {
367 return document != null;
368 }
369
370 void doGet() throws ServletException, IOException {
371 response.setContentType("text/html; charset=utf-8");
372
373 this.out = response.getOutputStream();
374
375 request.setCharacterEncoding("utf-8");
376
377 if (willValidate()) {
378 response.setDateHeader("Expires", 0);
379 response.setHeader("Cache-Control", "no-cache");
380 } else {
381 response.setDateHeader("Last-Modified", lastModified);
382 }
383
384 contentHandler = new HtmlSerializer(out, HtmlSerializer.DOCTYPE_HTML5,
385 false, "UTF-8");
386 emitter = new XhtmlSaxEmitter(contentHandler);
387
388 document = scrubUrl(request.getParameter("doc"));
389
390 document = ("".equals(document)) ? null : document;
391
392 setup();
393
394 try {
395 PageEmitter.emit(contentHandler, this);
396 } catch (SAXException e) {
397 throw new ServletException(e);
398 }
399 }
400
401 /**
402 * @throws ServletException
403 */
404 protected void setup() throws ServletException {
405 String preset = request.getParameter("preset");
406
407 if (preset != null && !"".equals(preset)) {
408 schemaUrls = preset;
409 } else {
410 schemaUrls = request.getParameter("schema");
411 }
412 if (schemaUrls == null) {
413 schemaUrls = "";
414 }
415
416 String parserStr = request.getParameter("parser");
417
418 if ("html".equals(parserStr)) {
419 parser = HTML_PARSER;
420 } else if ("xmldtd".equals(parserStr)) {
421 parser = EXTERNAL_ENTITIES_NO_VALIDATION;
422 } else if ("xml".equals(parserStr)) {
423 parser = NO_EXTERNAL_ENTITIES;
424 } else if ("html5".equals(parserStr)) {
425 parser = HTML_PARSER_5;
426 } else if ("html4".equals(parserStr)) {
427 parser = HTML_PARSER_4_STRICT;
428 } else if ("html4tr".equals(parserStr)) {
429 parser = HTML_PARSER_4_TRANSITIONAL;
430 } // else auto
431
432 laxType = (request.getParameter("laxtype") != null);
433 }
434
435 private boolean isHtmlUnsafePreset() {
436 if ("".equals(schemaUrls)) {
437 return false;
438 }
439 boolean preset = false;
440 for (int i = 0; i < presetUrls.length; i++) {
441 if (presetUrls[i].equals(schemaUrls)) {
442 preset = true;
443 break;
444 }
445 }
446 if (!preset) {
447 return false;
448 }
449 return !(schemaUrls.startsWith("http://hsivonen.iki.fi/xhtml-schema/xhtml-basic.rng")
450 || schemaUrls.startsWith("http://hsivonen.iki.fi/xhtml-schema/xhtml-strict.rng")
451 || schemaUrls.startsWith("http://hsivonen.iki.fi/xhtml-schema/xhtml-strict-wcag.rng")
452 || schemaUrls.startsWith("http://hsivonen.iki.fi/xhtml-schema/xhtml-transitional.rng")
453 || schemaUrls.startsWith("http://hsivonen.iki.fi/xhtml-schema/xhtml-transitional-wcag.rng") || schemaUrls.startsWith("http://syntax.whattf.org/relaxng/xhtml5full-html.rnc"));
454
455 }
456
457 /**
458 * @throws SAXException
459 */
460 void validate() throws SAXException {
461 if (!willValidate()) {
462 return;
463 }
464 try {
465 out.flush();
466 } catch (IOException e1) {
467 throw new SAXException(e1);
468 }
469 errorHandler = new XhtmlEmittingErrorHandler(contentHandler);
470 httpRes = new PrudentHttpEntityResolver(600 * 1024, laxType,
471 errorHandler);
472 entityResolver = new LocalCacheEntityResolver(pathMap, httpRes);
473 httpRes.setAllowRnc(true);
474 entityResolver.setAllowRnc(true);
475 boolean isValid = false;
476 boolean stats = true;
477 try {
478 this.errorHandler.start();
479 PropertyMapBuilder pmb = new PropertyMapBuilder();
480 pmb.put(ValidateProperty.ERROR_HANDLER, errorHandler);
481 pmb.put(ValidateProperty.ENTITY_RESOLVER, entityResolver);
482 pmb.put(ValidateProperty.XML_READER_CREATOR,
483 new VerifierServletXMLReaderCreator(errorHandler,
484 entityResolver));
485 RngProperty.CHECK_ID_IDREF.add(pmb);
486 jingPropertyMap = pmb.toPropertyMap();
487
488 tryToSetupValidator();
489
490 httpRes.setAllowRnc(false);
491 entityResolver.setAllowRnc(false);
492
493 loadDocAndSetupParser();
494
495 reader.setErrorHandler(errorHandler);
496 contentType = documentInput.getType();
497 if (validator == null) {
498 checkNormalization = true;
499 }
500 if (checkNormalization) {
501 reader.setFeature("http://hsivonen.iki.fi/checkers/nfc/", true);
502 }
503 reader.parse(documentInput);
504 isValid = !errorHandler.isErrors();
505 } catch (SAXException e) {
506 log4j.debug("SAXException", e);
507 } catch (IOException e) {
508 stats = false;
509 log4j.info("IOException", e);
510 errorHandler.ioError(e);
511 } catch (IncorrectSchemaException e) {
512 log4j.debug("IncorrectSchemaException", e);
513 errorHandler.schemaError(e);
514 } catch (RuntimeException e) {
515 stats = false;
516 log4j.error("RuntimeException, doc: " + document + " schema: "
517 + schemaUrls + " lax: " + laxType, e);
518 errorHandler.internalError(e, "Oops. That was not supposed to happen. A bug manifested itself in the application internals. Unable to continue. Sorry. The admin was notified.");
519 } catch (Error e) {
520 stats = false;
521 log4j.error("Error, doc: " + document + " schema: " + schemaUrls
522 + " lax: " + laxType, e);
523 errorHandler.internalError(e, "Oops. That was not supposed to happen. A bug manifested itself in the application internals. Unable to continue. Sorry. The admin was notified.");
524 } finally {
525 errorHandler.end();
526 }
527 if (isValid) {
528 attrs.clear();
529 attrs.addAttribute("class", "success");
530 emitter.startElement("p", attrs);
531 emitSuccess();
532 emitter.endElement("p");
533 } else {
534 attrs.clear();
535 attrs.addAttribute("class", "failure");
536 emitter.startElement("p", attrs);
537 emitFailure();
538 emitter.endElement("p");
539 }
540 if (stats) {
541 StatsEmitter.emit(contentHandler, this);
542 }
543 }
544
545 /**
546 * @throws SAXException
547 */
548 protected void emitSuccess() throws SAXException {
549 emitter.characters(SUCCESS);
550 }
551
552 protected void emitFailure() throws SAXException {
553 emitter.characters(FAILURE);
554 }
555
556 /**
557 * @throws SAXException
558 * @throws IOException
559 * @throws IncorrectSchemaException
560 */
561 protected void tryToSetupValidator() throws SAXException, IOException,
562 IncorrectSchemaException {
563 validator = validatorByUrls(schemaUrls);
564 }
565
566 /**
567 * @throws SAXException
568 * @throws IOException
569 * @throws IncorrectSchemaException
570 * @throws SAXNotRecognizedException
571 * @throws SAXNotSupportedException
572 */
573 protected void loadDocAndSetupParser() throws SAXException, IOException,
574 IncorrectSchemaException, SAXNotRecognizedException,
575 SAXNotSupportedException {
576 switch (parser) {
577 case HTML_PARSER:
578 case HTML_PARSER_5:
579 case HTML_PARSER_4_STRICT:
580 case HTML_PARSER_4_TRANSITIONAL:
581 if (isHtmlUnsafePreset()) {
582 String message = "The chosen preset schema is not appropriate for HTML.";
583 SAXException se = new SAXException(message);
584 errorHandler.schemaError(se);
585 throw se;
586 }
587 httpRes.setAllowGenericXml(false);
588 httpRes.setAllowHtml(true);
589 httpRes.setAcceptAllKnownXmlTypes(false);
590 httpRes.setAllowXhtml(false);
591 documentInput = (TypedInputSource) entityResolver.resolveEntity(
592 null, document);
593 htmlParser = new HtmlParser();
594 htmlParser.setDoctypeMode(parser); // magic numbers!
595 htmlParser.setDoctypeHandler(this);
596 reader = htmlParser;
597 if (validator == null) {
598 validator = validatorByDoctype(parser); // magic
599 // numbers!
600 // can still be null
601 }
602 if (validator != null) {
603 reader.setContentHandler(validator.getContentHandler());
604 }
605 break;
606 case NO_EXTERNAL_ENTITIES:
607 case EXTERNAL_ENTITIES_NO_VALIDATION:
608 httpRes.setAllowGenericXml(true);
609 httpRes.setAllowHtml(false);
610 httpRes.setAcceptAllKnownXmlTypes(true);
611 httpRes.setAllowXhtml(true);
612 documentInput = (TypedInputSource) entityResolver.resolveEntity(
613 null, document);
614 reader = setupXmlParser();
615 break;
616 default:
617 httpRes.setAllowGenericXml(true);
618 httpRes.setAllowHtml(true);
619 httpRes.setAcceptAllKnownXmlTypes(true);
620 httpRes.setAllowXhtml(true);
621 documentInput = (TypedInputSource) entityResolver.resolveEntity(
622 null, document);
623 if ("text/html".equals(documentInput.getType())) {
624 if (isHtmlUnsafePreset()) {
625 String message = "The Content-Type was \u201Ctext/html\u201D, but the chosen preset schema is not appropriate for HTML.";
626 SAXException se = new SAXException(message);
627 errorHandler.schemaError(se);
628 throw se;
629 }
630 errorHandler.info("The Content-Type was \u201Ctext/html\u201D. Using the HTML parser.");
631 htmlParser = new HtmlParser();
632 htmlParser.setDoctypeMode(DoctypeHandler.ANY_DOCTYPE);
633 htmlParser.setDoctypeHandler(this);
634 reader = htmlParser;
635 if (validator != null) {
636 reader.setContentHandler(validator.getContentHandler());
637 }
638 } else {
639 errorHandler.info("The Content-Type was \u201C"
640 + documentInput.getType()
641 + "\u201D. Using the XML parser (not resolving external entities).");
642 reader = setupXmlParser();
643 }
644 break;
645 }
646 }
647
648 protected Validator validatorByDoctype(int doctype) throws SAXException,
649 IOException, IncorrectSchemaException {
650 if (doctype == ANY_DOCTYPE) {
651 return null;
652 }
653 for (int i = 0; i < presetDoctypes.length; i++) {
654 if (presetDoctypes[i] == doctype) {
655 return validatorByUrls(presetUrls[i]);
656 }
657 }
658 throw new RuntimeException("Doctype mappings not initialized properly.");
659 }
660
661 /**
662 * @param entityResolver2
663 * @return
664 * @throws SAXNotRecognizedException
665 * @throws SAXNotSupportedException
666 */
667 protected XMLReader setupXmlParser() throws SAXNotRecognizedException,
668 SAXNotSupportedException {
669 XMLReader reader;
670 reader = new SAXDriver();
671 reader = new XhtmlIdFilter(new XMLIdFilter(reader));
672 reader.setFeature(
673 "http://xml.org/sax/features/external-general-entities",
674 parser == EXTERNAL_ENTITIES_NO_VALIDATION);
675 reader.setFeature(
676 "http://xml.org/sax/features/external-parameter-entities",
677 parser == EXTERNAL_ENTITIES_NO_VALIDATION);
678 if (parser == EXTERNAL_ENTITIES_NO_VALIDATION) {
679 reader.setEntityResolver(entityResolver);
680 } else {
681 reader.setEntityResolver(new NullEntityResolver());
682 }
683 if (validator == null) {
684 bufferingRootNamespaceSniffer = new BufferingRootNamespaceSniffer(
685 this);
686 reader.setContentHandler(bufferingRootNamespaceSniffer);
687 } else {
688 reader.setContentHandler(new RootNamespaceSniffer(this,
689 validator.getContentHandler()));
690 reader.setDTDHandler(validator.getDTDHandler());
691 }
692 return reader;
693 }
694
695 /**
696 * @param validator
697 * @return
698 * @throws SAXException
699 * @throws IOException
700 * @throws IncorrectSchemaException
701 */
702 private Validator validatorByUrls(String schemaList) throws SAXException,
703 IOException, IncorrectSchemaException {
704 Validator validator = null;
705 String[] schemas = SPACE.split(schemaList);
706 for (int i = schemas.length - 1; i > -1; i--) {
707 String url = schemas[i];
708 if ("http://hsivonen.iki.fi/checkers/all/".equals(url)) {
709 for (int j = 0; j < ALL_CHECKERS.length; j++) {
710 validator = combineValidatorByUrl(validator,
711 ALL_CHECKERS[j]);
712 }
713 } else if ("http://hsivonen.iki.fi/checkers/all-html4/".equals(url)) {
714 for (int j = 0; j < ALL_CHECKERS_HTML4.length; j++) {
715 validator = combineValidatorByUrl(validator,
716 ALL_CHECKERS_HTML4[j]);
717 }
718 } else {
719 validator = combineValidatorByUrl(validator, url);
720 }
721 }
722 return validator;
723 }
724
725 /**
726 * @param validator
727 * @param url
728 * @return
729 * @throws SAXException
730 * @throws IOException
731 * @throws IncorrectSchemaException
732 */
733 private Validator combineValidatorByUrl(Validator validator, String url)
734 throws SAXException, IOException, IncorrectSchemaException {
735 if (!"".equals(url)) {
736 Validator v = validatorByUrl(url);
737 if (validator == null) {
738 validator = v;
739 } else {
740 validator = new CombineValidator(v, validator);
741 }
742 }
743 return validator;
744 }
745
746 /**
747 * @param url
748 * @return
749 * @throws SAXException
750 * @throws IOException
751 * @throws IncorrectSchemaException
752 */
753 private Validator validatorByUrl(String url) throws SAXException,
754 IOException, IncorrectSchemaException {
755 if (loadedValidatorUrls.contains(url)) {
756 return null;
757 }
758 loadedValidatorUrls.add(url);
759 if ("http://hsivonen.iki.fi/checkers/table/".equals(url)) {
760 return new CheckerValidator(new TableChecker(), jingPropertyMap);
761 } else if ("http://hsivonen.iki.fi/checkers/nfc/".equals(url)) {
762 this.checkNormalization = true;
763 return new CheckerValidator(new NormalizationChecker(),
764 jingPropertyMap);
765 } else if ("http://hsivonen.iki.fi/checkers/significant-inline/".equals(url)) {
766 return new CheckerValidator(new SignificantInlineChecker(),
767 jingPropertyMap);
768 } else if ("http://hsivonen.iki.fi/checkers/debug/".equals(url)) {
769 return new CheckerValidator(new DebugChecker(),
770 jingPropertyMap);
771 } else if ("http://hsivonen.iki.fi/checkers/text-content/".equals(url)) {
772 return new CheckerValidator(new TextContentChecker(),
773 jingPropertyMap);
774 }
775 Schema sch = schemaByUrl(url);
776 Validator validator = sch.createValidator(jingPropertyMap);
777 return validator;
778 }
779
780 /**
781 * @param url
782 * @return
783 * @throws SAXException
784 * @throws IOException
785 * @throws IncorrectSchemaException
786 */
787 private Schema schemaByUrl(String url) throws SAXException, IOException,
788 IncorrectSchemaException {
789 int i = Arrays.binarySearch(preloadedSchemaUrls, url);
790 if (i > -1) {
791 return preloadedSchemas[i];
792 }
793
794 TypedInputSource schemaInput = (TypedInputSource) entityResolver.resolveEntity(
795 null, url);
796 SchemaReader sr = null;
797 if ("application/relax-ng-compact-syntax".equals(schemaInput.getType())) {
798 sr = CompactSchemaReader.getInstance();
799 } else {
800 sr = new AutoSchemaReader();
801 }
802 Schema sch = sr.createSchema(schemaInput, jingPropertyMap);
803 return sch;
804 }
805
806 /**
807 * @param url
808 * @return
809 * @throws SAXException
810 * @throws IOException
811 * @throws IncorrectSchemaException
812 */
813 private static Schema schemaByUrl(String url, EntityResolver resolver,
814 PropertyMap pMap) throws SAXException, IOException,
815 IncorrectSchemaException {
816 log4j.debug("Will load schema: " + url);
817 TypedInputSource schemaInput = (TypedInputSource) resolver.resolveEntity(
818 null, url);
819 SchemaReader sr = null;
820 if ("application/relax-ng-compact-syntax".equals(schemaInput.getType())) {
821 sr = CompactSchemaReader.getInstance();
822 } else {
823 sr = new AutoSchemaReader();
824 }
825 Schema sch = sr.createSchema(schemaInput, pMap);
826 return sch;
827 }
828
829 /**
830 * @throws SAXException
831 */
832 void emitTitle(boolean markupAllowed) throws SAXException {
833 if (willValidate()) {
834 emitter.characters(RESULTS_TITLE);
835 emitter.characters(scrub(document));
836 } else {
837 emitter.characters(SERVICE_TITLE);
838 if (markupAllowed) {
839 emitter.startElement("span");
840 emitter.characters(TWO_POINT_OH_BETA);
841 emitter.endElement("span");
842 }
843 }
844 }
845
846 void emitForm() throws SAXException {
847 attrs.clear();
848 attrs.addAttribute("method", "get");
849 attrs.addAttribute("action", request.getRequestURL().toString());
850 attrs.addAttribute("onsubmit", "formSubmission()");
851 emitter.startElement("form", attrs);
852 emitFormContent();
853 emitter.endElement("form");
854 }
855
856 /**
857 * @throws SAXException
858 */
859 protected void emitFormContent() throws SAXException {
860 FormEmitter.emit(contentHandler, this);
861 }
862
863 void emitSchemaField() throws SAXException {
864 attrs.clear();
865 attrs.addAttribute("name", "schema");
866 attrs.addAttribute("id", "schema");
867 attrs.addAttribute("onchange", "schemaChanged();");
868 attrs.addAttribute("pattern", "(?:https?://.+(?:\\s+https?://.+)*)?");
869 attrs.addAttribute(
870 "title",
871 "The schema field takes zero or more space-separated absolute IRIs (http or https only) of the schemas that the document is to be validated against. (When left blank, the service will attempt to pick schemas automatically.)");
872 if (schemaUrls != null) {
873 attrs.addAttribute("value", scrub(schemaUrls));
874 }
875 emitter.startElement("input", attrs);
876 emitter.endElement("input");
877 }
878
879 void emitDocField() throws SAXException {
880 attrs.clear();
881 attrs.addAttribute("type", "url");
882 attrs.addAttribute("name", "doc");
883 attrs.addAttribute("id", "doc");
884 attrs.addAttribute("pattern", "(?:https?://.+)?");
885 attrs.addAttribute(
886 "title",
887 "The document field takes the absolute IRI (http or https only) of the document to be checked. (The document field can also be left blank in order to bookmark settings.)");
888 if (document != null) {
889 attrs.addAttribute("value", scrub(document));
890 }
891 emitter.startElement("input", attrs);
892 emitter.endElement("input");
893 }
894
895 private String scrubUrl(String urlStr) {
896 if (urlStr == null) {
897 return null;
898 }
899
900 try {
901 IRI iri = iriFactory.construct(urlStr);
902 return iri.toASCIIString();
903 } catch (IRIException e) {
904 return null;
905 } catch (MalformedURLException e) {
906 return null;
907 }
908 }
909
910 /**
911 * @throws SAXException
912 *
913 */
// The body is empty, so nothing is emitted here. Presumably a placeholder
// for reporting the schema loading time (inferred from the name; TODO confirm).
914 void emitSchemaDuration() throws SAXException {
915 }
916
917 /**
918 * @throws SAXException
919 *
920 */
// The body is empty, so nothing is emitted here. Presumably a placeholder
// for reporting the document processing time (inferred from the name; TODO confirm).
921 void emitDocDuration() throws SAXException {
922 }
923
924 /**
925 * @throws SAXException
926 *
927 */
928 void emitTotalDuration() throws SAXException {
929 emitter.characters("" + (System.currentTimeMillis() - start));
930 }
931
932 /**
933 * @throws SAXException
934 *
935 */
936 void emitPresetOptions() throws SAXException {
937 for (int i = 0; i < presetUrls.length; i++) {
938 emitter.option(presetLabels[i], presetUrls[i], false);
939 }
940 }
941
942 /**
943 * @throws SAXException
944 *
945 */
946 void emitParserOptions() throws SAXException {
947 emitter.option("Automatically from Content-Type", "",
948 (parser == AUTOMATIC_PARSER));
949 emitter.option("XML; don\u2019t load external entities", "xml",
950 (parser == NO_EXTERNAL_ENTITIES));
951 emitter.option("XML; load external entities", "xmldtd",
952 (parser == EXTERNAL_ENTITIES_NO_VALIDATION));
953 emitter.option("HTML; flavor from doctype", "html",
954 (parser == HTML_PARSER));
955 emitter.option("HTML5", "html5", (parser == HTML_PARSER_5));
956 emitter.option("HTML 4.01 Strict", "html4",
957 (parser == HTML_PARSER_4_STRICT));
958 emitter.option("HTML 4.01 Transitional", "html4tr",
959 (parser == HTML_PARSER_4_TRANSITIONAL));
960 }
961
962 /**
963 * @throws SAXException
964 *
965 */
966 void emitLaxTypeField() throws SAXException {
967 emitter.checkbox("laxtype", "yes", laxType);
968 }
969
970 void rootNamespace(String namespace, Locator locator) throws SAXException {
971 if (validator == null) {
972 int index = -1;
973 for (int i = 0; i < presetNamespaces.length; i++) {
974 if (namespace.equals(presetNamespaces[i])) {
975 index = i;
976 break;
977 }
978 }
979 if (index == -1) {
980 String message = "Cannot find preset schema for namespace: \u201C"
981 + namespace + "\u201D.";
982 SAXException se = new SAXException(message);
983 errorHandler.schemaError(se);
984 throw se;
985 }
986 String label = presetLabels[index];
987 String urls = presetUrls[index];
988 errorHandler.info("Using the preset for " + label
989 + " based on the root namespace.");
990 try {
991 validator = validatorByUrls(urls);
992 } catch (IOException ioe) {
993 // At this point the schema comes from memory.
994 throw new RuntimeException(ioe);
995 } catch (IncorrectSchemaException e) {
996 // At this point the schema comes from memory.
997 throw new RuntimeException(e);
998 }
999 if (bufferingRootNamespaceSniffer == null) {
1000 throw new RuntimeException(
1001 "Bug! bufferingRootNamespaceSniffer was null.");
1002 }
1003 bufferingRootNamespaceSniffer.setContentHandler(validator.getContentHandler());
1004 }
1005
1006 if (!rootNamespaceSeen) {
1007 rootNamespaceSeen = true;
1008 if (contentType != null) {
1009 int i;
1010 if ((i = Arrays.binarySearch(KNOWN_CONTENT_TYPES, contentType)) > -1) {
1011 if (!NAMESPACES_FOR_KNOWN_CONTENT_TYPES[i].equals(namespace)) {
1012 String message = "\u201C"
1013 + contentType
1014 + "\u201D is not an appropriate Content-Type for a document whose root namespace is \u201C"
1015 + namespace + "\u201D.";
1016 SAXParseException spe = new SAXParseException(message,
1017 locator);
1018 errorHandler.warning(spe);
1019 }
1020 }
1021 }
1022 }
1023 }
1024
1025 public void doctype(int doctype) throws SAXException {
1026 if (validator == null) {
1027 try {
1028 validator = validatorByDoctype(doctype);
1029 } catch (IOException ioe) {
1030 // At this point the schema comes from memory.
1031 throw new RuntimeException(ioe);
1032 } catch (IncorrectSchemaException e) {
1033 // At this point the schema comes from memory.
1034 throw new RuntimeException(e);
1035 }
1036 switch (doctype) {
1037 case DoctypeHandler.DOCTYPE_HTML5:
1038 errorHandler.info("HTML5 doctype seen. Running the HTML parser in the HTML5 mode and using the preset for "
1039 + schemaLabelFromDoctype(doctype) + ".");
1040 break;
1041 case DoctypeHandler.DOCTYPE_HTML401_STRICT:
1042 errorHandler.info("HTML 4.01 Strict doctype seen. Running the HTML parser in the HTML 4.01 mode and using the preset for "
1043 + schemaLabelFromDoctype(doctype) + ".");
1044 break;
1045 case DoctypeHandler.DOCTYPE_HTML401_TRANSITIONAL:
1046 errorHandler.info("HTML 4.01 Transitional doctype seen. Running the HTML parser in the HTML 4.01 mode and using the preset for "
1047 + schemaLabelFromDoctype(doctype) + ".");
1048 break;
1049 }
1050 htmlParser.setContentHandler(validator.getContentHandler());
1051 htmlParser.refireStart();
1052 } else {
1053 switch (doctype) {
1054 case DoctypeHandler.DOCTYPE_HTML5:
1055 errorHandler.info("HTML5 doctype seen. Running the HTML parser in the HTML5 mode.");
1056 break;
1057 case DoctypeHandler.DOCTYPE_HTML401_STRICT:
1058 errorHandler.info("HTML 4.01 Strict doctype seen. Running the HTML parser in the HTML 4.01 mode.");
1059 break;
1060 case DoctypeHandler.DOCTYPE_HTML401_TRANSITIONAL:
1061 errorHandler.info("HTML 4.01 Transitional doctype seen. Running the HTML parser in the HTML 4.01 mode.");
1062 break;
1063 }
1064 }
1065 }
1066
1067 private String schemaLabelFromDoctype(int doctype) {
1068 for (int i = 0; i < presetDoctypes.length; i++) {
1069 if (doctype == presetDoctypes[i]) {
1070 return presetLabels[i];
1071 }
1072 }
1073 throw new RuntimeException("Bug: Bad magic number.");
1074 }
1075 }