/*
 * Copyright (c) 2005, 2006 Henri Sivonen
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

package fi.iki.hsivonen.verifierservlet;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.regex.Pattern;

import javax.servlet.ServletException;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;

import net.java.dev.xmlidfilter.XMLIdFilter;

import org.apache.log4j.Logger;
import org.xml.sax.ContentHandler;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;

import com.hp.hpl.jena.iri.IRI;
import com.hp.hpl.jena.iri.IRIException;
import com.hp.hpl.jena.iri.IRIFactory;
import com.ibm.icu.text.Normalizer;
import com.thaiopensource.relaxng.impl.CombineValidator;
import com.thaiopensource.util.PropertyMap;
import com.thaiopensource.util.PropertyMapBuilder;
import com.thaiopensource.validate.IncorrectSchemaException;
import com.thaiopensource.validate.Schema;
import com.thaiopensource.validate.SchemaReader;
import com.thaiopensource.validate.ValidateProperty;
import com.thaiopensource.validate.Validator;
import com.thaiopensource.validate.auto.AutoSchemaReader;
import com.thaiopensource.validate.rng.CompactSchemaReader;
import com.thaiopensource.validate.rng.RngProperty;

import fi.iki.hsivonen.gnu.xml.aelfred2.SAXDriver;
import fi.iki.hsivonen.htmlparser.DoctypeHandler;
import fi.iki.hsivonen.htmlparser.HtmlParser;
import fi.iki.hsivonen.xml.AttributesImpl;
import fi.iki.hsivonen.xml.HtmlSerializer;
import fi.iki.hsivonen.xml.LocalCacheEntityResolver;
import fi.iki.hsivonen.xml.NullEntityResolver;
import fi.iki.hsivonen.xml.PrudentHttpEntityResolver;
import fi.iki.hsivonen.xml.SystemErrErrorHandler;
import fi.iki.hsivonen.xml.TypedInputSource;
import fi.iki.hsivonen.xml.XhtmlIdFilter;
import fi.iki.hsivonen.xml.XhtmlSaxEmitter;
import fi.iki.hsivonen.xml.checker.DebugChecker;
import fi.iki.hsivonen.xml.checker.NormalizationChecker;
import fi.iki.hsivonen.xml.checker.SignificantInlineChecker;
import fi.iki.hsivonen.xml.checker.TextContentChecker;
import fi.iki.hsivonen.xml.checker.jing.CheckerValidator;
import fi.iki.hsivonen.xml.checker.table.TableChecker;
import fi.karppinen.xml.CharacterUtil;

/**
 * Handles one request/response pair of the validation service: reads the
 * request parameters, emits the HTML page and, when a document URL was
 * supplied, parses that document and reports the validation outcome.
 *
 * <p>
 * Preset schemas and the local schema cache are configured once, in the
 * static initializer, from files named by the system properties
 * <code>fi.iki.hsivonen.verifierservlet.presetconfpath</code>,
 * <code>fi.iki.hsivonen.verifierservlet.cacheconfpath</code> and
 * <code>fi.iki.hsivonen.verifierservlet.cachepathprefix</code>.
 *
 * @version $Id: VerifierServletTransaction.java,v 1.10 2005/07/24 07:32:48
 *          hsivonen Exp $
 * @author hsivonen
 */
class VerifierServletTransaction implements DoctypeHandler {
    private static final Logger log4j = Logger.getLogger(VerifierServletTransaction.class);

    /** Separator between the URLs of a schema list. */
    private static final Pattern SPACE = Pattern.compile("\\s+");

    // Parser selection codes. The HTML_PARSER* codes deliberately reuse the
    // DoctypeHandler constants because they are handed straight to
    // HtmlParser.setDoctypeMode() and validatorByDoctype().

    private static final int NO_EXTERNAL_ENTITIES = 4;

    private static final int EXTERNAL_ENTITIES_NO_VALIDATION = 5;

    private static final int HTML_PARSER = DoctypeHandler.ANY_DOCTYPE;

    private static final int HTML_PARSER_5 = DoctypeHandler.DOCTYPE_HTML5;

    private static final int HTML_PARSER_4_STRICT = DoctypeHandler.DOCTYPE_HTML401_STRICT;

    private static final int HTML_PARSER_4_TRANSITIONAL = DoctypeHandler.DOCTYPE_HTML401_TRANSITIONAL;

    private static final int AUTOMATIC_PARSER = 6;

    protected static final int XHTML5_SCHEMA = 7;

    private static final char[] SERVICE_TITLE = "Validation Service for RELAX NG ".toCharArray();

    private static final char[] TWO_POINT_OH_BETA = "2.0 Beta".toCharArray();

    private static final char[] RESULTS_TITLE = "Validation results for ".toCharArray();

    private static final char[] SUCCESS = "The document validates according to the specified schema(s).".toCharArray();

    private static final char[] FAILURE = "There were errors.".toCharArray();

    /** Message shown to the user when an unexpected exception or error escapes. */
    private static final String INTERNAL_ERROR_MESSAGE = "Oops. That was not supposed to happen. A bug manifested itself in the application internals. Unable to continue. Sorry. The admin was notified.";

    // Pseudo-URLs that select built-in checkers instead of schemas.

    private static final String CHECKERS_ALL_URL = "http://hsivonen.iki.fi/checkers/all/";

    private static final String CHECKERS_ALL_HTML4_URL = "http://hsivonen.iki.fi/checkers/all-html4/";

    private static final String TABLE_CHECKER_URL = "http://hsivonen.iki.fi/checkers/table/";

    private static final String NFC_CHECKER_URL = "http://hsivonen.iki.fi/checkers/nfc/";

    private static final String SIGNIFICANT_INLINE_CHECKER_URL = "http://hsivonen.iki.fi/checkers/significant-inline/";

    private static final String TEXT_CONTENT_CHECKER_URL = "http://hsivonen.iki.fi/checkers/text-content/";

    private static final String DEBUG_CHECKER_URL = "http://hsivonen.iki.fi/checkers/debug/";

    /** Maps the first column of the cache config to prefix + second column. */
    private static final Map pathMap = new HashMap();

    /** Must stay sorted: it is searched with {@link Arrays#binarySearch}. */
    private static final String[] KNOWN_CONTENT_TYPES = {
            "application/atom+xml", "application/docbook+xml",
            "application/xhtml+xml", "application/xv+xml" };

    /** Root namespace expected for the content type at the same index. */
    private static final String[] NAMESPACES_FOR_KNOWN_CONTENT_TYPES = {
            "http://www.w3.org/2005/Atom", "http://docbook.org/ns/docbook",
            "http://www.w3.org/1999/xhtml", "http://www.w3.org/1999/xhtml" };

    /** Checkers selected by {@link #CHECKERS_ALL_URL}. */
    private static final String[] ALL_CHECKERS = { TABLE_CHECKER_URL,
            NFC_CHECKER_URL, SIGNIFICANT_INLINE_CHECKER_URL,
            TEXT_CONTENT_CHECKER_URL };

    /** Checkers selected by {@link #CHECKERS_ALL_HTML4_URL}. */
    private static final String[] ALL_CHECKERS_HTML4 = { TABLE_CHECKER_URL,
            NFC_CHECKER_URL };

    /**
     * Schema URL prefixes for which {@link #isHtmlUnsafePreset()} answers
     * <code>false</code>.
     */
    private static final String[] HTML_SAFE_SCHEMA_PREFIXES = {
            "http://hsivonen.iki.fi/xhtml-schema/xhtml-basic.rng",
            "http://hsivonen.iki.fi/xhtml-schema/xhtml-strict.rng",
            "http://hsivonen.iki.fi/xhtml-schema/xhtml-strict-wcag.rng",
            "http://hsivonen.iki.fi/xhtml-schema/xhtml-transitional.rng",
            "http://hsivonen.iki.fi/xhtml-schema/xhtml-transitional-wcag.rng",
            "http://syntax.whattf.org/relaxng/xhtml5full-html.rnc" };

    // Preset table: index i of each array describes the same preset.

    private static int[] presetDoctypes;

    private static String[] presetLabels;

    private static String[] presetUrls;

    /** Interned namespace per preset; <code>null</code> where the config said "-". */
    private static String[] presetNamespaces;

    /** Modification time of the preset config file. */
    private static long lastModified;

    /** Sorted URLs of the schemas parsed at class initialization. */
    private static String[] preloadedSchemaUrls;

    /** Schema for the URL at the same index of {@link #preloadedSchemaUrls}. */
    private static Schema[] preloadedSchemas;

    /** Creation time of this transaction; basis of the reported duration. */
    private long start = System.currentTimeMillis();

    private HttpServletRequest request;

    private HttpServletResponse response;

    private IRIFactory iriFactory;

    /** URL of the document to check, or <code>null</code> when none was given. */
    protected String document;

    private int parser = AUTOMATIC_PARSER;

    private boolean laxType = false;

    protected ContentHandler contentHandler;

    protected XhtmlSaxEmitter emitter;

    protected XhtmlEmittingErrorHandler errorHandler;

    /** Scratch attribute list reused by the emit methods. */
    private AttributesImpl attrs = new AttributesImpl();

    private OutputStream out;

    private PropertyMap jingPropertyMap;

    protected LocalCacheEntityResolver entityResolver;

    /** Whitespace-separated schema/checker URLs requested for this run. */
    private String schemaUrls = null;

    protected Validator validator = null;

    private BufferingRootNamespaceSniffer bufferingRootNamespaceSniffer = null;

    private String contentType = null;

    protected HtmlParser htmlParser = null;

    protected XMLReader reader;

    protected TypedInputSource documentInput;

    protected PrudentHttpEntityResolver httpRes;

    /** URLs already turned into validators during this transaction. */
    private Set loadedValidatorUrls = new HashSet();

    private boolean checkNormalization = false;

    private boolean rootNamespaceSeen = false;

    static {
        try {
            log4j.debug("Starting static initializer.");

            String presetPath = System.getProperty("fi.iki.hsivonen.verifierservlet.presetconfpath");
            File presetFile = new File(presetPath);
            lastModified = presetFile.lastModified();

            List doctypes = new LinkedList();
            List namespaces = new LinkedList();
            List labels = new LinkedList();
            List urls = new LinkedList();
            String line;

            // Preset config: tab-separated doctype, namespace, label, URLs.
            // Reading stops at the first blank line.
            BufferedReader r = new BufferedReader(new InputStreamReader(
                    new FileInputStream(presetFile), "UTF-8"));
            try {
                log4j.debug("Starting to loop over config file lines.");

                while ((line = r.readLine()) != null) {
                    if ("".equals(line.trim())) {
                        break;
                    }
                    String[] columns = line.split("\t");
                    doctypes.add(columns[0]);
                    namespaces.add(columns[1]);
                    labels.add(columns[2]);
                    urls.add(columns[3]);
                }
            } finally {
                // The reader used to be left open.
                r.close();
            }

            log4j.debug("Finished reading config.");

            String[] presetDoctypesAsStrings = (String[]) doctypes.toArray(new String[0]);
            presetNamespaces = (String[]) namespaces.toArray(new String[0]);
            presetLabels = (String[]) labels.toArray(new String[0]);
            presetUrls = (String[]) urls.toArray(new String[0]);

            log4j.debug("Converted config to arrays.");

            for (int i = 0; i < presetNamespaces.length; i++) {
                if ("-".equals(presetNamespaces[i])) {
                    presetNamespaces[i] = null;
                } else {
                    presetNamespaces[i] = presetNamespaces[i].intern();
                }
            }

            log4j.debug("Prepared namespace array.");

            presetDoctypes = new int[presetDoctypesAsStrings.length];
            for (int i = 0; i < presetDoctypesAsStrings.length; i++) {
                presetDoctypes[i] = Integer.parseInt(presetDoctypesAsStrings[i]);
            }

            log4j.debug("Parsed doctype numbers into ints.");

            String prefix = System.getProperty("fi.iki.hsivonen.verifierservlet.cachepathprefix");

            log4j.debug("The cache path prefix is: " + prefix);

            String cacheConfPath = System.getProperty("fi.iki.hsivonen.verifierservlet.cacheconfpath");

            log4j.debug("The cache config path is: " + cacheConfPath);

            // Cache config: tab-separated key and path relative to the prefix.
            r = new BufferedReader(new InputStreamReader(new FileInputStream(
                    cacheConfPath), "UTF-8"));
            try {
                while ((line = r.readLine()) != null) {
                    if ("".equals(line.trim())) {
                        break;
                    }
                    String[] columns = line.split("\t");
                    pathMap.put(columns[0], prefix + columns[1]);
                }
            } finally {
                r.close();
            }

            log4j.debug("Cache config read.");

            ErrorHandler eh = new SystemErrErrorHandler();
            LocalCacheEntityResolver er = new LocalCacheEntityResolver(pathMap,
                    new NullEntityResolver());
            er.setAllowRnc(true);
            PropertyMapBuilder pmb = new PropertyMapBuilder();
            pmb.put(ValidateProperty.ERROR_HANDLER, eh);
            pmb.put(ValidateProperty.ENTITY_RESOLVER, er);
            pmb.put(ValidateProperty.XML_READER_CREATOR,
                    new VerifierServletXMLReaderCreator(eh, er));
            RngProperty.CHECK_ID_IDREF.add(pmb);
            PropertyMap pMap = pmb.toPropertyMap();

            log4j.debug("Parsing set up. Starting to read schemas.");

            // A sorted map, so that the resulting URL array can be searched
            // with Arrays.binarySearch() in schemaByUrl(String).
            SortedMap schemaMap = new TreeMap();
            for (int i = 0; i < presetUrls.length; i++) {
                String[] presetSchemaUrls = SPACE.split(presetUrls[i]);
                for (int j = 0; j < presetSchemaUrls.length; j++) {
                    String url = presetSchemaUrls[j];
                    if (schemaMap.get(url) == null && !isCheckerUrl(url)) {
                        schemaMap.put(url, schemaByUrl(url, er, pMap));
                    }
                }
            }

            log4j.debug("Schemas read.");

            preloadedSchemaUrls = new String[schemaMap.size()];
            preloadedSchemas = new Schema[schemaMap.size()];
            int index = 0;
            for (Iterator iter = schemaMap.entrySet().iterator(); iter.hasNext();) {
                Map.Entry entry = (Map.Entry) iter.next();
                preloadedSchemaUrls[index] = entry.getKey().toString().intern();
                preloadedSchemas[index] = (Schema) entry.getValue();
                index++;
            }

            log4j.debug("Initialization complete.");
        } catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Prepares a string for output by passing it through
     * <code>CharacterUtil.prudentlyScrubCharacterData</code> and then
     * normalizing the result to NFC.
     *
     * @param s
     *            the string to clean up
     * @return the scrubbed, NFC-normalized string
     */
    protected static String scrub(String s) {
        return Normalizer.normalize(
                CharacterUtil.prudentlyScrubCharacterData(s), Normalizer.NFC);
    }

    /**
     * Tells whether a URL names a built-in checker (or checker group) rather
     * than a schema that has to be loaded.
     *
     * @param url
     *            the URL to classify
     * @return <code>true</code> for the checker pseudo-URLs handled by
     *         {@link #validatorByUrls(String)} and
     *         {@link #validatorByUrl(String)}
     */
    private static boolean isCheckerUrl(String url) {
        if (CHECKERS_ALL_URL.equals(url) || CHECKERS_ALL_HTML4_URL.equals(url)) {
            return true;
        }
        // validatorByUrl() also treats the debug checker as built-in; without
        // this test a preset naming it would be fetched as a schema.
        if (DEBUG_CHECKER_URL.equals(url)) {
            return true;
        }
        for (int i = 0; i < ALL_CHECKERS.length; i++) {
            if (ALL_CHECKERS[i].equals(url)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Creates a transaction for one request/response pair.
     *
     * @param request
     *            the request whose parameters drive the transaction
     * @param response
     *            the response the page is written to
     */
    VerifierServletTransaction(HttpServletRequest request,
            HttpServletResponse response) {
        this.request = request;
        this.response = response;
        this.iriFactory = IRIFactory.iriImplementation();
    }

    /**
     * @return <code>true</code> when a document URL is set, i.e. when
     *         {@link #validate()} will actually do something
     */
    protected boolean willValidate() {
        return document != null;
    }

    /**
     * Serves the request: sets the response headers, reads the parameters and
     * emits the page through <code>PageEmitter</code>.
     *
     * @throws ServletException
     *             if emitting the page fails with a {@link SAXException}
     * @throws IOException
     *             if the response stream cannot be obtained
     */
    void doGet() throws ServletException, IOException {
        response.setContentType("text/html; charset=utf-8");

        this.out = response.getOutputStream();

        request.setCharacterEncoding("utf-8");

        // The document parameter has to be read before the caching headers
        // are chosen: willValidate() looks at it. Reading it afterwards made
        // the no-cache branch unreachable.
        document = scrubUrl(request.getParameter("doc"));
        if ("".equals(document)) {
            document = null;
        }

        if (willValidate()) {
            response.setDateHeader("Expires", 0);
            response.setHeader("Cache-Control", "no-cache");
        } else {
            response.setDateHeader("Last-Modified", lastModified);
        }

        contentHandler = new HtmlSerializer(out, HtmlSerializer.DOCTYPE_HTML5,
                false, "UTF-8");
        emitter = new XhtmlSaxEmitter(contentHandler);

        setup();

        try {
            PageEmitter.emit(contentHandler, this);
        } catch (SAXException e) {
            throw new ServletException(e);
        }
    }

    /**
     * Reads the schema, parser and lax-type settings from the request. A
     * non-empty <code>preset</code> parameter takes precedence over
     * <code>schema</code>; an unrecognized or missing <code>parser</code>
     * value leaves the automatic mode selected.
     *
     * @throws ServletException
     *             not thrown by this implementation
     */
    protected void setup() throws ServletException {
        String preset = request.getParameter("preset");
        if (preset != null && !"".equals(preset)) {
            schemaUrls = preset;
        } else {
            schemaUrls = request.getParameter("schema");
        }
        if (schemaUrls == null) {
            schemaUrls = "";
        }

        String parserStr = request.getParameter("parser");
        if ("html".equals(parserStr)) {
            parser = HTML_PARSER;
        } else if ("xmldtd".equals(parserStr)) {
            parser = EXTERNAL_ENTITIES_NO_VALIDATION;
        } else if ("xml".equals(parserStr)) {
            parser = NO_EXTERNAL_ENTITIES;
        } else if ("html5".equals(parserStr)) {
            parser = HTML_PARSER_5;
        } else if ("html4".equals(parserStr)) {
            parser = HTML_PARSER_4_STRICT;
        } else if ("html4tr".equals(parserStr)) {
            parser = HTML_PARSER_4_TRANSITIONAL;
        } // else auto

        laxType = (request.getParameter("laxtype") != null);
    }

    /**
     * Tells whether the requested schema list is exactly one of the presets
     * and that preset is not one of the schemas accepted for HTML input.
     *
     * @return <code>false</code> when no schema was given, when the schema
     *         list is not a preset, or when it starts with one of
     *         {@link #HTML_SAFE_SCHEMA_PREFIXES}; <code>true</code> otherwise
     */
    private boolean isHtmlUnsafePreset() {
        if ("".equals(schemaUrls)) {
            return false;
        }
        boolean isPreset = false;
        for (int i = 0; i < presetUrls.length; i++) {
            if (presetUrls[i].equals(schemaUrls)) {
                isPreset = true;
                break;
            }
        }
        if (!isPreset) {
            return false;
        }
        for (int i = 0; i < HTML_SAFE_SCHEMA_PREFIXES.length; i++) {
            if (schemaUrls.startsWith(HTML_SAFE_SCHEMA_PREFIXES[i])) {
                return false;
            }
        }
        return true;
    }

    /**
     * Validates the document, if there is one, and emits the messages, the
     * success/failure paragraph and (unless an I/O or internal error
     * occurred) the statistics.
     *
     * @throws SAXException
     *             if flushing the output or emitting markup fails
     */
    void validate() throws SAXException {
        if (!willValidate()) {
            return;
        }
        try {
            out.flush();
        } catch (IOException e1) {
            throw new SAXException(e1);
        }
        errorHandler = new XhtmlEmittingErrorHandler(contentHandler);
        // NOTE(review): 600 * 1024 is presumably a size limit in bytes;
        // confirm against PrudentHttpEntityResolver.
        httpRes = new PrudentHttpEntityResolver(600 * 1024, laxType,
                errorHandler);
        entityResolver = new LocalCacheEntityResolver(pathMap, httpRes);
        // RNC is allowed only while the schemas are being loaded.
        httpRes.setAllowRnc(true);
        entityResolver.setAllowRnc(true);
        boolean isValid = false;
        boolean stats = true;
        try {
            this.errorHandler.start();
            PropertyMapBuilder pmb = new PropertyMapBuilder();
            pmb.put(ValidateProperty.ERROR_HANDLER, errorHandler);
            pmb.put(ValidateProperty.ENTITY_RESOLVER, entityResolver);
            pmb.put(ValidateProperty.XML_READER_CREATOR,
                    new VerifierServletXMLReaderCreator(errorHandler,
                            entityResolver));
            RngProperty.CHECK_ID_IDREF.add(pmb);
            jingPropertyMap = pmb.toPropertyMap();

            tryToSetupValidator();

            httpRes.setAllowRnc(false);
            entityResolver.setAllowRnc(false);

            loadDocAndSetupParser();

            reader.setErrorHandler(errorHandler);
            contentType = documentInput.getType();
            if (validator == null) {
                checkNormalization = true;
            }
            if (checkNormalization) {
                reader.setFeature(NFC_CHECKER_URL, true);
            }
            reader.parse(documentInput);
            isValid = !errorHandler.isErrors();
        } catch (SAXException e) {
            log4j.debug("SAXException", e);
        } catch (IOException e) {
            stats = false;
            log4j.info("IOException", e);
            errorHandler.ioError(e);
        } catch (IncorrectSchemaException e) {
            log4j.debug("IncorrectSchemaException", e);
            errorHandler.schemaError(e);
        } catch (RuntimeException e) {
            stats = false;
            log4j.error("RuntimeException, doc: " + document + " schema: "
                    + schemaUrls + " lax: " + laxType, e);
            errorHandler.internalError(e, INTERNAL_ERROR_MESSAGE);
        } catch (Error e) {
            stats = false;
            log4j.error("Error, doc: " + document + " schema: " + schemaUrls
                    + " lax: " + laxType, e);
            errorHandler.internalError(e, INTERNAL_ERROR_MESSAGE);
        } finally {
            errorHandler.end();
        }

        attrs.clear();
        attrs.addAttribute("class", isValid ? "success" : "failure");
        emitter.startElement("p", attrs);
        if (isValid) {
            emitSuccess();
        } else {
            emitFailure();
        }
        emitter.endElement("p");

        if (stats) {
            StatsEmitter.emit(contentHandler, this);
        }
    }

    /**
     * Emits the text of the success paragraph.
     *
     * @throws SAXException
     *             if the emitter fails
     */
    protected void emitSuccess() throws SAXException {
        emitter.characters(SUCCESS);
    }

    /**
     * Emits the text of the failure paragraph.
     *
     * @throws SAXException
     *             if the emitter fails
     */
    protected void emitFailure() throws SAXException {
        emitter.characters(FAILURE);
    }

    /**
     * Builds {@link #validator} from the requested schema list; the result is
     * <code>null</code> when the list is empty.
     *
     * @throws SAXException
     *             if a schema cannot be parsed
     * @throws IOException
     *             if a schema cannot be read
     * @throws IncorrectSchemaException
     *             if a schema is not a correct schema
     */
    protected void tryToSetupValidator() throws SAXException, IOException,
            IncorrectSchemaException {
        validator = validatorByUrls(schemaUrls);
    }

    /**
     * Resolves the document and prepares {@link #reader} according to the
     * selected parser mode. In the automatic mode the parser is chosen from
     * the content type reported for the resolved document.
     *
     * @throws SAXException
     *             if the chosen preset is not appropriate for HTML, or if
     *             resolving the document fails
     * @throws IOException
     *             if the document cannot be read
     * @throws IncorrectSchemaException
     *             if a schema picked by doctype is not a correct schema
     * @throws SAXNotRecognizedException
     *             if the XML parser rejects a feature name
     * @throws SAXNotSupportedException
     *             if the XML parser rejects a feature value
     */
    protected void loadDocAndSetupParser() throws SAXException, IOException,
            IncorrectSchemaException, SAXNotRecognizedException,
            SAXNotSupportedException {
        switch (parser) {
            case HTML_PARSER:
            case HTML_PARSER_5:
            case HTML_PARSER_4_STRICT:
            case HTML_PARSER_4_TRANSITIONAL:
                if (isHtmlUnsafePreset()) {
                    String message = "The chosen preset schema is not appropriate for HTML.";
                    SAXException se = new SAXException(message);
                    errorHandler.schemaError(se);
                    throw se;
                }
                configureAcceptedTypes(false, true);
                documentInput = (TypedInputSource) entityResolver.resolveEntity(
                        null, document);
                // The parser code doubles as the doctype mode (see the
                // constants above).
                installHtmlParser(parser);
                if (validator == null) {
                    // can still be null afterwards
                    validator = validatorByDoctype(parser);
                }
                if (validator != null) {
                    reader.setContentHandler(validator.getContentHandler());
                }
                break;
            case NO_EXTERNAL_ENTITIES:
            case EXTERNAL_ENTITIES_NO_VALIDATION:
                configureAcceptedTypes(true, false);
                documentInput = (TypedInputSource) entityResolver.resolveEntity(
                        null, document);
                reader = setupXmlParser();
                break;
            default:
                configureAcceptedTypes(true, true);
                documentInput = (TypedInputSource) entityResolver.resolveEntity(
                        null, document);
                if ("text/html".equals(documentInput.getType())) {
                    if (isHtmlUnsafePreset()) {
                        String message = "The Content-Type was \u201Ctext/html\u201D, but the chosen preset schema is not appropriate for HTML.";
                        SAXException se = new SAXException(message);
                        errorHandler.schemaError(se);
                        throw se;
                    }
                    errorHandler.info("The Content-Type was \u201Ctext/html\u201D. Using the HTML parser.");
                    installHtmlParser(DoctypeHandler.ANY_DOCTYPE);
                    if (validator != null) {
                        reader.setContentHandler(validator.getContentHandler());
                    }
                } else {
                    errorHandler.info("The Content-Type was \u201C"
                            + documentInput.getType()
                            + "\u201D. Using the XML parser (not resolving external entities).");
                    reader = setupXmlParser();
                }
                break;
        }
    }

    /**
     * Tells the HTTP resolver which kinds of document to accept.
     *
     * @param allowXml
     *            value for the generic XML, known XML types and XHTML switches
     * @param allowHtml
     *            value for the HTML switch
     */
    private void configureAcceptedTypes(boolean allowXml, boolean allowHtml) {
        httpRes.setAllowGenericXml(allowXml);
        httpRes.setAllowHtml(allowHtml);
        httpRes.setAcceptAllKnownXmlTypes(allowXml);
        httpRes.setAllowXhtml(allowXml);
    }

    /**
     * Creates the HTML parser, registers this object as its doctype handler
     * and makes it the current {@link #reader}.
     *
     * @param doctypeMode
     *            one of the <code>DoctypeHandler</code> doctype constants
     */
    private void installHtmlParser(int doctypeMode) {
        htmlParser = new HtmlParser();
        htmlParser.setDoctypeMode(doctypeMode);
        htmlParser.setDoctypeHandler(this);
        reader = htmlParser;
    }

    /**
     * Looks up the preset registered for a doctype and builds its validator.
     *
     * @param doctype
     *            one of the <code>DoctypeHandler</code> doctype constants
     * @return the validator for the matching preset, or <code>null</code> for
     *         <code>ANY_DOCTYPE</code>
     * @throws SAXException
     *             if a schema cannot be parsed
     * @throws IOException
     *             if a schema cannot be read
     * @throws IncorrectSchemaException
     *             if a schema is not a correct schema
     * @throws RuntimeException
     *             if no preset is registered for the doctype
     */
    protected Validator validatorByDoctype(int doctype) throws SAXException,
            IOException, IncorrectSchemaException {
        if (doctype == DoctypeHandler.ANY_DOCTYPE) {
            return null;
        }
        for (int i = 0; i < presetDoctypes.length; i++) {
            if (presetDoctypes[i] == doctype) {
                return validatorByUrls(presetUrls[i]);
            }
        }
        throw new RuntimeException("Doctype mappings not initialized properly.");
    }

    /**
     * Creates the XML parser chain. External entities are loaded only in the
     * <code>EXTERNAL_ENTITIES_NO_VALIDATION</code> mode. When no validator is
     * known yet, events are buffered by a root namespace sniffer so that
     * {@link #rootNamespace(String, Locator)} can pick a preset.
     *
     * @return the configured reader
     * @throws SAXNotRecognizedException
     *             if a feature name is not recognized
     * @throws SAXNotSupportedException
     *             if a feature value is not supported
     */
    protected XMLReader setupXmlParser() throws SAXNotRecognizedException,
            SAXNotSupportedException {
        boolean loadExternal = (parser == EXTERNAL_ENTITIES_NO_VALIDATION);

        XMLReader xmlReader = new XhtmlIdFilter(new XMLIdFilter(
                new SAXDriver()));
        xmlReader.setFeature(
                "http://xml.org/sax/features/external-general-entities",
                loadExternal);
        xmlReader.setFeature(
                "http://xml.org/sax/features/external-parameter-entities",
                loadExternal);
        if (loadExternal) {
            xmlReader.setEntityResolver(entityResolver);
        } else {
            xmlReader.setEntityResolver(new NullEntityResolver());
        }
        if (validator == null) {
            bufferingRootNamespaceSniffer = new BufferingRootNamespaceSniffer(
                    this);
            xmlReader.setContentHandler(bufferingRootNamespaceSniffer);
        } else {
            xmlReader.setContentHandler(new RootNamespaceSniffer(this,
                    validator.getContentHandler()));
            xmlReader.setDTDHandler(validator.getDTDHandler());
        }
        return xmlReader;
    }

    /**
     * Builds one combined validator from a whitespace-separated list of
     * schema and checker URLs. The list is processed from last to first, and
     * the two group URLs expand to their member checkers.
     *
     * @param schemaList
     *            whitespace-separated URLs; may be empty
     * @return the combined validator, or <code>null</code> if the list named
     *         nothing new
     * @throws SAXException
     *             if a schema cannot be parsed
     * @throws IOException
     *             if a schema cannot be read
     * @throws IncorrectSchemaException
     *             if a schema is not a correct schema
     */
    private Validator validatorByUrls(String schemaList) throws SAXException,
            IOException, IncorrectSchemaException {
        Validator combined = null;
        String[] schemas = SPACE.split(schemaList);
        for (int i = schemas.length - 1; i > -1; i--) {
            String url = schemas[i];
            if (CHECKERS_ALL_URL.equals(url)) {
                for (int j = 0; j < ALL_CHECKERS.length; j++) {
                    combined = combineValidatorByUrl(combined, ALL_CHECKERS[j]);
                }
            } else if (CHECKERS_ALL_HTML4_URL.equals(url)) {
                for (int j = 0; j < ALL_CHECKERS_HTML4.length; j++) {
                    combined = combineValidatorByUrl(combined,
                            ALL_CHECKERS_HTML4[j]);
                }
            } else {
                combined = combineValidatorByUrl(combined, url);
            }
        }
        return combined;
    }

    /**
     * Adds the validator for one URL in front of an existing validator chain.
     *
     * @param validator
     *            the chain built so far; may be <code>null</code>
     * @param url
     *            the schema or checker URL; the empty string is ignored
     * @return the extended chain, or <code>validator</code> unchanged when
     *         the URL is empty or was already loaded in this transaction
     * @throws SAXException
     *             if the schema cannot be parsed
     * @throws IOException
     *             if the schema cannot be read
     * @throws IncorrectSchemaException
     *             if the schema is not a correct schema
     */
    private Validator combineValidatorByUrl(Validator validator, String url)
            throws SAXException, IOException, IncorrectSchemaException {
        if ("".equals(url)) {
            return validator;
        }
        Validator v = validatorByUrl(url);
        if (v == null) {
            // Already loaded: validatorByUrl() answers null for duplicates,
            // and a null must not be wrapped into a CombineValidator.
            return validator;
        }
        if (validator == null) {
            return v;
        }
        return new CombineValidator(v, validator);
    }

    /**
     * Creates the validator for a single URL: one of the built-in checkers
     * for the checker pseudo-URLs, otherwise a validator for the schema at
     * that URL.
     *
     * @param url
     *            the schema or checker URL
     * @return the validator, or <code>null</code> if this URL has already
     *         been handled during this transaction
     * @throws SAXException
     *             if the schema cannot be parsed
     * @throws IOException
     *             if the schema cannot be read
     * @throws IncorrectSchemaException
     *             if the schema is not a correct schema
     */
    private Validator validatorByUrl(String url) throws SAXException,
            IOException, IncorrectSchemaException {
        if (loadedValidatorUrls.contains(url)) {
            return null;
        }
        loadedValidatorUrls.add(url);
        if (TABLE_CHECKER_URL.equals(url)) {
            return new CheckerValidator(new TableChecker(), jingPropertyMap);
        } else if (NFC_CHECKER_URL.equals(url)) {
            // validate() turns this flag into a reader feature.
            this.checkNormalization = true;
            return new CheckerValidator(new NormalizationChecker(),
                    jingPropertyMap);
        } else if (SIGNIFICANT_INLINE_CHECKER_URL.equals(url)) {
            return new CheckerValidator(new SignificantInlineChecker(),
                    jingPropertyMap);
        } else if (DEBUG_CHECKER_URL.equals(url)) {
            return new CheckerValidator(new DebugChecker(), jingPropertyMap);
        } else if (TEXT_CONTENT_CHECKER_URL.equals(url)) {
            return new CheckerValidator(new TextContentChecker(),
                    jingPropertyMap);
        }
        return schemaByUrl(url).createValidator(jingPropertyMap);
    }

    /**
     * Returns the schema for a URL, using the schemas parsed at class
     * initialization when possible and loading it through this transaction's
     * entity resolver otherwise.
     *
     * @param url
     *            the schema URL
     * @return the schema
     * @throws SAXException
     *             if the schema cannot be parsed
     * @throws IOException
     *             if the schema cannot be read
     * @throws IncorrectSchemaException
     *             if the schema is not a correct schema
     */
    private Schema schemaByUrl(String url) throws SAXException, IOException,
            IncorrectSchemaException {
        int i = Arrays.binarySearch(preloadedSchemaUrls, url);
        if (i > -1) {
            return preloadedSchemas[i];
        }
        return schemaByUrl(url, entityResolver, jingPropertyMap);
    }

    /**
     * Loads a schema through the given resolver. The compact syntax reader is
     * used when the resolved input reports the type
     * <code>application/relax-ng-compact-syntax</code>; otherwise the schema
     * language is auto-detected.
     *
     * @param url
     *            the schema URL
     * @param resolver
     *            resolver expected to return a <code>TypedInputSource</code>
     * @param pMap
     *            properties handed to the schema reader
     * @return the schema
     * @throws SAXException
     *             if the schema cannot be parsed
     * @throws IOException
     *             if the schema cannot be read
     * @throws IncorrectSchemaException
     *             if the schema is not a correct schema
     */
    private static Schema schemaByUrl(String url, EntityResolver resolver,
            PropertyMap pMap) throws SAXException, IOException,
            IncorrectSchemaException {
        log4j.debug("Will load schema: " + url);
        TypedInputSource schemaInput = (TypedInputSource) resolver.resolveEntity(
                null, url);
        SchemaReader sr;
        if ("application/relax-ng-compact-syntax".equals(schemaInput.getType())) {
            sr = CompactSchemaReader.getInstance();
        } else {
            sr = new AutoSchemaReader();
        }
        return sr.createSchema(schemaInput, pMap);
    }

    /**
     * Emits the page title: the results title followed by the document URL
     * when validating, the service title otherwise.
     *
     * @param markupAllowed
     *            whether the version label may be emitted (it is wrapped in
     *            a <code>span</code> element)
     * @throws SAXException
     *             if the emitter fails
     */
    void emitTitle(boolean markupAllowed) throws SAXException {
        if (willValidate()) {
            emitter.characters(RESULTS_TITLE);
            emitter.characters(scrub(document));
        } else {
            emitter.characters(SERVICE_TITLE);
            if (markupAllowed) {
                emitter.startElement("span");
                emitter.characters(TWO_POINT_OH_BETA);
                emitter.endElement("span");
            }
        }
    }

    /**
     * Emits the <code>form</code> element, pointing back at the request URL.
     *
     * @throws SAXException
     *             if the emitter fails
     */
    void emitForm() throws SAXException {
        attrs.clear();
        attrs.addAttribute("method", "get");
        attrs.addAttribute("action", request.getRequestURL().toString());
        attrs.addAttribute("onsubmit", "formSubmission()");
        emitter.startElement("form", attrs);
        emitFormContent();
        emitter.endElement("form");
    }

    /**
     * Emits the content of the form through <code>FormEmitter</code>.
     *
     * @throws SAXException
     *             if the emitter fails
     */
    protected void emitFormContent() throws SAXException {
        FormEmitter.emit(contentHandler, this);
    }

    /**
     * Emits the schema input field, pre-filled with the current schema list.
     *
     * @throws SAXException
     *             if the emitter fails
     */
    void emitSchemaField() throws SAXException {
        attrs.clear();
        attrs.addAttribute("name", "schema");
        attrs.addAttribute("id", "schema");
        attrs.addAttribute("onchange", "schemaChanged();");
        attrs.addAttribute("pattern", "(?:https?://.+(?:\\s+https?://.+)*)?");
        attrs.addAttribute(
                "title",
                "The schema field takes zero or more space-separated absolute IRIs (http or https only) of the schemas that the document is to be validated against. (When left blank, the service will attempt to pick schemas automatically.)");
        if (schemaUrls != null) {
            attrs.addAttribute("value", scrub(schemaUrls));
        }
        emitter.startElement("input", attrs);
        emitter.endElement("input");
    }

    /**
     * Emits the document input field, pre-filled with the current document
     * URL if there is one.
     *
     * @throws SAXException
     *             if the emitter fails
     */
    void emitDocField() throws SAXException {
        attrs.clear();
        attrs.addAttribute("type", "url");
        attrs.addAttribute("name", "doc");
        attrs.addAttribute("id", "doc");
        attrs.addAttribute("pattern", "(?:https?://.+)?");
        attrs.addAttribute(
                "title",
                "The document field takes the absolute IRI (http or https only) of the document to be checked. (The document field can also be left blank in order to bookmark settings.)");
        if (document != null) {
            attrs.addAttribute("value", scrub(document));
        }
        emitter.startElement("input", attrs);
        emitter.endElement("input");
    }

    /**
     * Converts a user-supplied IRI to its ASCII form.
     *
     * @param urlStr
     *            the IRI as received; may be <code>null</code>
     * @return the ASCII form, or <code>null</code> if the input was
     *         <code>null</code> or could not be converted
     */
    private String scrubUrl(String urlStr) {
        if (urlStr == null) {
            return null;
        }

        try {
            IRI iri = iriFactory.construct(urlStr);
            return iri.toASCIIString();
        } catch (IRIException e) {
            return null;
        } catch (MalformedURLException e) {
            return null;
        }
    }

    /**
     * Emits nothing in this implementation.
     *
     * @throws SAXException
     *             never thrown here
     */
    void emitSchemaDuration() throws SAXException {
    }

    /**
     * Emits nothing in this implementation.
     *
     * @throws SAXException
     *             never thrown here
     */
    void emitDocDuration() throws SAXException {
    }

    /**
     * Emits the number of milliseconds elapsed since this transaction object
     * was created.
     *
     * @throws SAXException
     *             if the emitter fails
     */
    void emitTotalDuration() throws SAXException {
        emitter.characters("" + (System.currentTimeMillis() - start));
    }

    /**
     * Emits one unselected option per configured preset.
     *
     * @throws SAXException
     *             if the emitter fails
     */
    void emitPresetOptions() throws SAXException {
        for (int i = 0; i < presetUrls.length; i++) {
            emitter.option(presetLabels[i], presetUrls[i], false);
        }
    }

    /**
     * Emits the parser choices, marking the current one as selected.
     *
     * @throws SAXException
     *             if the emitter fails
     */
    void emitParserOptions() throws SAXException {
        emitter.option("Automatically from Content-Type", "",
                (parser == AUTOMATIC_PARSER));
        emitter.option("XML; don\u2019t load external entities", "xml",
                (parser == NO_EXTERNAL_ENTITIES));
        emitter.option("XML; load external entities", "xmldtd",
                (parser == EXTERNAL_ENTITIES_NO_VALIDATION));
        emitter.option("HTML; flavor from doctype", "html",
                (parser == HTML_PARSER));
        emitter.option("HTML5", "html5", (parser == HTML_PARSER_5));
        emitter.option("HTML 4.01 Strict", "html4",
                (parser == HTML_PARSER_4_STRICT));
        emitter.option("HTML 4.01 Transitional", "html4tr",
                (parser == HTML_PARSER_4_TRANSITIONAL));
    }

    /**
     * Emits the lax content type checkbox in its current state.
     *
     * @throws SAXException
     *             if the emitter fails
     */
    void emitLaxTypeField() throws SAXException {
        emitter.checkbox("laxtype", "yes", laxType);
    }

    /**
     * Callback from the root namespace sniffers. When no validator has been
     * chosen yet, the preset registered for the namespace is loaded and
     * attached to the buffering sniffer. On the first call, a warning is
     * reported if the document's content type is a known one whose expected
     * namespace differs from the actual root namespace.
     *
     * @param namespace
     *            the namespace of the root element
     * @param locator
     *            location used for the content type warning
     * @throws SAXException
     *             if no preset is registered for the namespace, or if
     *             reporting fails
     */
    void rootNamespace(String namespace, Locator locator) throws SAXException {
        if (validator == null) {
            int index = -1;
            for (int i = 0; i < presetNamespaces.length; i++) {
                if (namespace.equals(presetNamespaces[i])) {
                    index = i;
                    break;
                }
            }
            if (index == -1) {
                String message = "Cannot find preset schema for namespace: \u201C"
                        + namespace + "\u201D.";
                SAXException se = new SAXException(message);
                errorHandler.schemaError(se);
                throw se;
            }
            String label = presetLabels[index];
            String urls = presetUrls[index];
            errorHandler.info("Using the preset for " + label
                    + " based on the root namespace.");
            try {
                validator = validatorByUrls(urls);
            } catch (IOException ioe) {
                // At this point the schema comes from memory.
                throw new RuntimeException(ioe);
            } catch (IncorrectSchemaException e) {
                // At this point the schema comes from memory.
                throw new RuntimeException(e);
            }
            if (bufferingRootNamespaceSniffer == null) {
                throw new RuntimeException(
                        "Bug! bufferingRootNamespaceSniffer was null.");
            }
            bufferingRootNamespaceSniffer.setContentHandler(validator.getContentHandler());
        }

        if (rootNamespaceSeen) {
            return;
        }
        rootNamespaceSeen = true;
        if (contentType == null) {
            return;
        }
        int i = Arrays.binarySearch(KNOWN_CONTENT_TYPES, contentType);
        if (i > -1 && !NAMESPACES_FOR_KNOWN_CONTENT_TYPES[i].equals(namespace)) {
            String message = "\u201C"
                    + contentType
                    + "\u201D is not an appropriate Content-Type for a document whose root namespace is \u201C"
                    + namespace + "\u201D.";
            SAXParseException spe = new SAXParseException(message, locator);
            errorHandler.warning(spe);
        }
    }

    /**
     * Callback from the HTML parser reporting the doctype it saw. When no
     * validator has been chosen yet, the preset registered for the doctype is
     * loaded, attached to the parser, and the parser is asked to re-fire its
     * start. An informational message is reported for the three known
     * doctypes.
     *
     * @param doctype
     *            one of the <code>DoctypeHandler</code> doctype constants
     * @throws SAXException
     *             if a schema cannot be parsed or reporting fails
     */
    public void doctype(int doctype) throws SAXException {
        if (validator == null) {
            try {
                validator = validatorByDoctype(doctype);
            } catch (IOException ioe) {
                // At this point the schema comes from memory.
                throw new RuntimeException(ioe);
            } catch (IncorrectSchemaException e) {
                // At this point the schema comes from memory.
                throw new RuntimeException(e);
            }
            switch (doctype) {
                case DoctypeHandler.DOCTYPE_HTML5:
                    errorHandler.info("HTML5 doctype seen. Running the HTML parser in the HTML5 mode and using the preset for "
                            + schemaLabelFromDoctype(doctype) + ".");
                    break;
                case DoctypeHandler.DOCTYPE_HTML401_STRICT:
                    errorHandler.info("HTML 4.01 Strict doctype seen. Running the HTML parser in the HTML 4.01 mode and using the preset for "
                            + schemaLabelFromDoctype(doctype) + ".");
                    break;
                case DoctypeHandler.DOCTYPE_HTML401_TRANSITIONAL:
                    errorHandler.info("HTML 4.01 Transitional doctype seen. Running the HTML parser in the HTML 4.01 mode and using the preset for "
                            + schemaLabelFromDoctype(doctype) + ".");
                    break;
            }
            htmlParser.setContentHandler(validator.getContentHandler());
            htmlParser.refireStart();
        } else {
            switch (doctype) {
                case DoctypeHandler.DOCTYPE_HTML5:
                    errorHandler.info("HTML5 doctype seen. Running the HTML parser in the HTML5 mode.");
                    break;
                case DoctypeHandler.DOCTYPE_HTML401_STRICT:
                    errorHandler.info("HTML 4.01 Strict doctype seen. Running the HTML parser in the HTML 4.01 mode.");
                    break;
                case DoctypeHandler.DOCTYPE_HTML401_TRANSITIONAL:
                    errorHandler.info("HTML 4.01 Transitional doctype seen. Running the HTML parser in the HTML 4.01 mode.");
                    break;
            }
        }
    }

    /**
     * Finds the label of the preset registered for a doctype.
     *
     * @param doctype
     *            one of the <code>DoctypeHandler</code> doctype constants
     * @return the preset label
     * @throws RuntimeException
     *             if no preset is registered for the doctype
     */
    private String schemaLabelFromDoctype(int doctype) {
        for (int i = 0; i < presetDoctypes.length; i++) {
            if (doctype == presetDoctypes[i]) {
                return presetLabels[i];
            }
        }
        throw new RuntimeException("Bug: Bad magic number.");
    }
}