/*
 * Copyright (c) 2005 Henri Sivonen
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

package fi.iki.hsivonen.xml;

import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HostConfiguration;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
import org.apache.log4j.Logger;
import org.xml.sax.EntityResolver;
import org.xml.sax.ErrorHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;

import com.hp.hpl.jena.iri.IRI;
import com.hp.hpl.jena.iri.IRIException;
import com.hp.hpl.jena.iri.IRIFactory;

import fi.iki.hsivonen.io.BoundednputStream;
import fi.iki.hsivonen.io.ObservableInputStream;
import fi.iki.hsivonen.io.StreamObserver;

/**
 * An {@link EntityResolver} that fetches entities over HTTP(S) while applying
 * a number of safety checks: a per-resolver cap on the number of requests, a
 * size limit on the response, a restriction to the <code>http</code> and
 * <code>https</code> schemes, a refusal to contact the local host, and a
 * check of the response <code>Content-Type</code> against the kinds of
 * resources the caller has enabled.
 *
 * <p>
 * The HTTP client and its connection manager are shared by all instances
 * (they are static fields); the request budget, size limit and content type
 * switches are per instance.
 *
 * @version $Id: PrudentHttpEntityResolver.java,v 1.1 2005/01/08 08:11:26
 *          hsivonen Exp $
 * @author hsivonen
 */
public class PrudentHttpEntityResolver implements EntityResolver {

    private static final Logger log4j = Logger.getLogger(PrudentHttpEntityResolver.class);

    /**
     * Matches a single <code>charset=...</code> media type parameter. The
     * parameter name is matched case-insensitively; a quoted value is
     * captured together with its quotes and unquoted by
     * {@link #findCharset(String[])}.
     */
    private static final Pattern CHARSET = Pattern.compile(
            "^\\s*charset\\s*=\\s*(\\S+)\\s*$", Pattern.CASE_INSENSITIVE);

    private static final MultiThreadedHttpConnectionManager manager = new MultiThreadedHttpConnectionManager();

    private static final HttpClient client = new HttpClient(manager);

    /**
     * Request budget copied into each new resolver. It keeps the Java default
     * of 0 until {@link #setParams(int, int, int)} is called, and a budget of
     * 0 makes every {@link #resolveEntity(String, String)} call fail.
     */
    private static int maxRequests;

    private final int sizeLimit;

    private final ErrorHandler errorHandler;

    private int requestsLeft;

    private final boolean laxContentType;

    private boolean allowRnc = false;

    private boolean allowHtml = false;

    private boolean allowXhtml = false;

    private boolean acceptAllKnownXmlTypes = false;

    private boolean allowGenericXml = true;

    private final IRIFactory iriFactory;

    /**
     * Sets the timeouts and connection limits of the shared HTTP client and
     * the request budget handed to resolvers constructed afterwards.
     *
     * @param connectionTimeout
     *            timeout until connection established in milliseconds. Zero
     *            means no timeout.
     * @param socketTimeout
     *            timeout for waiting for data in milliseconds. Zero means no
     *            timeout.
     * @param maxRequests
     *            the maximum number of connections per host (the total
     *            connection limit is set to twice this value) and the number
     *            of requests each subsequently constructed resolver may
     *            perform. A value below zero disables the per-resolver
     *            request limit.
     */
    public static void setParams(int connectionTimeout, int socketTimeout,
            int maxRequests) {
        HttpConnectionManagerParams hcmp = client.getHttpConnectionManager().getParams();
        hcmp.setConnectionTimeout(connectionTimeout);
        hcmp.setSoTimeout(socketTimeout);
        hcmp.setMaxConnectionsPerHost(HostConfiguration.ANY_HOST_CONFIGURATION,
                maxRequests);
        hcmp.setMaxTotalConnections(maxRequests * 2);
        PrudentHttpEntityResolver.maxRequests = maxRequests;
    }

    /**
     * Sets the <code>http.useragent</code> parameter of the shared HTTP
     * client.
     *
     * @param ua
     *            the user agent string
     */
    public static void setUserAgent(String ua) {
        client.getParams().setParameter("http.useragent", ua);
    }

    /**
     * Creates a resolver whose request budget is the value most recently
     * passed to {@link #setParams(int, int, int)}.
     *
     * @param sizeLimit
     *            maximum accepted resource size in bytes; a value below zero
     *            disables the size checks
     * @param laxContentType
     *            whether certain mismatching content types are accepted with
     *            a warning instead of being rejected
     * @param errorHandler
     *            receives warnings and fatal errors; may be
     *            <code>null</code>, in which case fatal errors are only
     *            thrown and warnings are dropped
     */
    public PrudentHttpEntityResolver(int sizeLimit, boolean laxContentType,
            ErrorHandler errorHandler) {
        this.sizeLimit = sizeLimit;
        this.requestsLeft = maxRequests;
        this.laxContentType = laxContentType;
        this.errorHandler = errorHandler;
        this.iriFactory = new IRIFactory();
        this.iriFactory.useSpecificationXMLSystemID(true);
        this.iriFactory.useSchemeSpecificRules("http", true);
        this.iriFactory.useSchemeSpecificRules("https", true);
    }

    /**
     * Fetches the entity identified by <code>systemId</code> with an HTTP GET
     * request and returns it as a {@link TypedInputSource}.
     *
     * <p>
     * On success the HTTP connection stays open and is released when the
     * returned byte stream reports a close, an exception or finalization to
     * its observer. On any failure the request is aborted and the connection
     * released before the exception propagates.
     *
     * @param publicId
     *            the public identifier, used only for error reporting and
     *            copied to the returned input source
     * @param systemId
     *            the IRI of the resource to fetch
     * @return the input source for the response body
     * @throws SAXException
     *             (a {@link SAXParseException}) if the IRI is invalid, not
     *             absolute, not http(s) or points at the local host; if the
     *             response status is not 200; if the declared content length
     *             exceeds the size limit; or if the content type is not
     *             acceptable. The error handler, if any, is notified first
     *             and may throw its own exception instead.
     * @throws IOException
     *             if the request budget is exhausted or the HTTP exchange
     *             fails
     * @see org.xml.sax.EntityResolver#resolveEntity(java.lang.String,
     *      java.lang.String)
     */
    public InputSource resolveEntity(String publicId, String systemId)
            throws SAXException, IOException {
        // A negative budget means "unlimited"; otherwise spend one request.
        if (requestsLeft > -1) {
            if (requestsLeft == 0) {
                throw new IOException(
                        "Number of permitted HTTP requests exceeded.");
            }
            requestsLeft--;
        }
        GetMethod m = null;
        // Becomes true only once the stream observer owns the connection.
        boolean handedOff = false;
        try {
            systemId = toCheckedAsciiUri(publicId, systemId);
            try {
                m = new GetMethod(systemId);
            } catch (IllegalArgumentException e) {
                throw fatal(e.getMessage(), publicId, systemId, wrap(e));
            }
            m.setFollowRedirects(true);
            m.getParams().setCookiePolicy(CookiePolicy.IGNORE_COOKIES);
            m.addRequestHeader("Accept", buildAccept());
            log4j.info(systemId);
            client.executeMethod(m);
            if (m.getStatusCode() != 200) {
                throw fatal("HTTP resource not retrievable.", publicId,
                        m.getURI().toString(), new IOException());
            }
            long len = m.getResponseContentLength();
            if (sizeLimit > -1 && len > sizeLimit) {
                throw fatal("Resource size exceeds limit.", publicId,
                        m.getURI().toString(), new IOException());
            }
            TypedInputSource is = new TypedInputSource();
            is.setPublicId(publicId);
            is.setSystemId(m.getURI().toString());
            Header ct = m.getResponseHeader("Content-Type");
            if (ct != null) {
                applyContentType(ct.getValue(), publicId,
                        m.getURI().toString(), is);
            }
            InputStream stream = m.getResponseBodyAsStream();
            if (sizeLimit > -1) {
                // The declared length was checked above; this bounds the
                // bytes actually read.
                stream = new BoundednputStream(stream, sizeLimit);
            }
            is.setByteStream(new ObservableInputStream(stream,
                    new ConnectionReleasingObserver(m)));
            handedOff = true;
            return is;
        } finally {
            if (!handedOff) {
                abortAndRelease(m, log4j, "abort", "releaseConnection");
            }
        }
    }

    /**
     * Validates the system identifier and converts it to its ASCII form.
     *
     * @return the ASCII string form of the IRI
     * @throws SAXException
     *             if the IRI cannot be parsed, is not absolute, does not use
     *             the http or https scheme, names the local host, or cannot
     *             be converted to ASCII
     */
    private String toCheckedAsciiUri(String publicId, String systemId)
            throws SAXException, IOException {
        IRI iri;
        try {
            iri = iriFactory.construct(systemId);
        } catch (IRIException e) {
            throw fatal(e.getMessage(), publicId, systemId, wrap(e));
        }
        if (!iri.isAbsolute()) {
            throw fatal("Not an absolute URI.", publicId, systemId,
                    new IOException());
        }
        String scheme = iri.getScheme();
        if (!("http".equals(scheme) || "https".equals(scheme))) {
            throw fatal("Unsupported URI scheme: " + scheme, publicId,
                    systemId, new IOException());
        }
        if (isLocalHost(iri.getHost())) {
            throw fatal("Attempted to connect to localhost.", publicId,
                    systemId, new IOException());
        }
        try {
            return iri.toASCIIString();
        } catch (MalformedURLException e) {
            throw fatal(e.getMessage(), publicId, systemId, wrap(e));
        }
    }

    /**
     * Tells whether the host component names the local machine by one of its
     * well-known literal forms.
     *
     * <p>
     * NOTE(review): this is a literal comparison only. It does not resolve
     * names, does not cover other loopback addresses, and is applied to the
     * initial URI only although redirects are followed by the HTTP client.
     * Whether the IRI library reports an IPv6 host with or without brackets
     * is not visible here, so both forms are checked.
     */
    private static boolean isLocalHost(String host) {
        if (host == null) {
            return false;
        }
        return "127.0.0.1".equals(host) || "localhost".equalsIgnoreCase(host)
                || "::1".equals(host) || "[::1]".equals(host);
    }

    /**
     * Checks the <code>Content-Type</code> header value against the enabled
     * resource kinds and records the resulting type and encoding on the
     * input source.
     *
     * @param headerValue
     *            the raw header value; <code>null</code> or a value without
     *            a media type is treated as the empty type
     * @param publicId
     *            public identifier for error reporting
     * @param resourceUri
     *            URI of the fetched resource for error reporting
     * @param is
     *            the input source to annotate
     * @throws SAXException
     *             if the type is not acceptable
     */
    private void applyContentType(String headerValue, String publicId,
            String resourceUri, TypedInputSource is) throws SAXException,
            IOException {
        // String.split drops trailing empty strings, so a value such as ";"
        // yields an empty array; never index params[0] unconditionally.
        String[] params = (headerValue == null) ? new String[0]
                : headerValue.split(";");
        String type = (params.length > 0) ? params[0].trim() : "";
        boolean wasRnc = false;
        boolean wasHtml = false;
        if (isAllowRnc() && rncContentType(type, is)) {
            wasRnc = true;
            is.setType("application/relax-ng-compact-syntax");
        }
        if (!wasRnc) {
            if (isAllowHtml()) {
                if ("text/html".equals(type)) {
                    is.setType(type);
                    wasHtml = true;
                } else if (isOnlyHtmlAllowed()) {
                    if (laxContentType && "text/plain".equals(type)) {
                        is.setType(type);
                        wasHtml = true;
                        warn("Being lax about non-HTML Content-Type: " + type,
                                is);
                    } else {
                        throw fatal("Non-HTML Content-Type: " + type,
                                publicId, resourceUri, new IOException());
                    }
                }
            }
            if (!wasHtml
                    && (isAllowGenericXml() || isAllowXhtml() || isAcceptAllKnownXmlTypes())) {
                if (!xmlContentType(type, is)) {
                    throw fatal("Non-XML Content-Type: " + type, publicId,
                            resourceUri, new IOException());
                }
                is.setType(type);
            }
        }
        String charset = findCharset(params);
        if (charset != null) {
            is.setEncoding(charset);
        } else if (type.startsWith("text/") && !wasHtml) {
            if (laxContentType) {
                warn("text/* type without a charset parameter seen. Would have defaulted to US-ASCII had the lax option not been chosen.",
                        is);
            } else {
                is.setEncoding("US-ASCII");
                warn("text/* type without a charset parameter seen. Defaulting to US-ASCII per section 3.1 of RFC 3023.",
                        is);
            }
        }
    }

    /**
     * Returns the value of the first non-empty <code>charset</code>
     * parameter among <code>params[1..]</code>, with one pair of surrounding
     * double quotes removed, or <code>null</code> if there is none.
     */
    private static String findCharset(String[] params) {
        for (int i = 1; i < params.length; i++) {
            Matcher matcher = CHARSET.matcher(params[i]);
            if (matcher.matches()) {
                String value = matcher.group(1);
                int len = value.length();
                if (len >= 2 && value.charAt(0) == '"'
                        && value.charAt(len - 1) == '"') {
                    value = value.substring(1, len - 1);
                }
                if (value.length() > 0) {
                    return value;
                }
            }
        }
        return null;
    }

    /**
     * Builds a fatal {@link SAXParseException} without line/column
     * information, reports it to the error handler (if any) and returns it
     * so that the caller can <code>throw</code> it.
     *
     * @throws SAXException
     *             if the error handler itself throws
     */
    private SAXParseException fatal(String message, String publicId,
            String systemId, IOException cause) throws SAXException {
        SAXParseException spe = new SAXParseException(message, publicId,
                systemId, -1, -1, cause);
        if (errorHandler != null) {
            errorHandler.fatalError(spe);
        }
        return spe;
    }

    /**
     * Reports a warning located at the given input source to the error
     * handler; does nothing when no error handler is set.
     */
    private void warn(String message, InputSource is) throws SAXException {
        if (errorHandler != null) {
            errorHandler.warning(new SAXParseException(message,
                    is.getPublicId(), is.getSystemId(), -1, -1));
        }
    }

    /**
     * Wraps an exception in an {@link IOException} carrying the same message
     * and the original exception as its cause.
     */
    private static IOException wrap(Exception e) {
        IOException ioe = new IOException(e.getMessage());
        ioe.initCause(e);
        return ioe;
    }

    /**
     * Aborts the request and then releases its connection, logging (at debug
     * level) rather than propagating any exception from either step. Does
     * nothing if <code>method</code> is <code>null</code>.
     */
    private static void abortAndRelease(GetMethod method, Logger logger,
            String abortMessage, String releaseMessage) {
        if (method == null) {
            return;
        }
        try {
            method.abort();
        } catch (Exception e) {
            logger.debug(abortMessage, e);
        } finally {
            try {
                method.releaseConnection();
            } catch (Exception e) {
                logger.debug(releaseMessage, e);
            }
        }
    }

    /**
     * Stream observer that gives the HTTP connection back exactly once: a
     * plain release when the stream is closed, abort followed by release
     * when an exception or finalization is reported.
     */
    private static final class ConnectionReleasingObserver implements
            StreamObserver {

        private static final Logger observerLog = Logger.getLogger("fi.iki.hsivonen.xml.PrudentEntityResolver.StreamObserver");

        private final GetMethod meth;

        private boolean released = false;

        ConnectionReleasingObserver(GetMethod meth) {
            this.meth = meth;
        }

        public void closeCalled() {
            observerLog.debug("closeCalled");
            if (!released) {
                observerLog.debug("closeCalled, not yet released");
                released = true;
                try {
                    meth.releaseConnection();
                } catch (Exception e) {
                    observerLog.debug("closeCalled, releaseConnection", e);
                }
            }
        }

        public void exceptionOccurred(Exception ex) {
            if (!released) {
                released = true;
                abortAndRelease(meth, observerLog,
                        "exceptionOccurred, abort",
                        "exceptionOccurred, releaseConnection");
            }
        }

        public void finalizerCalled() {
            if (!released) {
                released = true;
                abortAndRelease(meth, observerLog, "finalizerCalled, abort",
                        "finalizerCalled, releaseConnection");
            }
        }
    }

    /**
     * Tells whether the media type is acceptable for an XML resource. The
     * generic XML types, any <code>+xml</code> type, the external parsed
     * entity and DTD types and <code>application/octet-stream</code> are
     * accepted. In lax mode <code>text/plain</code>, <code>text/html</code>
     * and <code>text/xsl</code> are also accepted, with a warning.
     *
     * @param type
     *            the media type without parameters
     * @param is
     *            the input source used to locate warnings
     * @return <code>true</code> if the type is acceptable
     * @throws SAXException
     *             if the error handler throws while being warned
     */
    protected boolean xmlContentType(String type, InputSource is)
            throws SAXException {
        if ("application/xhtml-voice+xml".equals(type)) {
            warn("application/xhtml-voice+xml is an obsolete type.", is);
        }
        boolean typeOk = "application/xml".equals(type)
                || "text/xml".equals(type) || type.endsWith("+xml")
                || "application/xml-external-parsed-entity".equals(type)
                || "text/xml-external-parsed-entity".equals(type)
                || "application/xml-dtd".equals(type)
                || "application/octet-stream".equals(type);
        if (!typeOk && laxContentType) {
            boolean laxOk = "text/plain".equals(type)
                    || "text/html".equals(type) || "text/xsl".equals(type);
            if (laxOk) {
                warn("Being lax about non-XML Content-Type: " + type, is);
            }
            return laxOk;
        }
        return typeOk;
    }

    /**
     * Tells whether the media type is acceptable for a RELAX NG compact
     * syntax resource. The registered type is accepted;
     * <code>application/vnd.relax-ng.rnc</code> is accepted with a warning;
     * <code>application/octet-stream</code> is accepted when the system
     * identifier ends with <code>.rnc</code>. In lax mode
     * <code>text/plain</code> with an <code>.rnc</code> system identifier is
     * also accepted, with a warning.
     *
     * @param type
     *            the media type without parameters
     * @param is
     *            the input source whose system identifier is inspected and
     *            which locates warnings
     * @return <code>true</code> if the type is acceptable
     * @throws SAXException
     *             if the error handler throws while being warned
     */
    protected boolean rncContentType(String type, InputSource is)
            throws SAXException {
        // Guard against an input source without a system identifier.
        String sysId = is.getSystemId();
        boolean rncExtension = sysId != null && sysId.endsWith(".rnc");

        boolean typeOk = "application/relax-ng-compact-syntax".equals(type);
        if (!typeOk) {
            typeOk = "application/vnd.relax-ng.rnc".equals(type);
            if (typeOk) {
                warn("application/vnd.relax-ng.rnc is an unregistered type. application/relax-ng-compact-syntax is the registered type.",
                        is);
            }
        }
        if (!typeOk) {
            typeOk = "application/octet-stream".equals(type) && rncExtension;
        }
        if (!typeOk && laxContentType) {
            boolean laxOk = "text/plain".equals(type) && rncExtension;
            if (laxOk) {
                warn("Being lax about non-RNC Content-Type: " + type, is);
            }
            return laxOk;
        }
        return typeOk;
    }

    /**
     * @return Returns the allowRnc.
     */
    public boolean isAllowRnc() {
        return allowRnc;
    }

    /**
     * @param allowRnc
     *            The allowRnc to set.
     */
    public void setAllowRnc(boolean allowRnc) {
        this.allowRnc = allowRnc;
    }

    /**
     * Sets the allowHtml.
     *
     * @param expectHtml
     *            the allowHtml to set
     */
    public void setAllowHtml(boolean expectHtml) {
        this.allowHtml = expectHtml;
    }

    /**
     * Returns the acceptAllKnownXmlTypes.
     *
     * @return the acceptAllKnownXmlTypes
     */
    public boolean isAcceptAllKnownXmlTypes() {
        return acceptAllKnownXmlTypes;
    }

    /**
     * Sets the acceptAllKnownXmlTypes.
     *
     * @param acceptAllKnownXmlTypes
     *            the acceptAllKnownXmlTypes to set
     */
    public void setAcceptAllKnownXmlTypes(boolean acceptAllKnownXmlTypes) {
        this.acceptAllKnownXmlTypes = acceptAllKnownXmlTypes;
    }

    /**
     * Returns the allowGenericXml.
     *
     * @return the allowGenericXml
     */
    public boolean isAllowGenericXml() {
        return allowGenericXml;
    }

    /**
     * Sets the allowGenericXml.
     *
     * @param allowGenericXml
     *            the allowGenericXml to set
     */
    public void setAllowGenericXml(boolean allowGenericXml) {
        this.allowGenericXml = allowGenericXml;
    }

    /**
     * Returns the allowXhtml.
     *
     * @return the allowXhtml
     */
    public boolean isAllowXhtml() {
        return allowXhtml;
    }

    /**
     * Sets the allowXhtml.
     *
     * @param allowXhtml
     *            the allowXhtml to set
     */
    public void setAllowXhtml(boolean allowXhtml) {
        this.allowXhtml = allowXhtml;
    }

    /**
     * Builds the value of the <code>Accept</code> request header from the
     * enabled resource kinds. The entries are de-duplicated and sorted (they
     * are collected in a {@link TreeSet}) and joined with
     * <code>", "</code>. The result is the empty string when nothing is
     * enabled.
     */
    private String buildAccept() {
        Set<String> types = new TreeSet<String>();
        if (isAllowRnc()) {
            types.add("application/relax-ng-compact-syntax");
        }
        if (isAllowHtml()) {
            types.add("text/html; q=0.9");
        }
        if (isAllowXhtml()) {
            types.add("application/xhtml+xml");
            types.add("application/xml; q=0.5");
        }
        if (isAcceptAllKnownXmlTypes()) {
            types.add("application/xhtml+xml");
            // types.add("application/atom+xml");
            types.add("image/svg+xml");
            types.add("application/docbook+xml");
            types.add("application/xml; q=0.5");
            types.add("text/xml; q=0.3");
            types.add("*/*; q=0.1");
        }
        if (isAllowGenericXml()) {
            types.add("application/xml; q=0.5");
            types.add("text/xml; q=0.3");
            types.add("*/*; q=0.1");
        }
        StringBuilder buf = new StringBuilder();
        for (Iterator<String> iter = types.iterator(); iter.hasNext();) {
            buf.append(iter.next());
            if (iter.hasNext()) {
                buf.append(", ");
            }
        }
        return buf.toString();
    }

    /**
     * Returns the allowHtml.
     *
     * @return the allowHtml
     */
    public boolean isAllowHtml() {
        return allowHtml;
    }

    /**
     * Tells whether generic XML, RELAX NG compact syntax and XHTML are all
     * disabled.
     *
     * <p>
     * NOTE(review): the acceptAllKnownXmlTypes switch is not consulted here,
     * so with HTML and acceptAllKnownXmlTypes enabled (and the other three
     * disabled) a non-HTML response is rejected before the XML type check
     * runs. Confirm whether that is intended.
     *
     * @return <code>true</code> if none of generic XML, RNC and XHTML is
     *         allowed
     */
    public boolean isOnlyHtmlAllowed() {
        return !isAllowGenericXml() && !isAllowRnc() && !isAllowXhtml();
    }
}