001 /*
002 * Copyright (c) 2005 Henri Sivonen
003 *
004 * Permission is hereby granted, free of charge, to any person obtaining a
005 * copy of this software and associated documentation files (the "Software"),
006 * to deal in the Software without restriction, including without limitation
007 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
008 * and/or sell copies of the Software, and to permit persons to whom the
009 * Software is furnished to do so, subject to the following conditions:
010 *
011 * The above copyright notice and this permission notice shall be included in
012 * all copies or substantial portions of the Software.
013 *
014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
020 * DEALINGS IN THE SOFTWARE.
021 */
022
023 package fi.iki.hsivonen.xml;
024
025 import java.io.IOException;
026 import java.io.InputStream;
027 import java.net.MalformedURLException;
028 import java.util.Iterator;
029 import java.util.Set;
030 import java.util.TreeSet;
031 import java.util.regex.Matcher;
032 import java.util.regex.Pattern;
033
034 import org.apache.commons.httpclient.Header;
035 import org.apache.commons.httpclient.HostConfiguration;
036 import org.apache.commons.httpclient.HttpClient;
037 import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
038 import org.apache.commons.httpclient.cookie.CookiePolicy;
039 import org.apache.commons.httpclient.methods.GetMethod;
040 import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
041 import org.apache.log4j.Logger;
042 import org.xml.sax.EntityResolver;
043 import org.xml.sax.ErrorHandler;
044 import org.xml.sax.InputSource;
045 import org.xml.sax.SAXException;
046 import org.xml.sax.SAXParseException;
047
048 import com.hp.hpl.jena.iri.IRI;
049 import com.hp.hpl.jena.iri.IRIException;
050 import com.hp.hpl.jena.iri.IRIFactory;
051
052 import fi.iki.hsivonen.io.BoundednputStream;
053 import fi.iki.hsivonen.io.ObservableInputStream;
054 import fi.iki.hsivonen.io.StreamObserver;
055
056 /**
057 * @version $Id: PrudentHttpEntityResolver.java,v 1.1 2005/01/08 08:11:26
058 * hsivonen Exp $
059 * @author hsivonen
060 */
061 public class PrudentHttpEntityResolver implements EntityResolver {
062
063 private static final Logger log4j = Logger.getLogger(PrudentHttpEntityResolver.class);
064
065 private static final Pattern CHARSET = Pattern.compile("^\\s*charset\\s*=\\s*(\\S+)\\s*$");
066
067 private static final MultiThreadedHttpConnectionManager manager = new MultiThreadedHttpConnectionManager();
068
069 private static final HttpClient client = new HttpClient(manager);
070
071 private static int maxRequests;
072
073 private int sizeLimit;
074
075 private ErrorHandler errorHandler;
076
077 private int requestsLeft;
078
079 private boolean laxContentType;
080
081 private boolean allowRnc = false;
082
083 private boolean allowHtml = false;
084
085 private boolean allowXhtml = false;
086
087 private boolean acceptAllKnownXmlTypes = false;
088
089 private boolean allowGenericXml = true;
090
091 private IRIFactory iriFactory;
092
093 /**
094 * Sets the timeouts of the HTTP client.
095 *
096 * @param connectionTimeout
097 * timeout until connection established in milliseconds. Zero
098 * means no timeout.
099 * @param socketTimeout
100 * timeout for waiting for data in milliseconds. Zero means no
101 * timeout.
102 */
103 public static void setParams(int connectionTimeout, int socketTimeout,
104 int maxRequests) {
105 HttpConnectionManagerParams hcmp = client.getHttpConnectionManager().getParams();
106 hcmp.setConnectionTimeout(connectionTimeout);
107 hcmp.setSoTimeout(socketTimeout);
108 hcmp.setMaxConnectionsPerHost(HostConfiguration.ANY_HOST_CONFIGURATION,
109 maxRequests);
110 hcmp.setMaxTotalConnections(maxRequests * 2);
111 PrudentHttpEntityResolver.maxRequests = maxRequests;
112 }
113
114 public static void setUserAgent(String ua) {
115 client.getParams().setParameter("http.useragent", ua);
116 }
117
118 /**
119 * @param connectionTimeout
120 * @param socketTimeout
121 * @param sizeLimit
122 */
123 public PrudentHttpEntityResolver(int sizeLimit, boolean laxContentType,
124 ErrorHandler errorHandler) {
125 this.sizeLimit = sizeLimit;
126 this.requestsLeft = maxRequests;
127 this.laxContentType = laxContentType;
128 this.errorHandler = errorHandler;
129 this.iriFactory = new IRIFactory();
130 this.iriFactory.useSpecificationXMLSystemID(true);
131 this.iriFactory.useSchemeSpecificRules("http", true);
132 this.iriFactory.useSchemeSpecificRules("https", true);
133 }
134
135 /**
136 * @see org.xml.sax.EntityResolver#resolveEntity(java.lang.String,
137 * java.lang.String)
138 */
139 public InputSource resolveEntity(String publicId, String systemId)
140 throws SAXException, IOException {
141 if (requestsLeft > -1) {
142 if (requestsLeft == 0) {
143 throw new IOException(
144 "Number of permitted HTTP requests exceeded.");
145 } else {
146 requestsLeft--;
147 }
148 }
149 GetMethod m = null;
150 try {
151 IRI iri;
152 try {
153 iri = iriFactory.construct(systemId);
154 } catch (IRIException e) {
155 IOException ioe = (IOException) new IOException(e.getMessage()).initCause(e);
156 SAXParseException spe = new SAXParseException(e.getMessage(),
157 publicId, systemId, -1, -1, ioe);
158 if (errorHandler != null) {
159 errorHandler.fatalError(spe);
160 }
161 throw spe;
162 }
163 if (!iri.isAbsolute()) {
164 SAXParseException spe = new SAXParseException(
165 "Not an absolute URI.", publicId, systemId, -1, -1,
166 new IOException());
167 if (errorHandler != null) {
168 errorHandler.fatalError(spe);
169 }
170 throw spe;
171 }
172 String scheme = iri.getScheme();
173 if (!("http".equals(scheme) || "https".equals(scheme))) {
174 SAXParseException spe = new SAXParseException(
175 "Unsupported URI scheme: " + scheme, publicId,
176 systemId, -1, -1, new IOException());
177 if (errorHandler != null) {
178 errorHandler.fatalError(spe);
179 }
180 throw spe;
181 }
182 String host = iri.getHost();
183 if ("127.0.0.1".equals(host) || "localhost".equals(host)) {
184 SAXParseException spe = new SAXParseException(
185 "Attempted to connect to localhost.", publicId,
186 systemId, -1, -1, new IOException());
187 if (errorHandler != null) {
188 errorHandler.fatalError(spe);
189 }
190 throw spe;
191 }
192 try {
193 systemId = iri.toASCIIString();
194 } catch (MalformedURLException e) {
195 IOException ioe = (IOException) new IOException(e.getMessage()).initCause(e);
196 SAXParseException spe = new SAXParseException(e.getMessage(),
197 publicId, systemId, -1, -1, ioe);
198 if (errorHandler != null) {
199 errorHandler.fatalError(spe);
200 }
201 throw spe;
202 }
203 try {
204 m = new GetMethod(systemId);
205 } catch (IllegalArgumentException e) {
206 SAXParseException spe = new SAXParseException(
207 e.getMessage(),
208 publicId,
209 systemId,
210 -1,
211 -1,
212 (IOException) new IOException(e.getMessage()).initCause(e));
213 if (errorHandler != null) {
214 errorHandler.fatalError(spe);
215 }
216 throw spe;
217 }
218 m.setFollowRedirects(true);
219 m.getParams().setCookiePolicy(CookiePolicy.IGNORE_COOKIES);
220 m.addRequestHeader("Accept", buildAccept());
221 log4j.info(systemId);
222 client.executeMethod(m);
223 if (m.getStatusCode() != 200) {
224 SAXParseException spe = new SAXParseException(
225 "HTTP resource not retrievable.", publicId,
226 m.getURI().toString(), -1, -1, new IOException());
227 if (errorHandler != null) {
228 errorHandler.fatalError(spe);
229 }
230 throw spe;
231 }
232 long len = m.getResponseContentLength();
233 if (sizeLimit > -1 && len > sizeLimit) {
234 SAXParseException spe = new SAXParseException(
235 "Resource size exceeds limit.", publicId,
236 m.getURI().toString(), -1, -1, new IOException());
237 if (errorHandler != null) {
238 errorHandler.fatalError(spe);
239 }
240 throw spe;
241 }
242 TypedInputSource is = new TypedInputSource();
243 is.setPublicId(publicId);
244 is.setSystemId(m.getURI().toString());
245 Header ct = m.getResponseHeader("Content-Type");
246 if (ct != null) {
247 String val = ct.getValue();
248 String[] params = val.split(";");
249 String type = params[0].trim();
250 boolean wasRnc = false;
251 boolean wasHtml = false;
252 if (isAllowRnc()) {
253 if (rncContentType(type, is)) {
254 wasRnc = true;
255 is.setType("application/relax-ng-compact-syntax");
256 }
257 }
258 if (!wasRnc) {
259 if (isAllowHtml()) {
260 if ("text/html".equals(type)) {
261 is.setType(type);
262 wasHtml = true;
263 } else if (isOnlyHtmlAllowed()) {
264 if (laxContentType && "text/plain".equals(type)) {
265 is.setType(type);
266 wasHtml = true;
267 if (errorHandler != null) {
268 errorHandler.warning(new SAXParseException(
269 "Being lax about non-HTML Content-Type: "
270 + type, is.getPublicId(),
271 is.getSystemId(), -1, -1));
272 }
273 } else {
274 SAXParseException spe = new SAXParseException(
275 "Non-HTML Content-Type: " + type,
276 publicId, m.getURI().toString(), -1,
277 -1, new IOException());
278 if (errorHandler != null) {
279 errorHandler.fatalError(spe);
280 }
281 throw spe;
282 }
283 }
284 }
285 if (!wasHtml && (isAllowGenericXml() || isAllowXhtml() || isAcceptAllKnownXmlTypes())) {
286 if (!xmlContentType(type, is)) {
287 SAXParseException spe = new SAXParseException(
288 "Non-XML Content-Type: " + type, publicId,
289 m.getURI().toString(), -1, -1,
290 new IOException());
291 if (errorHandler != null) {
292 errorHandler.fatalError(spe);
293 }
294 throw spe;
295 } else {
296 is.setType(type);
297 }
298 }
299 }
300 String charset = null;
301 for (int i = 1; i < params.length; i++) {
302 Matcher matcher = CHARSET.matcher(params[i]);
303 if (matcher.matches()) {
304 charset = matcher.group(1);
305 break;
306 }
307 }
308 if (charset != null) {
309 is.setEncoding(charset);
310 } else if (type.startsWith("text/") && !wasHtml) {
311 if (laxContentType) {
312 if (errorHandler != null) {
313 errorHandler.warning(new SAXParseException(
314 "text/* type without a charset parameter seen. Would have defaulted to US-ASCII had the lax option not been chosen.",
315 is.getPublicId(), is.getSystemId(), -1, -1));
316 }
317 } else {
318 is.setEncoding("US-ASCII");
319 if (errorHandler != null) {
320 errorHandler.warning(new SAXParseException(
321 "text/* type without a charset parameter seen. Defaulting to US-ASCII per section 3.1 of RFC 3023.",
322 is.getPublicId(), is.getSystemId(), -1, -1));
323 }
324 }
325 }
326 }
327 final GetMethod meth = m;
328 InputStream stream = m.getResponseBodyAsStream();
329 if (sizeLimit > -1) {
330 stream = new BoundednputStream(stream, sizeLimit);
331 }
332 is.setByteStream(new ObservableInputStream(stream,
333 new StreamObserver() {
334 private final Logger log4j = Logger.getLogger("fi.iki.hsivonen.xml.PrudentEntityResolver.StreamObserver");
335
336 private boolean released = false;
337
338 public void closeCalled() {
339 log4j.debug("closeCalled");
340 if (!released) {
341 log4j.debug("closeCalled, not yet released");
342 released = true;
343 try {
344 meth.releaseConnection();
345 } catch (Exception e) {
346 log4j.debug(
347 "closeCalled, releaseConnection", e);
348 }
349 }
350 }
351
352 public void exceptionOccurred(Exception ex) {
353 if (!released) {
354 released = true;
355 try {
356 meth.abort();
357 } catch (Exception e) {
358 log4j.debug("exceptionOccurred, abort", e);
359 } finally {
360 try {
361 meth.releaseConnection();
362 } catch (Exception e) {
363 log4j.debug(
364 "exceptionOccurred, releaseConnection",
365 e);
366 }
367 }
368 }
369 }
370
371 public void finalizerCalled() {
372 if (!released) {
373 released = true;
374 try {
375 meth.abort();
376 } catch (Exception e) {
377 log4j.debug("finalizerCalled, abort", e);
378 } finally {
379 try {
380 meth.releaseConnection();
381 } catch (Exception e) {
382 log4j.debug(
383 "finalizerCalled, releaseConnection",
384 e);
385 }
386 }
387 }
388 }
389
390 }));
391 return is;
392 } catch (IOException e) {
393 try {
394 m.abort();
395 } catch (Exception ex) {
396 log4j.debug("abort", ex);
397 } finally {
398 try {
399 m.releaseConnection();
400 } catch (Exception ex) {
401 log4j.debug("releaseConnection", ex);
402 }
403 }
404 throw e;
405 } catch (SAXException e) {
406 try {
407 m.abort();
408 } catch (Exception ex) {
409 log4j.debug("abort", ex);
410 } finally {
411 try {
412 m.releaseConnection();
413 } catch (Exception ex) {
414 log4j.debug("releaseConnection", ex);
415 }
416 }
417 throw e;
418 } catch (RuntimeException e) {
419 try {
420 m.abort();
421 } catch (Exception ex) {
422 log4j.debug("abort", ex);
423 } finally {
424 try {
425 m.releaseConnection();
426 } catch (Exception ex) {
427 log4j.debug("releaseConnection", ex);
428 }
429 }
430 throw e;
431 }
432 }
433
434 protected boolean xmlContentType(String type, InputSource is)
435 throws SAXException {
436 if ("application/xhtml-voice+xml".equals(type)) {
437 if (errorHandler != null) {
438 errorHandler.warning(new SAXParseException(
439 "application/xhtml-voice+xml is an obsolete type.",
440 is.getPublicId(), is.getSystemId(), -1, -1));
441 }
442 }
443 boolean typeOk = "application/xml".equals(type)
444 || "text/xml".equals(type) || type.endsWith("+xml")
445 || "application/xml-external-parsed-entity".equals(type)
446 || "text/xml-external-parsed-entity".equals(type)
447 || "application/xml-dtd".equals(type)
448 || "application/octet-stream".equals(type);
449 if (!typeOk && laxContentType) {
450 boolean laxOk = "text/plain".equals(type)
451 || "text/html".equals(type) || "text/xsl".equals(type);
452 if (laxOk && errorHandler != null) {
453 errorHandler.warning(new SAXParseException(
454 "Being lax about non-XML Content-Type: " + type,
455 is.getPublicId(), is.getSystemId(), -1, -1));
456 }
457 return laxOk;
458 } else {
459 return typeOk;
460 }
461 }
462
463 protected boolean rncContentType(String type, InputSource is)
464 throws SAXException {
465 boolean typeOk = "application/relax-ng-compact-syntax".equals(type);
466 if (!typeOk) {
467 typeOk = "application/vnd.relax-ng.rnc".equals(type);
468 if (typeOk && errorHandler != null) {
469 errorHandler.warning(new SAXParseException(
470 "application/vnd.relax-ng.rnc is an unregistered type. application/relax-ng-compact-syntax is the registered type.",
471 is.getPublicId(), is.getSystemId(), -1, -1));
472 }
473 }
474 if (!typeOk) {
475 typeOk = "application/octet-stream".equals(type)
476 && is.getSystemId().endsWith(".rnc");
477 }
478 if (!typeOk && laxContentType) {
479 boolean laxOk = "text/plain".equals(type)
480 && is.getSystemId().endsWith(".rnc");
481 if (laxOk && errorHandler != null) {
482 errorHandler.warning(new SAXParseException(
483 "Being lax about non-RNC Content-Type: " + type,
484 is.getPublicId(), is.getSystemId(), -1, -1));
485 }
486 return laxOk;
487 } else {
488 return typeOk;
489 }
490 }
491
492 /**
493 * @return Returns the allowRnc.
494 */
495 public boolean isAllowRnc() {
496 return allowRnc;
497 }
498
499 /**
500 * @param allowRnc
501 * The allowRnc to set.
502 */
503 public void setAllowRnc(boolean allowRnc) {
504 this.allowRnc = allowRnc;
505 }
506
507 /**
508 * @param b
509 */
510 public void setAllowHtml(boolean expectHtml) {
511 this.allowHtml = expectHtml;
512 }
513
514 /**
515 * Returns the acceptAllKnownXmlTypes.
516 *
517 * @return the acceptAllKnownXmlTypes
518 */
519 public boolean isAcceptAllKnownXmlTypes() {
520 return acceptAllKnownXmlTypes;
521 }
522
523 /**
524 * Sets the acceptAllKnownXmlTypes.
525 *
526 * @param acceptAllKnownXmlTypes
527 * the acceptAllKnownXmlTypes to set
528 */
529 public void setAcceptAllKnownXmlTypes(boolean acceptAllKnownXmlTypes) {
530 this.acceptAllKnownXmlTypes = acceptAllKnownXmlTypes;
531 }
532
533 /**
534 * Returns the allowGenericXml.
535 *
536 * @return the allowGenericXml
537 */
538 public boolean isAllowGenericXml() {
539 return allowGenericXml;
540 }
541
542 /**
543 * Sets the allowGenericXml.
544 *
545 * @param allowGenericXml
546 * the allowGenericXml to set
547 */
548 public void setAllowGenericXml(boolean allowGenericXml) {
549 this.allowGenericXml = allowGenericXml;
550 }
551
552 /**
553 * Returns the allowXhtml.
554 *
555 * @return the allowXhtml
556 */
557 public boolean isAllowXhtml() {
558 return allowXhtml;
559 }
560
561 /**
562 * Sets the allowXhtml.
563 *
564 * @param allowXhtml
565 * the allowXhtml to set
566 */
567 public void setAllowXhtml(boolean allowXhtml) {
568 this.allowXhtml = allowXhtml;
569 }
570
571 private String buildAccept() {
572 Set<String> types = new TreeSet<String>();
573 if (isAllowRnc()) {
574 types.add("application/relax-ng-compact-syntax");
575 }
576 if (isAllowHtml()) {
577 types.add("text/html; q=0.9");
578 }
579 if (isAllowXhtml()) {
580 types.add("application/xhtml+xml");
581 types.add("application/xml; q=0.5");
582 }
583 if (isAcceptAllKnownXmlTypes()) {
584 types.add("application/xhtml+xml");
585 // types.add("application/atom+xml");
586 types.add("image/svg+xml");
587 types.add("application/docbook+xml");
588 types.add("application/xml; q=0.5");
589 types.add("text/xml; q=0.3");
590 types.add("*/*; q=0.1");
591 }
592 if (isAllowGenericXml()) {
593 types.add("application/xml; q=0.5");
594 types.add("text/xml; q=0.3");
595 types.add("*/*; q=0.1");
596 }
597 StringBuilder buf = new StringBuilder();
598 for (Iterator<String> iter = types.iterator(); iter.hasNext();) {
599 String str = iter.next();
600 buf.append(str);
601 buf.append(", ");
602 }
603 for (int i = 0; i < 2; i++) {
604 int len = buf.length();
605 if (len > 0) {
606 buf.deleteCharAt(len - 1);
607 }
608 }
609 return buf.toString();
610 }
611
612 /**
613 * Returns the allowHtml.
614 *
615 * @return the allowHtml
616 */
617 public boolean isAllowHtml() {
618 return allowHtml;
619 }
620
621 public boolean isOnlyHtmlAllowed() {
622 return !isAllowGenericXml() && !isAllowRnc() && !isAllowXhtml();
623 }
624 }