001    /*
002     * Copyright (c) 2005 Henri Sivonen
003     *
004     * Permission is hereby granted, free of charge, to any person obtaining a 
005     * copy of this software and associated documentation files (the "Software"), 
006     * to deal in the Software without restriction, including without limitation 
007     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
008     * and/or sell copies of the Software, and to permit persons to whom the 
009     * Software is furnished to do so, subject to the following conditions:
010     *
011     * The above copyright notice and this permission notice shall be included in 
012     * all copies or substantial portions of the Software.
013     *
014     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
015     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
016     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
017     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
018     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
019     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
020     * DEALINGS IN THE SOFTWARE.
021     */
022    
023    package fi.iki.hsivonen.xml;
024    
025    import java.io.IOException;
026    import java.io.InputStream;
027    import java.net.MalformedURLException;
028    import java.util.Iterator;
029    import java.util.Set;
030    import java.util.TreeSet;
031    import java.util.regex.Matcher;
032    import java.util.regex.Pattern;
033    
034    import org.apache.commons.httpclient.Header;
035    import org.apache.commons.httpclient.HostConfiguration;
036    import org.apache.commons.httpclient.HttpClient;
037    import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
038    import org.apache.commons.httpclient.cookie.CookiePolicy;
039    import org.apache.commons.httpclient.methods.GetMethod;
040    import org.apache.commons.httpclient.params.HttpConnectionManagerParams;
041    import org.apache.log4j.Logger;
042    import org.xml.sax.EntityResolver;
043    import org.xml.sax.ErrorHandler;
044    import org.xml.sax.InputSource;
045    import org.xml.sax.SAXException;
046    import org.xml.sax.SAXParseException;
047    
048    import com.hp.hpl.jena.iri.IRI;
049    import com.hp.hpl.jena.iri.IRIException;
050    import com.hp.hpl.jena.iri.IRIFactory;
051    
052    import fi.iki.hsivonen.io.BoundednputStream;
053    import fi.iki.hsivonen.io.ObservableInputStream;
054    import fi.iki.hsivonen.io.StreamObserver;
055    
056    /**
057     * @version $Id: PrudentHttpEntityResolver.java,v 1.1 2005/01/08 08:11:26
058     *          hsivonen Exp $
059     * @author hsivonen
060     */
061    public class PrudentHttpEntityResolver implements EntityResolver {
062    
063        private static final Logger log4j = Logger.getLogger(PrudentHttpEntityResolver.class);
064    
065        private static final Pattern CHARSET = Pattern.compile("^\\s*charset\\s*=\\s*(\\S+)\\s*$");
066    
067        private static final MultiThreadedHttpConnectionManager manager = new MultiThreadedHttpConnectionManager();
068    
069        private static final HttpClient client = new HttpClient(manager);
070    
071        private static int maxRequests;
072    
073        private int sizeLimit;
074    
075        private ErrorHandler errorHandler;
076    
077        private int requestsLeft;
078    
079        private boolean laxContentType;
080    
081        private boolean allowRnc = false;
082    
083        private boolean allowHtml = false;
084    
085        private boolean allowXhtml = false;
086    
087        private boolean acceptAllKnownXmlTypes = false;
088    
089        private boolean allowGenericXml = true;
090    
091        private IRIFactory iriFactory;
092    
093        /**
094         * Sets the timeouts of the HTTP client.
095         * 
096         * @param connectionTimeout
097         *            timeout until connection established in milliseconds. Zero
098         *            means no timeout.
099         * @param socketTimeout
100         *            timeout for waiting for data in milliseconds. Zero means no
101         *            timeout.
102         */
103        public static void setParams(int connectionTimeout, int socketTimeout,
104                int maxRequests) {
105            HttpConnectionManagerParams hcmp = client.getHttpConnectionManager().getParams();
106            hcmp.setConnectionTimeout(connectionTimeout);
107            hcmp.setSoTimeout(socketTimeout);
108            hcmp.setMaxConnectionsPerHost(HostConfiguration.ANY_HOST_CONFIGURATION,
109                    maxRequests);
110            hcmp.setMaxTotalConnections(maxRequests * 2);
111            PrudentHttpEntityResolver.maxRequests = maxRequests;
112        }
113    
114        public static void setUserAgent(String ua) {
115            client.getParams().setParameter("http.useragent", ua);
116        }
117    
118        /**
119         * @param connectionTimeout
120         * @param socketTimeout
121         * @param sizeLimit
122         */
123        public PrudentHttpEntityResolver(int sizeLimit, boolean laxContentType,
124                ErrorHandler errorHandler) {
125            this.sizeLimit = sizeLimit;
126            this.requestsLeft = maxRequests;
127            this.laxContentType = laxContentType;
128            this.errorHandler = errorHandler;
129            this.iriFactory = new IRIFactory();
130            this.iriFactory.useSpecificationXMLSystemID(true);
131            this.iriFactory.useSchemeSpecificRules("http", true);
132            this.iriFactory.useSchemeSpecificRules("https", true);
133        }
134    
135        /**
136         * @see org.xml.sax.EntityResolver#resolveEntity(java.lang.String,
137         *      java.lang.String)
138         */
139        public InputSource resolveEntity(String publicId, String systemId)
140                throws SAXException, IOException {
141            if (requestsLeft > -1) {
142                if (requestsLeft == 0) {
143                    throw new IOException(
144                            "Number of permitted HTTP requests exceeded.");
145                } else {
146                    requestsLeft--;
147                }
148            }
149            GetMethod m = null;
150            try {
151                IRI iri;
152                try {
153                    iri = iriFactory.construct(systemId);
154                } catch (IRIException e) {
155                    IOException ioe = (IOException) new IOException(e.getMessage()).initCause(e);
156                    SAXParseException spe = new SAXParseException(e.getMessage(),
157                            publicId, systemId, -1, -1, ioe);
158                    if (errorHandler != null) {
159                        errorHandler.fatalError(spe);
160                    }
161                    throw spe;
162                }
163                if (!iri.isAbsolute()) {
164                    SAXParseException spe = new SAXParseException(
165                            "Not an absolute URI.", publicId, systemId, -1, -1,
166                            new IOException());
167                    if (errorHandler != null) {
168                        errorHandler.fatalError(spe);
169                    }
170                    throw spe;
171                }
172                String scheme = iri.getScheme();
173                if (!("http".equals(scheme) || "https".equals(scheme))) {
174                    SAXParseException spe = new SAXParseException(
175                            "Unsupported URI scheme: " + scheme, publicId,
176                            systemId, -1, -1, new IOException());
177                    if (errorHandler != null) {
178                        errorHandler.fatalError(spe);
179                    }
180                    throw spe;
181                }
182                String host = iri.getHost();
183                if ("127.0.0.1".equals(host) || "localhost".equals(host)) {
184                    SAXParseException spe = new SAXParseException(
185                            "Attempted to connect to localhost.", publicId,
186                            systemId, -1, -1, new IOException());
187                    if (errorHandler != null) {
188                        errorHandler.fatalError(spe);
189                    }
190                    throw spe;
191                }
192                try {
193                    systemId = iri.toASCIIString();
194                } catch (MalformedURLException e) {
195                    IOException ioe = (IOException) new IOException(e.getMessage()).initCause(e);
196                    SAXParseException spe = new SAXParseException(e.getMessage(),
197                            publicId, systemId, -1, -1, ioe);
198                    if (errorHandler != null) {
199                        errorHandler.fatalError(spe);
200                    }
201                    throw spe;
202                }
203                try {
204                    m = new GetMethod(systemId);
205                } catch (IllegalArgumentException e) {
206                    SAXParseException spe = new SAXParseException(
207                            e.getMessage(),
208                            publicId,
209                            systemId,
210                            -1,
211                            -1,
212                            (IOException) new IOException(e.getMessage()).initCause(e));
213                    if (errorHandler != null) {
214                        errorHandler.fatalError(spe);
215                    }
216                    throw spe;
217                }
218                m.setFollowRedirects(true);
219                m.getParams().setCookiePolicy(CookiePolicy.IGNORE_COOKIES);
220                m.addRequestHeader("Accept", buildAccept());
221                log4j.info(systemId);
222                client.executeMethod(m);
223                if (m.getStatusCode() != 200) {
224                    SAXParseException spe = new SAXParseException(
225                            "HTTP resource not retrievable.", publicId,
226                            m.getURI().toString(), -1, -1, new IOException());
227                    if (errorHandler != null) {
228                        errorHandler.fatalError(spe);
229                    }
230                    throw spe;
231                }
232                long len = m.getResponseContentLength();
233                if (sizeLimit > -1 && len > sizeLimit) {
234                    SAXParseException spe = new SAXParseException(
235                            "Resource size exceeds limit.", publicId,
236                            m.getURI().toString(), -1, -1, new IOException());
237                    if (errorHandler != null) {
238                        errorHandler.fatalError(spe);
239                    }
240                    throw spe;
241                }
242                TypedInputSource is = new TypedInputSource();
243                is.setPublicId(publicId);
244                is.setSystemId(m.getURI().toString());
245                Header ct = m.getResponseHeader("Content-Type");
246                if (ct != null) {
247                    String val = ct.getValue();
248                    String[] params = val.split(";");
249                    String type = params[0].trim();
250                    boolean wasRnc = false;
251                    boolean wasHtml = false;
252                    if (isAllowRnc()) {
253                        if (rncContentType(type, is)) {
254                            wasRnc = true;
255                            is.setType("application/relax-ng-compact-syntax");
256                        }
257                    }
258                    if (!wasRnc) {
259                        if (isAllowHtml()) {
260                            if ("text/html".equals(type)) {
261                                is.setType(type);
262                                wasHtml = true;
263                            } else if (isOnlyHtmlAllowed()) {
264                                if (laxContentType && "text/plain".equals(type)) {
265                                    is.setType(type);
266                                    wasHtml = true;
267                                    if (errorHandler != null) {
268                                        errorHandler.warning(new SAXParseException(
269                                                "Being lax about non-HTML Content-Type: "
270                                                        + type, is.getPublicId(),
271                                                is.getSystemId(), -1, -1));
272                                    }
273                                } else {
274                                    SAXParseException spe = new SAXParseException(
275                                            "Non-HTML Content-Type: " + type,
276                                            publicId, m.getURI().toString(), -1,
277                                            -1, new IOException());
278                                    if (errorHandler != null) {
279                                        errorHandler.fatalError(spe);
280                                    }
281                                    throw spe;
282                                }
283                            }
284                        } 
285                        if (!wasHtml && (isAllowGenericXml() || isAllowXhtml() || isAcceptAllKnownXmlTypes())) {
286                            if (!xmlContentType(type, is)) {
287                                SAXParseException spe = new SAXParseException(
288                                        "Non-XML Content-Type: " + type, publicId,
289                                        m.getURI().toString(), -1, -1,
290                                        new IOException());
291                                if (errorHandler != null) {
292                                    errorHandler.fatalError(spe);
293                                }
294                                throw spe;
295                            } else {
296                                is.setType(type);
297                            }
298                        }
299                    }
300                    String charset = null;
301                    for (int i = 1; i < params.length; i++) {
302                        Matcher matcher = CHARSET.matcher(params[i]);
303                        if (matcher.matches()) {
304                            charset = matcher.group(1);
305                            break;
306                        }
307                    }
308                    if (charset != null) {
309                        is.setEncoding(charset);
310                    } else if (type.startsWith("text/") && !wasHtml) {
311                        if (laxContentType) {
312                            if (errorHandler != null) {
313                                errorHandler.warning(new SAXParseException(
314                                        "text/* type without a charset parameter seen. Would have defaulted to US-ASCII had the lax option not been chosen.",
315                                        is.getPublicId(), is.getSystemId(), -1, -1));
316                            }
317                        } else {
318                            is.setEncoding("US-ASCII");
319                            if (errorHandler != null) {
320                                errorHandler.warning(new SAXParseException(
321                                        "text/* type without a charset parameter seen. Defaulting to US-ASCII per section 3.1 of RFC 3023.",
322                                        is.getPublicId(), is.getSystemId(), -1, -1));
323                            }
324                        }
325                    }
326                }
327                final GetMethod meth = m;
328                InputStream stream = m.getResponseBodyAsStream();
329                if (sizeLimit > -1) {
330                    stream = new BoundednputStream(stream, sizeLimit);
331                }
332                is.setByteStream(new ObservableInputStream(stream,
333                        new StreamObserver() {
334                            private final Logger log4j = Logger.getLogger("fi.iki.hsivonen.xml.PrudentEntityResolver.StreamObserver");
335    
336                            private boolean released = false;
337    
338                            public void closeCalled() {
339                                log4j.debug("closeCalled");
340                                if (!released) {
341                                    log4j.debug("closeCalled, not yet released");
342                                    released = true;
343                                    try {
344                                        meth.releaseConnection();
345                                    } catch (Exception e) {
346                                        log4j.debug(
347                                                "closeCalled, releaseConnection", e);
348                                    }
349                                }
350                            }
351    
352                            public void exceptionOccurred(Exception ex) {
353                                if (!released) {
354                                    released = true;
355                                    try {
356                                        meth.abort();
357                                    } catch (Exception e) {
358                                        log4j.debug("exceptionOccurred, abort", e);
359                                    } finally {
360                                        try {
361                                            meth.releaseConnection();
362                                        } catch (Exception e) {
363                                            log4j.debug(
364                                                    "exceptionOccurred, releaseConnection",
365                                                    e);
366                                        }
367                                    }
368                                }
369                            }
370    
371                            public void finalizerCalled() {
372                                if (!released) {
373                                    released = true;
374                                    try {
375                                        meth.abort();
376                                    } catch (Exception e) {
377                                        log4j.debug("finalizerCalled, abort", e);
378                                    } finally {
379                                        try {
380                                            meth.releaseConnection();
381                                        } catch (Exception e) {
382                                            log4j.debug(
383                                                    "finalizerCalled, releaseConnection",
384                                                    e);
385                                        }
386                                    }
387                                }
388                            }
389    
390                        }));
391                return is;
392            } catch (IOException e) {
393                try {
394                    m.abort();
395                } catch (Exception ex) {
396                    log4j.debug("abort", ex);
397                } finally {
398                    try {
399                        m.releaseConnection();
400                    } catch (Exception ex) {
401                        log4j.debug("releaseConnection", ex);
402                    }
403                }
404                throw e;
405            } catch (SAXException e) {
406                try {
407                    m.abort();
408                } catch (Exception ex) {
409                    log4j.debug("abort", ex);
410                } finally {
411                    try {
412                        m.releaseConnection();
413                    } catch (Exception ex) {
414                        log4j.debug("releaseConnection", ex);
415                    }
416                }
417                throw e;
418            } catch (RuntimeException e) {
419                try {
420                    m.abort();
421                } catch (Exception ex) {
422                    log4j.debug("abort", ex);
423                } finally {
424                    try {
425                        m.releaseConnection();
426                    } catch (Exception ex) {
427                        log4j.debug("releaseConnection", ex);
428                    }
429                }
430                throw e;
431            }
432        }
433    
434        protected boolean xmlContentType(String type, InputSource is)
435                throws SAXException {
436            if ("application/xhtml-voice+xml".equals(type)) {
437                if (errorHandler != null) {
438                    errorHandler.warning(new SAXParseException(
439                            "application/xhtml-voice+xml is an obsolete type.",
440                            is.getPublicId(), is.getSystemId(), -1, -1));
441                }
442            }
443            boolean typeOk = "application/xml".equals(type)
444                    || "text/xml".equals(type) || type.endsWith("+xml")
445                    || "application/xml-external-parsed-entity".equals(type)
446                    || "text/xml-external-parsed-entity".equals(type)
447                    || "application/xml-dtd".equals(type)
448                    || "application/octet-stream".equals(type);
449            if (!typeOk && laxContentType) {
450                boolean laxOk = "text/plain".equals(type)
451                        || "text/html".equals(type) || "text/xsl".equals(type);
452                if (laxOk && errorHandler != null) {
453                    errorHandler.warning(new SAXParseException(
454                            "Being lax about non-XML Content-Type: " + type,
455                            is.getPublicId(), is.getSystemId(), -1, -1));
456                }
457                return laxOk;
458            } else {
459                return typeOk;
460            }
461        }
462    
463        protected boolean rncContentType(String type, InputSource is)
464                throws SAXException {
465            boolean typeOk = "application/relax-ng-compact-syntax".equals(type);
466            if (!typeOk) {
467                typeOk = "application/vnd.relax-ng.rnc".equals(type);
468                if (typeOk && errorHandler != null) {
469                    errorHandler.warning(new SAXParseException(
470                            "application/vnd.relax-ng.rnc is an unregistered type. application/relax-ng-compact-syntax is the registered type.",
471                            is.getPublicId(), is.getSystemId(), -1, -1));
472                }
473            }
474            if (!typeOk) {
475                typeOk = "application/octet-stream".equals(type)
476                        && is.getSystemId().endsWith(".rnc");
477            }
478            if (!typeOk && laxContentType) {
479                boolean laxOk = "text/plain".equals(type)
480                        && is.getSystemId().endsWith(".rnc");
481                if (laxOk && errorHandler != null) {
482                    errorHandler.warning(new SAXParseException(
483                            "Being lax about non-RNC Content-Type: " + type,
484                            is.getPublicId(), is.getSystemId(), -1, -1));
485                }
486                return laxOk;
487            } else {
488                return typeOk;
489            }
490        }
491    
492        /**
493         * @return Returns the allowRnc.
494         */
495        public boolean isAllowRnc() {
496            return allowRnc;
497        }
498    
499        /**
500         * @param allowRnc
501         *            The allowRnc to set.
502         */
503        public void setAllowRnc(boolean allowRnc) {
504            this.allowRnc = allowRnc;
505        }
506    
507        /**
508         * @param b
509         */
510        public void setAllowHtml(boolean expectHtml) {
511            this.allowHtml = expectHtml;
512        }
513    
514        /**
515         * Returns the acceptAllKnownXmlTypes.
516         * 
517         * @return the acceptAllKnownXmlTypes
518         */
519        public boolean isAcceptAllKnownXmlTypes() {
520            return acceptAllKnownXmlTypes;
521        }
522    
523        /**
524         * Sets the acceptAllKnownXmlTypes.
525         * 
526         * @param acceptAllKnownXmlTypes
527         *            the acceptAllKnownXmlTypes to set
528         */
529        public void setAcceptAllKnownXmlTypes(boolean acceptAllKnownXmlTypes) {
530            this.acceptAllKnownXmlTypes = acceptAllKnownXmlTypes;
531        }
532    
533        /**
534         * Returns the allowGenericXml.
535         * 
536         * @return the allowGenericXml
537         */
538        public boolean isAllowGenericXml() {
539            return allowGenericXml;
540        }
541    
542        /**
543         * Sets the allowGenericXml.
544         * 
545         * @param allowGenericXml
546         *            the allowGenericXml to set
547         */
548        public void setAllowGenericXml(boolean allowGenericXml) {
549            this.allowGenericXml = allowGenericXml;
550        }
551    
552        /**
553         * Returns the allowXhtml.
554         * 
555         * @return the allowXhtml
556         */
557        public boolean isAllowXhtml() {
558            return allowXhtml;
559        }
560    
561        /**
562         * Sets the allowXhtml.
563         * 
564         * @param allowXhtml
565         *            the allowXhtml to set
566         */
567        public void setAllowXhtml(boolean allowXhtml) {
568            this.allowXhtml = allowXhtml;
569        }
570    
571        private String buildAccept() {
572            Set<String> types = new TreeSet<String>();
573            if (isAllowRnc()) {
574                types.add("application/relax-ng-compact-syntax");
575            }
576            if (isAllowHtml()) {
577                types.add("text/html; q=0.9");
578            }
579            if (isAllowXhtml()) {
580                types.add("application/xhtml+xml");
581                types.add("application/xml; q=0.5");
582            }
583            if (isAcceptAllKnownXmlTypes()) {
584                types.add("application/xhtml+xml");
585    //            types.add("application/atom+xml");
586                types.add("image/svg+xml");
587                types.add("application/docbook+xml");
588                types.add("application/xml; q=0.5");
589                types.add("text/xml; q=0.3");
590                types.add("*/*; q=0.1");
591            }
592            if (isAllowGenericXml()) {
593                types.add("application/xml; q=0.5");
594                types.add("text/xml; q=0.3");
595                types.add("*/*; q=0.1");
596            }
597            StringBuilder buf = new StringBuilder();
598            for (Iterator<String> iter = types.iterator(); iter.hasNext();) {
599                String str = iter.next();
600                buf.append(str);
601                buf.append(", ");
602            }
603            for (int i = 0; i < 2; i++) {
604                int len = buf.length();
605                if (len > 0) {
606                    buf.deleteCharAt(len - 1);
607                }
608            }
609            return buf.toString();
610        }
611    
612        /**
613         * Returns the allowHtml.
614         * 
615         * @return the allowHtml
616         */
617        public boolean isAllowHtml() {
618            return allowHtml;
619        }
620    
621        public boolean isOnlyHtmlAllowed() {
622            return !isAllowGenericXml() && !isAllowRnc() && !isAllowXhtml();
623        }
624    }