001    /*
002     * Copyright (c) 2003-2005 Henri Sivonen
003     *
004     * Permission is hereby granted, free of charge, to any person obtaining a 
005     * copy of this software and associated documentation files (the "Software"), 
006     * to deal in the Software without restriction, including without limitation 
007     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
008     * and/or sell copies of the Software, and to permit persons to whom the 
009     * Software is furnished to do so, subject to the following conditions:
010     *
011     * The above copyright notice and this permission notice shall be included in 
012     * all copies or substantial portions of the Software.
013     *
014     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
015     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
016     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
017     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
018     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
019     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
020     * DEALINGS IN THE SOFTWARE.
021     */
022    
023    package fi.iki.hsivonen.xml;
024    
025    import java.io.IOException;
026    import java.io.OutputStream;
027    import java.util.Locale;
028    import java.util.NoSuchElementException;
029    import java.util.StringTokenizer;
030    
031    import javax.xml.parsers.DocumentBuilder;
032    import javax.xml.parsers.DocumentBuilderFactory;
033    import javax.xml.parsers.ParserConfigurationException;
034    
035    import org.w3c.dom.Document;
036    import org.w3c.dom.Element;
037    import org.w3c.dom.Node;
038    import org.xml.sax.ErrorHandler;
039    import org.xml.sax.SAXException;
040    import org.xml.sax.XMLReader;
041    
042    import fi.iki.hsivonen.gnu.xml.pipeline.DomConsumer;
043    import fi.iki.hsivonen.schemas.dtd.DTDCatalog;
044    import fi.karppinen.xml.ContentHandlerEventConsumer;
045    import gnu.xml.dom.DomDocument;
046    import gnu.xml.pipeline.NSFilter;
047    import gnu.xml.pipeline.TextConsumer;
048    import gnu.xml.util.DomParser;
049    
050    /**
051     * A collection of utility methods for working with the DOM.
052     *
053     * @author hsivonen
054     */
055    public class DOMUtils {
056    
057        /**
058         * Finds the first occurrence of an element in the subtree rooted at 
059         * <code>node</code>
060         *
061         * @param node the root of the subtree to search
062         * @param namespace the namespace URI of the element being seached
063         * @param localName the local name of the element being seached
064         *
065         * @return the first occurrence of the named element or <code>null</code>
066         *         if not found
067         */
068        public static final Element findElement(Node node, String namespace,
069                String localName) {
070            Node current = node;
071            Node next;
072            for (;;) {
073                switch (current.getNodeType()) {
074                    case Node.ELEMENT_NODE:
075                        if (localName.equals(current.getLocalName())
076                                && namespace.equals(current.getNamespaceURI())) {
077                            return (Element) current;
078                        }
079                    // fall through
080                    case Node.DOCUMENT_FRAGMENT_NODE:
081                    case Node.DOCUMENT_NODE:
082                        if ((next = current.getFirstChild()) != null) {
083                            current = next;
084                            continue;
085                        }
086                }
087                for (;;) {
088                    if ((next = current.getNextSibling()) != null) {
089                        current = next;
090                        break;
091                    }
092                    current = current.getParentNode();
093                    if (current == node)
094                        return null;
095                }
096            }
097        }
098    
099        /**
100         * Finds an element of that has an attribute called <code>id</code> which 
101         * has the given value and is not in a namespace. The IDness of the 
102         * attribute is based on the attribute name--not on the DTD.
103         * 
104         * @param node the root of the subtree to search
105         * @param id the value of the id attribute
106         * @return the first element that has the specified attribute
107         */
108        public static final Element getElementById(Node node, String id) {
109            Node current = node;
110            Node next;
111            for (;;) {
112                switch (current.getNodeType()) {
113                    case Node.ELEMENT_NODE:
114                        Element elt = (Element) current;
115                        if (id.equals(elt.getAttribute("id"))) {
116                            return elt;
117                        }
118                    // fall through
119                    case Node.DOCUMENT_FRAGMENT_NODE:
120                    case Node.DOCUMENT_NODE:
121                        if ((next = current.getFirstChild()) != null) {
122                            current = next;
123                            continue;
124                        }
125                }
126                for (;;) {
127                    if ((next = current.getNextSibling()) != null) {
128                        current = next;
129                        break;
130                    }
131                    current = current.getParentNode();
132                    if (current == node)
133                        return null;
134                }
135            }
136        }
137    
138        /**
139         * Returns the white space-normalized text content of the subtree rooted at 
140         * <code>node</code>.
141         *
142         * @param node the subtree
143         * @return the white space-normalized text content
144         */
145        public static final String textContent(Node node) {
146            StringBuilder buf = new StringBuilder();
147            boolean lastIsWhitespace = true;
148            Node current = node;
149            Node next;
150            for (;;) {
151                switch (current.getNodeType()) {
152                    case Node.TEXT_NODE:
153                    case Node.CDATA_SECTION_NODE:
154                        String text = current.getNodeValue();
155                        for (int i = 0; i < text.length(); i++) {
156                            char c = text.charAt(i);
157                            if (c == ' ' || c == '\t' || c == '\n') {
158                                if (!lastIsWhitespace) {
159                                    buf.append(' ');
160                                    lastIsWhitespace = true;
161                                }
162                            } else {
163                                buf.append(c);
164                                lastIsWhitespace = false;
165                            }
166                        }
167                        break;
168                    case Node.ELEMENT_NODE:
169                    case Node.DOCUMENT_FRAGMENT_NODE:
170                    case Node.DOCUMENT_NODE:
171                        if ((next = current.getFirstChild()) != null) {
172                            current = next;
173                            continue;
174                        }
175                        break;
176                }
177                for (;;) {
178                    if ((next = current.getNextSibling()) != null) {
179                        current = next;
180                        break;
181                    }
182                    current = current.getParentNode();
183                    if (current == node) {
184                        if (buf.charAt(buf.length() - 1) == ' ') {
185                            buf.deleteCharAt(buf.length() - 1);
186                        }
187                        return buf.toString();
188                    }
189                }
190            }
191        }
192    
193        /**
194         * Instantiates a <code>DocumentBuilder</code> which is non-validating, 
195         * is namespaces aware, expands entities using the local 
196         * <code>DTDCatalog</code>, doesn't make arbitrary network connections, 
197         * coalesces CDATA sections and ignores comments.
198         *
199         * @return a configured <code>DocumentBuilder</code>
200         */
201        public static final DocumentBuilder newNonvalidatingDocumentBuilder() {
202            DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
203            factory.setNamespaceAware(true);
204            factory.setExpandEntityReferences(true);
205            factory.setCoalescing(true);
206            factory.setIgnoringComments(true);
207            factory.setIgnoringElementContentWhitespace(false);
208            factory.setValidating(false);
209            try {
210                DocumentBuilder builder = factory.newDocumentBuilder();
211                builder.setEntityResolver(DTDCatalog.getInstance());
212                builder.setErrorHandler(new SilentDraconianErrorHandler());
213                return builder;
214            } catch (ParserConfigurationException e) {
215                throw new RuntimeException(e);
216            }
217        }
218    
219        /**
220         * Checks whether a node is a text or CDATA node consisting of white 
221         * space only.
222         *
223         * @param node the node to examine
224         *
225         * @return <code>true</code> if it is a white space node and 
226         * <code>false</code> otherwise
227         */
228        public static final boolean isWhiteSpace(Node node) {
229            String value = node.getNodeValue();
230            for (int i = 0; i < value.length(); i++) {
231                char c = value.charAt(i);
232                if (!(c == ' ' || c == '\t' || c == '\n')) {
233                    return false;
234                }
235            }
236            return true;
237        }
238    
239        public static final String language(Node node) {
240            for (;;) {
241                if (node == null) {
242                    return "";
243                }
244                switch (node.getNodeType()) {
245                    case Node.ELEMENT_NODE:
246                        Element elt = (Element)node;
247                        if (elt.hasAttributeNS(
248                                "http://www.w3.org/XML/1998/namespace", "lang")) {
249                            return elt.getAttributeNS(
250                                    "http://www.w3.org/XML/1998/namespace", "lang");
251                        } else {
252                            node = node.getParentNode();
253                        }
254                        break;
255                    case Node.CDATA_SECTION_NODE:
256                    case Node.TEXT_NODE:
257                        node = node.getParentNode();
258                        break;
259                    default:
260                        return "";
261                }
262            }
263        }
264    
265        public static final Locale languageAsLocale(Node node) {
266            String lang = language(node);
267            if ("".equals(lang)) {
268                return null;
269            }
270            StringTokenizer tok = new StringTokenizer(lang, "-");
271            String language = null;
272            String country = null;
273            try {
274                language = tok.nextToken();
275                country = tok.nextToken();
276            } catch (NoSuchElementException e) {
277            }
278            if (language == null) {
279                return null;
280            }
281            if (country == null) {
282                return new Locale(language);
283            } else {
284                return new Locale(language, country);
285            }
286        }
287    
288        public static final Element findBody(Document doc) {
289            for (Node n = doc.getDocumentElement().getLastChild(); n != null; n = n.getPreviousSibling()) {
290                if ("body".equals(n.getLocalName())
291                        && "http://www.w3.org/1999/xhtml".equals(n.getNamespaceURI())) {
292                    return (Element) n;
293                }
294            }
295            return null;
296        }
297    
298        public static final void importChildrenBefore(Node from, Node to, Node ref) {
299            Document doc = to.getOwnerDocument();
300            for (Node curr = from.getLastChild(); curr != null; curr = curr.getPreviousSibling()) {
301                ref = to.insertBefore(doc.importNode(curr, true), ref);
302            }
303        }
304    
305        /**
306         *
307         */
308        public static DomConsumer newDomConsumer() {
309            try {
310    //            return new DomConsumer(
311    //                    DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument().getClass());
312                return new DomConsumer(DomDocument.class);
313            } catch (Exception e) {
314                throw new RuntimeException(e);
315            }
316        }
317    
318        public static void serialize(Document doc, OutputStream out)
319                throws IOException {
320            DomParser parser = new DomParser(doc);
321            TextConsumer writer = new TextConsumer(out);
322            writer.setXhtml(false);
323            writer.setPrettyPrinting(false);
324            NSFilter nsFix = new NSFilter(writer);
325            parser.setContentHandler(nsFix);
326            try {
327                parser.setFeature("http://xml.org/sax/features/namespace-prefixes",
328                        false);
329                parser.parse("");
330            } catch (SAXException e) {
331                if (e.getException() instanceof IOException) {
332                    throw (IOException) new IOException().initCause(e);
333                } else {
334                    // This shouldn't happen unless there is a bug that can be 
335                    // likened to a NullPointerException. We're not parsing XML 
336                    // but traversing a tree that is known to be a tree.
337                    throw new RuntimeException(e);
338                }
339            }
340        }
341        
342        public static Document loadFromUrl(String url) throws SAXException, IOException {
343            ErrorHandler eh = new SilentDraconianErrorHandler();
344            PrudentHttpEntityResolver pher = new PrudentHttpEntityResolver(5000*1024, true, eh);
345            TypedInputSource tis = (TypedInputSource) pher.resolveEntity(null, url);
346            if("text/html".equals(tis.getType())) {
347                if(tis.getEncoding() == null) {
348                    tis.setEncoding("windows-1252");
349                }
350                XMLReader tagSoup = SAXUtils.newTagSoupXMLReader();
351                DomConsumer builder = DOMUtils.newDomConsumer();
352                LangToXmlLang lang = new LangToXmlLang(new ContentHandlerEventConsumer(builder.getContentHandler()));
353                tagSoup.setContentHandler(lang);
354                tagSoup.parse(tis);
355                return builder.getDocument();
356            } else {
357                DocumentBuilder builder = DOMUtils.newNonvalidatingDocumentBuilder();
358                return builder.parse(tis);
359            }
360        }
361        
362        public static void main(String[] args) throws SAXException, IOException {
363            PrudentHttpEntityResolver.setParams(5000, 5000, 100);
364            Document doc = loadFromUrl("http://hsivonen.iki.fi/");
365            System.out.println(language(doc.getElementsByTagName("dt").item(0)));
366        }
367    }