001 /*
002 * Copyright (c) 2003-2005 Henri Sivonen
003 *
004 * Permission is hereby granted, free of charge, to any person obtaining a
005 * copy of this software and associated documentation files (the "Software"),
006 * to deal in the Software without restriction, including without limitation
007 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
008 * and/or sell copies of the Software, and to permit persons to whom the
009 * Software is furnished to do so, subject to the following conditions:
010 *
011 * The above copyright notice and this permission notice shall be included in
012 * all copies or substantial portions of the Software.
013 *
014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
020 * DEALINGS IN THE SOFTWARE.
021 */
022
023 package fi.iki.hsivonen.xml;
024
025 import java.io.IOException;
026 import java.io.OutputStream;
027 import java.util.Locale;
028 import java.util.NoSuchElementException;
029 import java.util.StringTokenizer;
030
031 import javax.xml.parsers.DocumentBuilder;
032 import javax.xml.parsers.DocumentBuilderFactory;
033 import javax.xml.parsers.ParserConfigurationException;
034
035 import org.w3c.dom.Document;
036 import org.w3c.dom.Element;
037 import org.w3c.dom.Node;
038 import org.xml.sax.ErrorHandler;
039 import org.xml.sax.SAXException;
040 import org.xml.sax.XMLReader;
041
042 import fi.iki.hsivonen.gnu.xml.pipeline.DomConsumer;
043 import fi.iki.hsivonen.schemas.dtd.DTDCatalog;
044 import fi.karppinen.xml.ContentHandlerEventConsumer;
045 import gnu.xml.dom.DomDocument;
046 import gnu.xml.pipeline.NSFilter;
047 import gnu.xml.pipeline.TextConsumer;
048 import gnu.xml.util.DomParser;
049
050 /**
051 * A collection of utility methods for working with the DOM.
052 *
053 * @author hsivonen
054 */
055 public class DOMUtils {
056
057 /**
058 * Finds the first occurrence of an element in the subtree rooted at
059 * <code>node</code>
060 *
061 * @param node the root of the subtree to search
062 * @param namespace the namespace URI of the element being seached
063 * @param localName the local name of the element being seached
064 *
065 * @return the first occurrence of the named element or <code>null</code>
066 * if not found
067 */
068 public static final Element findElement(Node node, String namespace,
069 String localName) {
070 Node current = node;
071 Node next;
072 for (;;) {
073 switch (current.getNodeType()) {
074 case Node.ELEMENT_NODE:
075 if (localName.equals(current.getLocalName())
076 && namespace.equals(current.getNamespaceURI())) {
077 return (Element) current;
078 }
079 // fall through
080 case Node.DOCUMENT_FRAGMENT_NODE:
081 case Node.DOCUMENT_NODE:
082 if ((next = current.getFirstChild()) != null) {
083 current = next;
084 continue;
085 }
086 }
087 for (;;) {
088 if ((next = current.getNextSibling()) != null) {
089 current = next;
090 break;
091 }
092 current = current.getParentNode();
093 if (current == node)
094 return null;
095 }
096 }
097 }
098
099 /**
100 * Finds an element of that has an attribute called <code>id</code> which
101 * has the given value and is not in a namespace. The IDness of the
102 * attribute is based on the attribute name--not on the DTD.
103 *
104 * @param node the root of the subtree to search
105 * @param id the value of the id attribute
106 * @return the first element that has the specified attribute
107 */
108 public static final Element getElementById(Node node, String id) {
109 Node current = node;
110 Node next;
111 for (;;) {
112 switch (current.getNodeType()) {
113 case Node.ELEMENT_NODE:
114 Element elt = (Element) current;
115 if (id.equals(elt.getAttribute("id"))) {
116 return elt;
117 }
118 // fall through
119 case Node.DOCUMENT_FRAGMENT_NODE:
120 case Node.DOCUMENT_NODE:
121 if ((next = current.getFirstChild()) != null) {
122 current = next;
123 continue;
124 }
125 }
126 for (;;) {
127 if ((next = current.getNextSibling()) != null) {
128 current = next;
129 break;
130 }
131 current = current.getParentNode();
132 if (current == node)
133 return null;
134 }
135 }
136 }
137
138 /**
139 * Returns the white space-normalized text content of the subtree rooted at
140 * <code>node</code>.
141 *
142 * @param node the subtree
143 * @return the white space-normalized text content
144 */
145 public static final String textContent(Node node) {
146 StringBuilder buf = new StringBuilder();
147 boolean lastIsWhitespace = true;
148 Node current = node;
149 Node next;
150 for (;;) {
151 switch (current.getNodeType()) {
152 case Node.TEXT_NODE:
153 case Node.CDATA_SECTION_NODE:
154 String text = current.getNodeValue();
155 for (int i = 0; i < text.length(); i++) {
156 char c = text.charAt(i);
157 if (c == ' ' || c == '\t' || c == '\n') {
158 if (!lastIsWhitespace) {
159 buf.append(' ');
160 lastIsWhitespace = true;
161 }
162 } else {
163 buf.append(c);
164 lastIsWhitespace = false;
165 }
166 }
167 break;
168 case Node.ELEMENT_NODE:
169 case Node.DOCUMENT_FRAGMENT_NODE:
170 case Node.DOCUMENT_NODE:
171 if ((next = current.getFirstChild()) != null) {
172 current = next;
173 continue;
174 }
175 break;
176 }
177 for (;;) {
178 if ((next = current.getNextSibling()) != null) {
179 current = next;
180 break;
181 }
182 current = current.getParentNode();
183 if (current == node) {
184 if (buf.charAt(buf.length() - 1) == ' ') {
185 buf.deleteCharAt(buf.length() - 1);
186 }
187 return buf.toString();
188 }
189 }
190 }
191 }
192
193 /**
194 * Instantiates a <code>DocumentBuilder</code> which is non-validating,
195 * is namespaces aware, expands entities using the local
196 * <code>DTDCatalog</code>, doesn't make arbitrary network connections,
197 * coalesces CDATA sections and ignores comments.
198 *
199 * @return a configured <code>DocumentBuilder</code>
200 */
201 public static final DocumentBuilder newNonvalidatingDocumentBuilder() {
202 DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
203 factory.setNamespaceAware(true);
204 factory.setExpandEntityReferences(true);
205 factory.setCoalescing(true);
206 factory.setIgnoringComments(true);
207 factory.setIgnoringElementContentWhitespace(false);
208 factory.setValidating(false);
209 try {
210 DocumentBuilder builder = factory.newDocumentBuilder();
211 builder.setEntityResolver(DTDCatalog.getInstance());
212 builder.setErrorHandler(new SilentDraconianErrorHandler());
213 return builder;
214 } catch (ParserConfigurationException e) {
215 throw new RuntimeException(e);
216 }
217 }
218
219 /**
220 * Checks whether a node is a text or CDATA node consisting of white
221 * space only.
222 *
223 * @param node the node to examine
224 *
225 * @return <code>true</code> if it is a white space node and
226 * <code>false</code> otherwise
227 */
228 public static final boolean isWhiteSpace(Node node) {
229 String value = node.getNodeValue();
230 for (int i = 0; i < value.length(); i++) {
231 char c = value.charAt(i);
232 if (!(c == ' ' || c == '\t' || c == '\n')) {
233 return false;
234 }
235 }
236 return true;
237 }
238
239 public static final String language(Node node) {
240 for (;;) {
241 if (node == null) {
242 return "";
243 }
244 switch (node.getNodeType()) {
245 case Node.ELEMENT_NODE:
246 Element elt = (Element)node;
247 if (elt.hasAttributeNS(
248 "http://www.w3.org/XML/1998/namespace", "lang")) {
249 return elt.getAttributeNS(
250 "http://www.w3.org/XML/1998/namespace", "lang");
251 } else {
252 node = node.getParentNode();
253 }
254 break;
255 case Node.CDATA_SECTION_NODE:
256 case Node.TEXT_NODE:
257 node = node.getParentNode();
258 break;
259 default:
260 return "";
261 }
262 }
263 }
264
265 public static final Locale languageAsLocale(Node node) {
266 String lang = language(node);
267 if ("".equals(lang)) {
268 return null;
269 }
270 StringTokenizer tok = new StringTokenizer(lang, "-");
271 String language = null;
272 String country = null;
273 try {
274 language = tok.nextToken();
275 country = tok.nextToken();
276 } catch (NoSuchElementException e) {
277 }
278 if (language == null) {
279 return null;
280 }
281 if (country == null) {
282 return new Locale(language);
283 } else {
284 return new Locale(language, country);
285 }
286 }
287
288 public static final Element findBody(Document doc) {
289 for (Node n = doc.getDocumentElement().getLastChild(); n != null; n = n.getPreviousSibling()) {
290 if ("body".equals(n.getLocalName())
291 && "http://www.w3.org/1999/xhtml".equals(n.getNamespaceURI())) {
292 return (Element) n;
293 }
294 }
295 return null;
296 }
297
298 public static final void importChildrenBefore(Node from, Node to, Node ref) {
299 Document doc = to.getOwnerDocument();
300 for (Node curr = from.getLastChild(); curr != null; curr = curr.getPreviousSibling()) {
301 ref = to.insertBefore(doc.importNode(curr, true), ref);
302 }
303 }
304
305 /**
306 *
307 */
308 public static DomConsumer newDomConsumer() {
309 try {
310 // return new DomConsumer(
311 // DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument().getClass());
312 return new DomConsumer(DomDocument.class);
313 } catch (Exception e) {
314 throw new RuntimeException(e);
315 }
316 }
317
318 public static void serialize(Document doc, OutputStream out)
319 throws IOException {
320 DomParser parser = new DomParser(doc);
321 TextConsumer writer = new TextConsumer(out);
322 writer.setXhtml(false);
323 writer.setPrettyPrinting(false);
324 NSFilter nsFix = new NSFilter(writer);
325 parser.setContentHandler(nsFix);
326 try {
327 parser.setFeature("http://xml.org/sax/features/namespace-prefixes",
328 false);
329 parser.parse("");
330 } catch (SAXException e) {
331 if (e.getException() instanceof IOException) {
332 throw (IOException) new IOException().initCause(e);
333 } else {
334 // This shouldn't happen unless there is a bug that can be
335 // likened to a NullPointerException. We're not parsing XML
336 // but traversing a tree that is known to be a tree.
337 throw new RuntimeException(e);
338 }
339 }
340 }
341
342 public static Document loadFromUrl(String url) throws SAXException, IOException {
343 ErrorHandler eh = new SilentDraconianErrorHandler();
344 PrudentHttpEntityResolver pher = new PrudentHttpEntityResolver(5000*1024, true, eh);
345 TypedInputSource tis = (TypedInputSource) pher.resolveEntity(null, url);
346 if("text/html".equals(tis.getType())) {
347 if(tis.getEncoding() == null) {
348 tis.setEncoding("windows-1252");
349 }
350 XMLReader tagSoup = SAXUtils.newTagSoupXMLReader();
351 DomConsumer builder = DOMUtils.newDomConsumer();
352 LangToXmlLang lang = new LangToXmlLang(new ContentHandlerEventConsumer(builder.getContentHandler()));
353 tagSoup.setContentHandler(lang);
354 tagSoup.parse(tis);
355 return builder.getDocument();
356 } else {
357 DocumentBuilder builder = DOMUtils.newNonvalidatingDocumentBuilder();
358 return builder.parse(tis);
359 }
360 }
361
362 public static void main(String[] args) throws SAXException, IOException {
363 PrudentHttpEntityResolver.setParams(5000, 5000, 100);
364 Document doc = loadFromUrl("http://hsivonen.iki.fi/");
365 System.out.println(language(doc.getElementsByTagName("dt").item(0)));
366 }
367 }