001 /* 002 * Copyright (c) 2003-2005 Henri Sivonen 003 * 004 * Permission is hereby granted, free of charge, to any person obtaining a 005 * copy of this software and associated documentation files (the "Software"), 006 * to deal in the Software without restriction, including without limitation 007 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 008 * and/or sell copies of the Software, and to permit persons to whom the 009 * Software is furnished to do so, subject to the following conditions: 010 * 011 * The above copyright notice and this permission notice shall be included in 012 * all copies or substantial portions of the Software. 013 * 014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 020 * DEALINGS IN THE SOFTWARE. 021 */ 022 023 package fi.iki.hsivonen.xml; 024 025 import java.io.IOException; 026 import java.io.OutputStream; 027 import java.util.Locale; 028 import java.util.NoSuchElementException; 029 import java.util.StringTokenizer; 030 031 import javax.xml.parsers.DocumentBuilder; 032 import javax.xml.parsers.DocumentBuilderFactory; 033 import javax.xml.parsers.ParserConfigurationException; 034 035 import org.w3c.dom.Document; 036 import org.w3c.dom.Element; 037 import org.w3c.dom.Node; 038 import org.xml.sax.ErrorHandler; 039 import org.xml.sax.SAXException; 040 import org.xml.sax.XMLReader; 041 042 import fi.iki.hsivonen.gnu.xml.pipeline.DomConsumer; 043 import fi.iki.hsivonen.schemas.dtd.DTDCatalog; 044 import fi.karppinen.xml.ContentHandlerEventConsumer; 045 import gnu.xml.dom.DomDocument; 046 import gnu.xml.pipeline.NSFilter; 047 import gnu.xml.pipeline.TextConsumer; 048 import gnu.xml.util.DomParser; 049 050 /** 051 * A collection of utility methods for working with the DOM. 052 * 053 * @author hsivonen 054 */ 055 public class DOMUtils { 056 057 /** 058 * Finds the first occurrence of an element in the subtree rooted at 059 * <code>node</code> 060 * 061 * @param node the root of the subtree to search 062 * @param namespace the namespace URI of the element being seached 063 * @param localName the local name of the element being seached 064 * 065 * @return the first occurrence of the named element or <code>null</code> 066 * if not found 067 */ 068 public static final Element findElement(Node node, String namespace, 069 String localName) { 070 Node current = node; 071 Node next; 072 for (;;) { 073 switch (current.getNodeType()) { 074 case Node.ELEMENT_NODE: 075 if (localName.equals(current.getLocalName()) 076 && namespace.equals(current.getNamespaceURI())) { 077 return (Element) current; 078 } 079 // fall through 080 case Node.DOCUMENT_FRAGMENT_NODE: 081 case Node.DOCUMENT_NODE: 082 if ((next = current.getFirstChild()) != null) { 083 current = next; 084 continue; 085 } 086 } 087 for (;;) { 088 if ((next = current.getNextSibling()) != null) { 089 current = next; 090 break; 091 } 092 current = current.getParentNode(); 093 if (current == node) 094 return null; 095 } 096 } 097 } 098 099 /** 100 * Finds an element of that has an attribute called <code>id</code> which 101 * has the given value and is not in a namespace. The IDness of the 102 * attribute is based on the attribute name--not on the DTD. 103 * 104 * @param node the root of the subtree to search 105 * @param id the value of the id attribute 106 * @return the first element that has the specified attribute 107 */ 108 public static final Element getElementById(Node node, String id) { 109 Node current = node; 110 Node next; 111 for (;;) { 112 switch (current.getNodeType()) { 113 case Node.ELEMENT_NODE: 114 Element elt = (Element) current; 115 if (id.equals(elt.getAttribute("id"))) { 116 return elt; 117 } 118 // fall through 119 case Node.DOCUMENT_FRAGMENT_NODE: 120 case Node.DOCUMENT_NODE: 121 if ((next = current.getFirstChild()) != null) { 122 current = next; 123 continue; 124 } 125 } 126 for (;;) { 127 if ((next = current.getNextSibling()) != null) { 128 current = next; 129 break; 130 } 131 current = current.getParentNode(); 132 if (current == node) 133 return null; 134 } 135 } 136 } 137 138 /** 139 * Returns the white space-normalized text content of the subtree rooted at 140 * <code>node</code>. 141 * 142 * @param node the subtree 143 * @return the white space-normalized text content 144 */ 145 public static final String textContent(Node node) { 146 StringBuilder buf = new StringBuilder(); 147 boolean lastIsWhitespace = true; 148 Node current = node; 149 Node next; 150 for (;;) { 151 switch (current.getNodeType()) { 152 case Node.TEXT_NODE: 153 case Node.CDATA_SECTION_NODE: 154 String text = current.getNodeValue(); 155 for (int i = 0; i < text.length(); i++) { 156 char c = text.charAt(i); 157 if (c == ' ' || c == '\t' || c == '\n') { 158 if (!lastIsWhitespace) { 159 buf.append(' '); 160 lastIsWhitespace = true; 161 } 162 } else { 163 buf.append(c); 164 lastIsWhitespace = false; 165 } 166 } 167 break; 168 case Node.ELEMENT_NODE: 169 case Node.DOCUMENT_FRAGMENT_NODE: 170 case Node.DOCUMENT_NODE: 171 if ((next = current.getFirstChild()) != null) { 172 current = next; 173 continue; 174 } 175 break; 176 } 177 for (;;) { 178 if ((next = current.getNextSibling()) != null) { 179 current = next; 180 break; 181 } 182 current = current.getParentNode(); 183 if (current == node) { 184 if (buf.charAt(buf.length() - 1) == ' ') { 185 buf.deleteCharAt(buf.length() - 1); 186 } 187 return buf.toString(); 188 } 189 } 190 } 191 } 192 193 /** 194 * Instantiates a <code>DocumentBuilder</code> which is non-validating, 195 * is namespaces aware, expands entities using the local 196 * <code>DTDCatalog</code>, doesn't make arbitrary network connections, 197 * coalesces CDATA sections and ignores comments. 198 * 199 * @return a configured <code>DocumentBuilder</code> 200 */ 201 public static final DocumentBuilder newNonvalidatingDocumentBuilder() { 202 DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); 203 factory.setNamespaceAware(true); 204 factory.setExpandEntityReferences(true); 205 factory.setCoalescing(true); 206 factory.setIgnoringComments(true); 207 factory.setIgnoringElementContentWhitespace(false); 208 factory.setValidating(false); 209 try { 210 DocumentBuilder builder = factory.newDocumentBuilder(); 211 builder.setEntityResolver(DTDCatalog.getInstance()); 212 builder.setErrorHandler(new SilentDraconianErrorHandler()); 213 return builder; 214 } catch (ParserConfigurationException e) { 215 throw new RuntimeException(e); 216 } 217 } 218 219 /** 220 * Checks whether a node is a text or CDATA node consisting of white 221 * space only. 222 * 223 * @param node the node to examine 224 * 225 * @return <code>true</code> if it is a white space node and 226 * <code>false</code> otherwise 227 */ 228 public static final boolean isWhiteSpace(Node node) { 229 String value = node.getNodeValue(); 230 for (int i = 0; i < value.length(); i++) { 231 char c = value.charAt(i); 232 if (!(c == ' ' || c == '\t' || c == '\n')) { 233 return false; 234 } 235 } 236 return true; 237 } 238 239 public static final String language(Node node) { 240 for (;;) { 241 if (node == null) { 242 return ""; 243 } 244 switch (node.getNodeType()) { 245 case Node.ELEMENT_NODE: 246 Element elt = (Element)node; 247 if (elt.hasAttributeNS( 248 "http://www.w3.org/XML/1998/namespace", "lang")) { 249 return elt.getAttributeNS( 250 "http://www.w3.org/XML/1998/namespace", "lang"); 251 } else { 252 node = node.getParentNode(); 253 } 254 break; 255 case Node.CDATA_SECTION_NODE: 256 case Node.TEXT_NODE: 257 node = node.getParentNode(); 258 break; 259 default: 260 return ""; 261 } 262 } 263 } 264 265 public static final Locale languageAsLocale(Node node) { 266 String lang = language(node); 267 if ("".equals(lang)) { 268 return null; 269 } 270 StringTokenizer tok = new StringTokenizer(lang, "-"); 271 String language = null; 272 String country = null; 273 try { 274 language = tok.nextToken(); 275 country = tok.nextToken(); 276 } catch (NoSuchElementException e) { 277 } 278 if (language == null) { 279 return null; 280 } 281 if (country == null) { 282 return new Locale(language); 283 } else { 284 return new Locale(language, country); 285 } 286 } 287 288 public static final Element findBody(Document doc) { 289 for (Node n = doc.getDocumentElement().getLastChild(); n != null; n = n.getPreviousSibling()) { 290 if ("body".equals(n.getLocalName()) 291 && "http://www.w3.org/1999/xhtml".equals(n.getNamespaceURI())) { 292 return (Element) n; 293 } 294 } 295 return null; 296 } 297 298 public static final void importChildrenBefore(Node from, Node to, Node ref) { 299 Document doc = to.getOwnerDocument(); 300 for (Node curr = from.getLastChild(); curr != null; curr = curr.getPreviousSibling()) { 301 ref = to.insertBefore(doc.importNode(curr, true), ref); 302 } 303 } 304 305 /** 306 * 307 */ 308 public static DomConsumer newDomConsumer() { 309 try { 310 // return new DomConsumer( 311 // DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument().getClass()); 312 return new DomConsumer(DomDocument.class); 313 } catch (Exception e) { 314 throw new RuntimeException(e); 315 } 316 } 317 318 public static void serialize(Document doc, OutputStream out) 319 throws IOException { 320 DomParser parser = new DomParser(doc); 321 TextConsumer writer = new TextConsumer(out); 322 writer.setXhtml(false); 323 writer.setPrettyPrinting(false); 324 NSFilter nsFix = new NSFilter(writer); 325 parser.setContentHandler(nsFix); 326 try { 327 parser.setFeature("http://xml.org/sax/features/namespace-prefixes", 328 false); 329 parser.parse(""); 330 } catch (SAXException e) { 331 if (e.getException() instanceof IOException) { 332 throw (IOException) new IOException().initCause(e); 333 } else { 334 // This shouldn't happen unless there is a bug that can be 335 // likened to a NullPointerException. We're not parsing XML 336 // but traversing a tree that is known to be a tree. 337 throw new RuntimeException(e); 338 } 339 } 340 } 341 342 public static Document loadFromUrl(String url) throws SAXException, IOException { 343 ErrorHandler eh = new SilentDraconianErrorHandler(); 344 PrudentHttpEntityResolver pher = new PrudentHttpEntityResolver(5000*1024, true, eh); 345 TypedInputSource tis = (TypedInputSource) pher.resolveEntity(null, url); 346 if("text/html".equals(tis.getType())) { 347 if(tis.getEncoding() == null) { 348 tis.setEncoding("windows-1252"); 349 } 350 XMLReader tagSoup = SAXUtils.newTagSoupXMLReader(); 351 DomConsumer builder = DOMUtils.newDomConsumer(); 352 LangToXmlLang lang = new LangToXmlLang(new ContentHandlerEventConsumer(builder.getContentHandler())); 353 tagSoup.setContentHandler(lang); 354 tagSoup.parse(tis); 355 return builder.getDocument(); 356 } else { 357 DocumentBuilder builder = DOMUtils.newNonvalidatingDocumentBuilder(); 358 return builder.parse(tis); 359 } 360 } 361 362 public static void main(String[] args) throws SAXException, IOException { 363 PrudentHttpEntityResolver.setParams(5000, 5000, 100); 364 Document doc = loadFromUrl("http://hsivonen.iki.fi/"); 365 System.out.println(language(doc.getElementsByTagName("dt").item(0))); 366 } 367 }