001 /* 002 * Copyright (c) 2003, 2004 Henri Sivonen and Taavi Hupponen 003 * Copyright (c) 2006 Henri Sivonen 004 * 005 * Permission is hereby granted, free of charge, to any person obtaining a 006 * copy of this software and associated documentation files (the "Software"), 007 * to deal in the Software without restriction, including without limitation 008 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 009 * and/or sell copies of the Software, and to permit persons to whom the 010 * Software is furnished to do so, subject to the following conditions: 011 * 012 * The above copyright notice and this permission notice shall be included in 013 * all copies or substantial portions of the Software. 014 * 015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 021 * DEALINGS IN THE SOFTWARE. 022 */ 023 024 package fi.iki.hsivonen.xml; 025 026 import java.io.IOException; 027 import java.io.OutputStream; 028 import java.io.OutputStreamWriter; 029 import java.io.UnsupportedEncodingException; 030 import java.io.Writer; 031 import java.util.Arrays; 032 033 import org.xml.sax.Attributes; 034 import org.xml.sax.ContentHandler; 035 import org.xml.sax.Locator; 036 import org.xml.sax.SAXException; 037 import org.xml.sax.XMLReader; 038 039 import fi.iki.hsivonen.io.NcrEscapingWindows1252OutputStreamWriter; 040 041 /** 042 * Serializes a sequence of SAX events representing an XHTML 1.0 Strict document 043 * to an <code>OutputStream</code> as a UTF-8-encoded HTML 4.01 Strict 044 * document. The SAX events must represent a valid XHTML 1.0 document, except 045 * the namespace prefixes don't matter and there may be 046 * <code>startElement</code> and <code>endElement</code> calls for elements 047 * from other namespaces. The <code>startElement</code> and 048 * <code>endElement</code> calls for non-XHTML elements are ignored. No 049 * validity checking is performed. Hence, the emitter of the SAX events is 050 * responsible for making sure the events represent a document that meets the 051 * above requirements. The <code>OutputStream</code> is closed when the end of 052 * the document is seen. 053 * 054 * @version $Id: HtmlSerializer.java,v 1.18 2006/10/30 20:03:10 hsivonen Exp $ 055 * @author hsivonen 056 * @author taavi 057 */ 058 public class HtmlSerializer implements ContentHandler { 059 060 public final static int NO_DOCTYPE = 0; 061 062 public final static int DOCTYPE_HTML401_TRANSITIONAL = 1; 063 064 public final static int DOCTYPE_HTML401_STRICT = 2; 065 066 public final static int DOCTYPE_HTML5 = 3; 067 068 /** 069 * The XHTML namespace URI 070 */ 071 private final static String XHTML_NS = "http://www.w3.org/1999/xhtml"; 072 073 /** 074 * HTML 4.01 elements which don't have an end tag 075 */ 076 private static final String[] emptyElements = { "area", "base", "basefont", 077 "br", "col", "command", "frame", "hr", "img", "input", "isindex", 078 "link", "meta", "param" }; 079 080 /** 081 * Minimized "boolean" HTML attributes 082 */ 083 private static final String[] booleanAttributes = { "active", "async", 084 "autofocus", "autosubmit", "checked", "compact", "declare", 085 "default", "defer", "disabled", "ismap", "multiple", "nohref", 086 "noresize", "noshade", "nowrap", "readonly", "required", "selected" }; 087 088 /** 089 * The writer used for output 090 */ 091 protected Writer writer; 092 093 private int doctype; 094 095 private String encoding; 096 097 private boolean emitMeta; 098 099 /** 100 * Creates a new instance of HtmlSerializer in the HTML 4.01 doctype mode 101 * with the UTF-8 encoding and no charset meta. 102 * 103 * @param out 104 * the stream to which the output is written 105 */ 106 public HtmlSerializer(OutputStream out) { 107 this(out, DOCTYPE_HTML401_STRICT, false, "UTF-8"); 108 } 109 110 public HtmlSerializer(OutputStream out, int doctype, boolean emitMeta) { 111 this(out, doctype, emitMeta, "UTF-8"); 112 } 113 114 public HtmlSerializer(OutputStream out, int doctype, boolean emitMeta, 115 String enc) { 116 this.emitMeta = emitMeta; 117 if (doctype < 0 || doctype > 3) { 118 throw new IllegalArgumentException("Bad doctype constant."); 119 } 120 this.doctype = doctype; 121 if ("UTF-8".equalsIgnoreCase(enc)) { 122 try { 123 this.encoding = "UTF-8"; 124 this.writer = new OutputStreamWriter(out, "UTF-8"); 125 } catch (UnsupportedEncodingException uee) { 126 throw new RuntimeException("UTF-8 not supported", uee); 127 } 128 } else if ("Windows-1252".equalsIgnoreCase(enc)) { 129 this.encoding = "Windows-1252"; 130 this.writer = new NcrEscapingWindows1252OutputStreamWriter(out); 131 } else { 132 throw new IllegalArgumentException( 133 "Encoding must be UTF-8 or Windows-1252."); 134 } 135 } 136 137 /** 138 * Writes out characters. 139 * 140 * @param ch 141 * the source array 142 * @param start 143 * the index of the first character to be written 144 * @param length 145 * the number of characters to write 146 * 147 * @throws SAXException 148 * if there are IO problems 149 */ 150 public void characters(char[] ch, int start, int length) 151 throws SAXException { 152 try { 153 for (int j = 0; j < length; j++) { 154 char c = ch[start + j]; 155 switch (c) { 156 case '<': 157 this.writer.write("<"); 158 break; 159 case '>': 160 this.writer.write(">"); 161 break; 162 case '&': 163 this.writer.write("&"); 164 break; 165 default: 166 this.writer.write(c); 167 } 168 } 169 } catch (IOException ioe) { 170 throw new SAXException(ioe); 171 } 172 } 173 174 /** 175 * Must be called in the end. 176 * 177 * @throws SAXException 178 * if there are IO problems 179 */ 180 public void endDocument() throws SAXException { 181 try { 182 this.writer.close(); 183 } catch (IOException ioe) { 184 throw new SAXException(ioe); 185 } 186 } 187 188 /** 189 * Writes an end tag if the element is an XHTML element and is not an empty 190 * element in HTML 4.01 Strict. 191 * 192 * @param namespaceURI 193 * the XML namespace 194 * @param localName 195 * the element name in the namespace 196 * @param qName 197 * ignored 198 * 199 * @throws SAXException 200 * if there are IO problems 201 */ 202 public void endElement(String namespaceURI, String localName, String qName) 203 throws SAXException { 204 try { 205 if (XHTML_NS.equals(namespaceURI) 206 && Arrays.binarySearch(emptyElements, localName) < 0) { 207 this.writer.write("</"); 208 this.writer.write(localName); 209 this.writer.write('>'); 210 } 211 } catch (IOException ioe) { 212 throw new SAXException(ioe); 213 } 214 } 215 216 /** 217 * Must be called first. 218 */ 219 public void startDocument() throws SAXException { 220 try { 221 switch (doctype) { 222 case NO_DOCTYPE: 223 return; 224 case DOCTYPE_HTML5: 225 writer.write("<!DOCTYPE html>\n"); 226 return; 227 case DOCTYPE_HTML401_STRICT: 228 writer.write("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">\n"); 229 return; 230 case DOCTYPE_HTML401_TRANSITIONAL: 231 writer.write("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">\n"); 232 return; 233 } 234 } catch (IOException ioe) { 235 throw new SAXException(ioe); 236 } 237 } 238 239 /** 240 * Writes a start tag if the element is an XHTML element. 241 * 242 * @param namespaceURI 243 * the XML namespace 244 * @param localName 245 * the element name in the namespace 246 * @param qName 247 * ignored 248 * @param atts 249 * the attribute list 250 * 251 * @throws SAXException 252 * if there are IO problems 253 */ 254 public void startElement(String namespaceURI, String localName, 255 String qName, Attributes atts) throws SAXException { 256 try { 257 if (XHTML_NS.equals(namespaceURI)) { 258 259 if ("meta".equals(localName) 260 && ((atts.getIndex("", "http-equiv") != -1) || (atts.getIndex( 261 "", "httpequiv") != -1))) { 262 return; 263 } 264 265 // start and element name 266 this.writer.write('<'); 267 this.writer.write(localName); 268 269 // attributes 270 int length = atts.getLength(); 271 boolean langPrinted = false; 272 for (int i = 0; i < length; i++) { 273 String ns = atts.getURI(i); 274 String name = null; 275 if ("".equals(ns)) { 276 name = atts.getLocalName(i); 277 } else if ("http://www.w3.org/XML/1998/namespace".equals(ns) 278 && "lang".equals(atts.getLocalName(i))) { 279 name = "lang"; 280 } 281 if (name != null && !(langPrinted && "lang".equals(name))) { 282 this.writer.write(' '); 283 this.writer.write(name); 284 if ("lang".equals(name)) { 285 langPrinted = true; 286 } 287 if (Arrays.binarySearch(booleanAttributes, name) < 0) { 288 // write value, escape certain characters 289 this.writer.write("=\""); 290 String value = atts.getValue(i); 291 for (int j = 0; j < value.length(); j++) { 292 char c = value.charAt(j); 293 switch (c) { 294 case '<': 295 this.writer.write("<"); 296 break; 297 case '>': 298 this.writer.write(">"); 299 break; 300 case '&': 301 this.writer.write("&"); 302 break; 303 case '"': 304 this.writer.write("""); 305 break; 306 default: 307 this.writer.write(c); 308 } 309 } 310 311 this.writer.write('"'); 312 } 313 } 314 } 315 316 // close 317 this.writer.write('>'); 318 if (emitMeta && "head".equals(localName)) { 319 this.writer.write("<meta http-equiv=\"Content-Type\" content=\"text/html; charset="); 320 this.writer.write(encoding); 321 this.writer.write("\">"); 322 } 323 } 324 } catch (IOException ioe) { 325 throw new SAXException(ioe); 326 } 327 } 328 329 /** 330 * Used for testing. Pass a file:// URL as the command line argument. 331 */ 332 public static void main(String[] args) { 333 try { 334 javax.xml.parsers.SAXParserFactory fac = javax.xml.parsers.SAXParserFactory.newInstance(); 335 fac.setNamespaceAware(true); 336 fac.setValidating(false); 337 XMLReader parser = fac.newSAXParser().getXMLReader(); 338 parser.setContentHandler(new HtmlSerializer(System.out)); 339 parser.parse(args[0]); 340 } catch (Exception e) { 341 throw new RuntimeException(e); 342 } 343 } 344 345 /** Does nothing. */ 346 public void endPrefixMapping(String str) throws SAXException { 347 } 348 349 /** Does nothing. */ 350 public void ignorableWhitespace(char[] values, int param, int param2) 351 throws SAXException { 352 } 353 354 /** Does nothing. */ 355 public void processingInstruction(String str, String str1) 356 throws SAXException { 357 } 358 359 /** Does nothing. */ 360 public void setDocumentLocator(Locator locator) { 361 } 362 363 /** Does nothing. */ 364 public void skippedEntity(String str) throws SAXException { 365 } 366 367 /** Does nothing. */ 368 public void startPrefixMapping(String str, String str1) throws SAXException { 369 } 370 }