001 /*
002 * Copyright (c) 2003, 2004 Henri Sivonen and Taavi Hupponen
003 * Copyright (c) 2006 Henri Sivonen
004 *
005 * Permission is hereby granted, free of charge, to any person obtaining a
006 * copy of this software and associated documentation files (the "Software"),
007 * to deal in the Software without restriction, including without limitation
008 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
009 * and/or sell copies of the Software, and to permit persons to whom the
010 * Software is furnished to do so, subject to the following conditions:
011 *
012 * The above copyright notice and this permission notice shall be included in
013 * all copies or substantial portions of the Software.
014 *
015 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
016 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
017 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
018 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
019 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
020 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
021 * DEALINGS IN THE SOFTWARE.
022 */
023
024 package fi.iki.hsivonen.xml;
025
026 import java.io.IOException;
027 import java.io.OutputStream;
028 import java.io.OutputStreamWriter;
029 import java.io.UnsupportedEncodingException;
030 import java.io.Writer;
031 import java.util.Arrays;
032
033 import org.xml.sax.Attributes;
034 import org.xml.sax.ContentHandler;
035 import org.xml.sax.Locator;
036 import org.xml.sax.SAXException;
037 import org.xml.sax.XMLReader;
038
039 import fi.iki.hsivonen.io.NcrEscapingWindows1252OutputStreamWriter;
040
041 /**
 * Serializes a sequence of SAX events representing an XHTML 1.0 Strict document
 * to an <code>OutputStream</code> as an HTML document. By default the output
 * is a UTF-8-encoded HTML 4.01 Strict document; the constructors also accept
 * a different doctype mode (none, HTML 4.01 Transitional or HTML5) and the
 * Windows-1252 encoding. The SAX events must represent a valid XHTML 1.0
 * document, except that namespace prefixes don't matter and there may be
 * <code>startElement</code> and <code>endElement</code> calls for elements
 * from other namespaces. The <code>startElement</code> and
 * <code>endElement</code> calls for non-XHTML elements are ignored. No
 * validity checking is performed. Hence, the emitter of the SAX events is
 * responsible for making sure the events represent a document that meets the
 * above requirements. The <code>OutputStream</code> is closed when the end of
 * the document is seen.
053 *
054 * @version $Id: HtmlSerializer.java,v 1.18 2006/10/30 20:03:10 hsivonen Exp $
055 * @author hsivonen
056 * @author taavi
057 */
058 public class HtmlSerializer implements ContentHandler {
059
060 public final static int NO_DOCTYPE = 0;
061
062 public final static int DOCTYPE_HTML401_TRANSITIONAL = 1;
063
064 public final static int DOCTYPE_HTML401_STRICT = 2;
065
066 public final static int DOCTYPE_HTML5 = 3;
067
068 /**
069 * The XHTML namespace URI
070 */
071 private final static String XHTML_NS = "http://www.w3.org/1999/xhtml";
072
073 /**
074 * HTML 4.01 elements which don't have an end tag
075 */
076 private static final String[] emptyElements = { "area", "base", "basefont",
077 "br", "col", "command", "frame", "hr", "img", "input", "isindex",
078 "link", "meta", "param" };
079
080 /**
081 * Minimized "boolean" HTML attributes
082 */
083 private static final String[] booleanAttributes = { "active", "async",
084 "autofocus", "autosubmit", "checked", "compact", "declare",
085 "default", "defer", "disabled", "ismap", "multiple", "nohref",
086 "noresize", "noshade", "nowrap", "readonly", "required", "selected" };
087
088 /**
089 * The writer used for output
090 */
091 protected Writer writer;
092
093 private int doctype;
094
095 private String encoding;
096
097 private boolean emitMeta;
098
099 /**
100 * Creates a new instance of HtmlSerializer in the HTML 4.01 doctype mode
101 * with the UTF-8 encoding and no charset meta.
102 *
103 * @param out
104 * the stream to which the output is written
105 */
106 public HtmlSerializer(OutputStream out) {
107 this(out, DOCTYPE_HTML401_STRICT, false, "UTF-8");
108 }
109
110 public HtmlSerializer(OutputStream out, int doctype, boolean emitMeta) {
111 this(out, doctype, emitMeta, "UTF-8");
112 }
113
114 public HtmlSerializer(OutputStream out, int doctype, boolean emitMeta,
115 String enc) {
116 this.emitMeta = emitMeta;
117 if (doctype < 0 || doctype > 3) {
118 throw new IllegalArgumentException("Bad doctype constant.");
119 }
120 this.doctype = doctype;
121 if ("UTF-8".equalsIgnoreCase(enc)) {
122 try {
123 this.encoding = "UTF-8";
124 this.writer = new OutputStreamWriter(out, "UTF-8");
125 } catch (UnsupportedEncodingException uee) {
126 throw new RuntimeException("UTF-8 not supported", uee);
127 }
128 } else if ("Windows-1252".equalsIgnoreCase(enc)) {
129 this.encoding = "Windows-1252";
130 this.writer = new NcrEscapingWindows1252OutputStreamWriter(out);
131 } else {
132 throw new IllegalArgumentException(
133 "Encoding must be UTF-8 or Windows-1252.");
134 }
135 }
136
137 /**
138 * Writes out characters.
139 *
140 * @param ch
141 * the source array
142 * @param start
143 * the index of the first character to be written
144 * @param length
145 * the number of characters to write
146 *
147 * @throws SAXException
148 * if there are IO problems
149 */
150 public void characters(char[] ch, int start, int length)
151 throws SAXException {
152 try {
153 for (int j = 0; j < length; j++) {
154 char c = ch[start + j];
155 switch (c) {
156 case '<':
157 this.writer.write("<");
158 break;
159 case '>':
160 this.writer.write(">");
161 break;
162 case '&':
163 this.writer.write("&");
164 break;
165 default:
166 this.writer.write(c);
167 }
168 }
169 } catch (IOException ioe) {
170 throw new SAXException(ioe);
171 }
172 }
173
174 /**
175 * Must be called in the end.
176 *
177 * @throws SAXException
178 * if there are IO problems
179 */
180 public void endDocument() throws SAXException {
181 try {
182 this.writer.close();
183 } catch (IOException ioe) {
184 throw new SAXException(ioe);
185 }
186 }
187
188 /**
189 * Writes an end tag if the element is an XHTML element and is not an empty
190 * element in HTML 4.01 Strict.
191 *
192 * @param namespaceURI
193 * the XML namespace
194 * @param localName
195 * the element name in the namespace
196 * @param qName
197 * ignored
198 *
199 * @throws SAXException
200 * if there are IO problems
201 */
202 public void endElement(String namespaceURI, String localName, String qName)
203 throws SAXException {
204 try {
205 if (XHTML_NS.equals(namespaceURI)
206 && Arrays.binarySearch(emptyElements, localName) < 0) {
207 this.writer.write("</");
208 this.writer.write(localName);
209 this.writer.write('>');
210 }
211 } catch (IOException ioe) {
212 throw new SAXException(ioe);
213 }
214 }
215
216 /**
217 * Must be called first.
218 */
219 public void startDocument() throws SAXException {
220 try {
221 switch (doctype) {
222 case NO_DOCTYPE:
223 return;
224 case DOCTYPE_HTML5:
225 writer.write("<!DOCTYPE html>\n");
226 return;
227 case DOCTYPE_HTML401_STRICT:
228 writer.write("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">\n");
229 return;
230 case DOCTYPE_HTML401_TRANSITIONAL:
231 writer.write("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">\n");
232 return;
233 }
234 } catch (IOException ioe) {
235 throw new SAXException(ioe);
236 }
237 }
238
239 /**
240 * Writes a start tag if the element is an XHTML element.
241 *
242 * @param namespaceURI
243 * the XML namespace
244 * @param localName
245 * the element name in the namespace
246 * @param qName
247 * ignored
248 * @param atts
249 * the attribute list
250 *
251 * @throws SAXException
252 * if there are IO problems
253 */
254 public void startElement(String namespaceURI, String localName,
255 String qName, Attributes atts) throws SAXException {
256 try {
257 if (XHTML_NS.equals(namespaceURI)) {
258
259 if ("meta".equals(localName)
260 && ((atts.getIndex("", "http-equiv") != -1) || (atts.getIndex(
261 "", "httpequiv") != -1))) {
262 return;
263 }
264
265 // start and element name
266 this.writer.write('<');
267 this.writer.write(localName);
268
269 // attributes
270 int length = atts.getLength();
271 boolean langPrinted = false;
272 for (int i = 0; i < length; i++) {
273 String ns = atts.getURI(i);
274 String name = null;
275 if ("".equals(ns)) {
276 name = atts.getLocalName(i);
277 } else if ("http://www.w3.org/XML/1998/namespace".equals(ns)
278 && "lang".equals(atts.getLocalName(i))) {
279 name = "lang";
280 }
281 if (name != null && !(langPrinted && "lang".equals(name))) {
282 this.writer.write(' ');
283 this.writer.write(name);
284 if ("lang".equals(name)) {
285 langPrinted = true;
286 }
287 if (Arrays.binarySearch(booleanAttributes, name) < 0) {
288 // write value, escape certain characters
289 this.writer.write("=\"");
290 String value = atts.getValue(i);
291 for (int j = 0; j < value.length(); j++) {
292 char c = value.charAt(j);
293 switch (c) {
294 case '<':
295 this.writer.write("<");
296 break;
297 case '>':
298 this.writer.write(">");
299 break;
300 case '&':
301 this.writer.write("&");
302 break;
303 case '"':
304 this.writer.write(""");
305 break;
306 default:
307 this.writer.write(c);
308 }
309 }
310
311 this.writer.write('"');
312 }
313 }
314 }
315
316 // close
317 this.writer.write('>');
318 if (emitMeta && "head".equals(localName)) {
319 this.writer.write("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=");
320 this.writer.write(encoding);
321 this.writer.write("\">");
322 }
323 }
324 } catch (IOException ioe) {
325 throw new SAXException(ioe);
326 }
327 }
328
329 /**
330 * Used for testing. Pass a file:// URL as the command line argument.
331 */
332 public static void main(String[] args) {
333 try {
334 javax.xml.parsers.SAXParserFactory fac = javax.xml.parsers.SAXParserFactory.newInstance();
335 fac.setNamespaceAware(true);
336 fac.setValidating(false);
337 XMLReader parser = fac.newSAXParser().getXMLReader();
338 parser.setContentHandler(new HtmlSerializer(System.out));
339 parser.parse(args[0]);
340 } catch (Exception e) {
341 throw new RuntimeException(e);
342 }
343 }
344
345 /** Does nothing. */
346 public void endPrefixMapping(String str) throws SAXException {
347 }
348
349 /** Does nothing. */
350 public void ignorableWhitespace(char[] values, int param, int param2)
351 throws SAXException {
352 }
353
354 /** Does nothing. */
355 public void processingInstruction(String str, String str1)
356 throws SAXException {
357 }
358
359 /** Does nothing. */
360 public void setDocumentLocator(Locator locator) {
361 }
362
363 /** Does nothing. */
364 public void skippedEntity(String str) throws SAXException {
365 }
366
367 /** Does nothing. */
368 public void startPrefixMapping(String str, String str1) throws SAXException {
369 }
370 }