001    /*
002     * Copyright (c) 2003, 2004 Henri Sivonen and Taavi Hupponen
003     * Copyright (c) 2006 Henri Sivonen
004     *
005     * Permission is hereby granted, free of charge, to any person obtaining a 
006     * copy of this software and associated documentation files (the "Software"), 
007     * to deal in the Software without restriction, including without limitation 
008     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
009     * and/or sell copies of the Software, and to permit persons to whom the 
010     * Software is furnished to do so, subject to the following conditions:
011     *
012     * The above copyright notice and this permission notice shall be included in 
013     * all copies or substantial portions of the Software.
014     *
015     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
016     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
017     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
018     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
019     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
020     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
021     * DEALINGS IN THE SOFTWARE.
022     */
023    
024    package fi.iki.hsivonen.xml;
025    
026    import java.io.IOException;
027    import java.io.OutputStream;
028    import java.io.OutputStreamWriter;
029    import java.io.UnsupportedEncodingException;
030    import java.io.Writer;
031    import java.util.Arrays;
032    
033    import org.xml.sax.Attributes;
034    import org.xml.sax.ContentHandler;
035    import org.xml.sax.Locator;
036    import org.xml.sax.SAXException;
037    import org.xml.sax.XMLReader;
038    
039    import fi.iki.hsivonen.io.NcrEscapingWindows1252OutputStreamWriter;
040    
041    /**
042     * Serializes a sequence of SAX events representing an XHTML 1.0 Strict document
043     * to an <code>OutputStream</code> as a UTF-8-encoded HTML 4.01 Strict
044     * document. The SAX events must represent a valid XHTML 1.0 document, except
045     * the namespace prefixes don't matter and there may be
046     * <code>startElement</code> and <code>endElement</code> calls for elements
047     * from other namespaces. The <code>startElement</code> and
048     * <code>endElement</code> calls for non-XHTML elements are ignored. No
049     * validity checking is performed. Hence, the emitter of the SAX events is
050     * responsible for making sure the events represent a document that meets the
051     * above requirements. The <code>OutputStream</code> is closed when the end of
052     * the document is seen.
053     * 
054     * @version $Id: HtmlSerializer.java,v 1.18 2006/10/30 20:03:10 hsivonen Exp $
055     * @author hsivonen
056     * @author taavi
057     */
058    public class HtmlSerializer implements ContentHandler {
059    
060        public final static int NO_DOCTYPE = 0;
061    
062        public final static int DOCTYPE_HTML401_TRANSITIONAL = 1;
063    
064        public final static int DOCTYPE_HTML401_STRICT = 2;
065    
066        public final static int DOCTYPE_HTML5 = 3;
067    
068        /**
069         * The XHTML namespace URI
070         */
071        private final static String XHTML_NS = "http://www.w3.org/1999/xhtml";
072    
073        /**
074         * HTML 4.01 elements which don't have an end tag
075         */
076        private static final String[] emptyElements = { "area", "base", "basefont",
077                "br", "col", "command", "frame", "hr", "img", "input", "isindex",
078                "link", "meta", "param" };
079    
080        /**
081         * Minimized "boolean" HTML attributes
082         */
083        private static final String[] booleanAttributes = { "active", "async",
084                "autofocus", "autosubmit", "checked", "compact", "declare",
085                "default", "defer", "disabled", "ismap", "multiple", "nohref",
086                "noresize", "noshade", "nowrap", "readonly", "required", "selected" };
087    
088        /**
089         * The writer used for output
090         */
091        protected Writer writer;
092    
093        private int doctype;
094    
095        private String encoding;
096    
097        private boolean emitMeta;
098    
099        /**
100         * Creates a new instance of HtmlSerializer in the HTML 4.01 doctype mode
101         * with the UTF-8 encoding and no charset meta.
102         * 
103         * @param out
104         *            the stream to which the output is written
105         */
106        public HtmlSerializer(OutputStream out) {
107            this(out, DOCTYPE_HTML401_STRICT, false, "UTF-8");
108        }
109    
110        public HtmlSerializer(OutputStream out, int doctype, boolean emitMeta) {
111            this(out, doctype, emitMeta, "UTF-8");
112        }
113    
114        public HtmlSerializer(OutputStream out, int doctype, boolean emitMeta,
115                String enc) {
116            this.emitMeta = emitMeta;
117            if (doctype < 0 || doctype > 3) {
118                throw new IllegalArgumentException("Bad doctype constant.");
119            }
120            this.doctype = doctype;
121            if ("UTF-8".equalsIgnoreCase(enc)) {
122                try {
123                    this.encoding = "UTF-8";
124                    this.writer = new OutputStreamWriter(out, "UTF-8");
125                } catch (UnsupportedEncodingException uee) {
126                    throw new RuntimeException("UTF-8 not supported", uee);
127                }
128            } else if ("Windows-1252".equalsIgnoreCase(enc)) {
129                this.encoding = "Windows-1252";
130                this.writer = new NcrEscapingWindows1252OutputStreamWriter(out);
131            } else {
132                throw new IllegalArgumentException(
133                        "Encoding must be UTF-8 or Windows-1252.");
134            }
135        }
136    
137        /**
138         * Writes out characters.
139         * 
140         * @param ch
141         *            the source array
142         * @param start
143         *            the index of the first character to be written
144         * @param length
145         *            the number of characters to write
146         * 
147         * @throws SAXException
148         *             if there are IO problems
149         */
150        public void characters(char[] ch, int start, int length)
151                throws SAXException {
152            try {
153                for (int j = 0; j < length; j++) {
154                    char c = ch[start + j];
155                    switch (c) {
156                        case '<':
157                            this.writer.write("&lt;");
158                            break;
159                        case '>':
160                            this.writer.write("&gt;");
161                            break;
162                        case '&':
163                            this.writer.write("&amp;");
164                            break;
165                        default:
166                            this.writer.write(c);
167                    }
168                }
169            } catch (IOException ioe) {
170                throw new SAXException(ioe);
171            }
172        }
173    
174        /**
175         * Must be called in the end.
176         * 
177         * @throws SAXException
178         *             if there are IO problems
179         */
180        public void endDocument() throws SAXException {
181            try {
182                this.writer.close();
183            } catch (IOException ioe) {
184                throw new SAXException(ioe);
185            }
186        }
187    
188        /**
189         * Writes an end tag if the element is an XHTML element and is not an empty
190         * element in HTML 4.01 Strict.
191         * 
192         * @param namespaceURI
193         *            the XML namespace
194         * @param localName
195         *            the element name in the namespace
196         * @param qName
197         *            ignored
198         * 
199         * @throws SAXException
200         *             if there are IO problems
201         */
202        public void endElement(String namespaceURI, String localName, String qName)
203                throws SAXException {
204            try {
205                if (XHTML_NS.equals(namespaceURI)
206                        && Arrays.binarySearch(emptyElements, localName) < 0) {
207                    this.writer.write("</");
208                    this.writer.write(localName);
209                    this.writer.write('>');
210                }
211            } catch (IOException ioe) {
212                throw new SAXException(ioe);
213            }
214        }
215    
216        /**
217         * Must be called first.
218         */
219        public void startDocument() throws SAXException {
220            try {
221                switch (doctype) {
222                    case NO_DOCTYPE:
223                        return;
224                    case DOCTYPE_HTML5:
225                        writer.write("<!DOCTYPE html>\n");
226                        return;
227                    case DOCTYPE_HTML401_STRICT:
228                        writer.write("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01//EN\" \"http://www.w3.org/TR/html4/strict.dtd\">\n");
229                        return;
230                    case DOCTYPE_HTML401_TRANSITIONAL:
231                        writer.write("<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\">\n");
232                        return;
233                }
234            } catch (IOException ioe) {
235                throw new SAXException(ioe);
236            }
237        }
238    
239        /**
240         * Writes a start tag if the element is an XHTML element.
241         * 
242         * @param namespaceURI
243         *            the XML namespace
244         * @param localName
245         *            the element name in the namespace
246         * @param qName
247         *            ignored
248         * @param atts
249         *            the attribute list
250         * 
251         * @throws SAXException
252         *             if there are IO problems
253         */
254        public void startElement(String namespaceURI, String localName,
255                String qName, Attributes atts) throws SAXException {
256            try {
257                if (XHTML_NS.equals(namespaceURI)) {
258    
259                    if ("meta".equals(localName)
260                            && ((atts.getIndex("", "http-equiv") != -1) || (atts.getIndex(
261                                    "", "httpequiv") != -1))) {
262                        return;
263                    }
264    
265                    // start and element name
266                    this.writer.write('<');
267                    this.writer.write(localName);
268    
269                    // attributes
270                    int length = atts.getLength();
271                    boolean langPrinted = false;
272                    for (int i = 0; i < length; i++) {
273                        String ns = atts.getURI(i);
274                        String name = null;
275                        if ("".equals(ns)) {
276                            name = atts.getLocalName(i);
277                        } else if ("http://www.w3.org/XML/1998/namespace".equals(ns)
278                                && "lang".equals(atts.getLocalName(i))) {
279                            name = "lang";
280                        }
281                        if (name != null && !(langPrinted && "lang".equals(name))) {
282                            this.writer.write(' ');
283                            this.writer.write(name);
284                            if ("lang".equals(name)) {
285                                langPrinted = true;
286                            }
287                            if (Arrays.binarySearch(booleanAttributes, name) < 0) {
288                                // write value, escape certain characters
289                                this.writer.write("=\"");
290                                String value = atts.getValue(i);
291                                for (int j = 0; j < value.length(); j++) {
292                                    char c = value.charAt(j);
293                                    switch (c) {
294                                        case '<':
295                                            this.writer.write("&lt;");
296                                            break;
297                                        case '>':
298                                            this.writer.write("&gt;");
299                                            break;
300                                        case '&':
301                                            this.writer.write("&amp;");
302                                            break;
303                                        case '"':
304                                            this.writer.write("&quot;");
305                                            break;
306                                        default:
307                                            this.writer.write(c);
308                                    }
309                                }
310    
311                                this.writer.write('"');
312                            }
313                        }
314                    }
315    
316                    // close
317                    this.writer.write('>');
318                    if (emitMeta && "head".equals(localName)) {
319                        this.writer.write("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=");
320                        this.writer.write(encoding);
321                        this.writer.write("\">");
322                    }
323                }
324            } catch (IOException ioe) {
325                throw new SAXException(ioe);
326            }
327        }
328    
329        /**
330         * Used for testing. Pass a file:// URL as the command line argument.
331         */
332        public static void main(String[] args) {
333            try {
334                javax.xml.parsers.SAXParserFactory fac = javax.xml.parsers.SAXParserFactory.newInstance();
335                fac.setNamespaceAware(true);
336                fac.setValidating(false);
337                XMLReader parser = fac.newSAXParser().getXMLReader();
338                parser.setContentHandler(new HtmlSerializer(System.out));
339                parser.parse(args[0]);
340            } catch (Exception e) {
341                throw new RuntimeException(e);
342            }
343        }
344    
345        /** Does nothing. */
346        public void endPrefixMapping(String str) throws SAXException {
347        }
348    
349        /** Does nothing. */
350        public void ignorableWhitespace(char[] values, int param, int param2)
351                throws SAXException {
352        }
353    
354        /** Does nothing. */
355        public void processingInstruction(String str, String str1)
356                throws SAXException {
357        }
358    
359        /** Does nothing. */
360        public void setDocumentLocator(Locator locator) {
361        }
362    
363        /** Does nothing. */
364        public void skippedEntity(String str) throws SAXException {
365        }
366    
367        /** Does nothing. */
368        public void startPrefixMapping(String str, String str1) throws SAXException {
369        }
370    }