001    /*
002     * XMLWriter.java
003     * Copyright (C) 1999,2000,2001 The Free Software Foundation
004     * Portions Copyright 2005 Marko Karppinen & Co. LLC
005     * 
006     * This file is part of GNU JAXP, a library.
007     * This version has been modified from the original GNU JAXP distribution 
008     * on 2005-02-11, 2005-02-12 and 2005-03-02 by Henri Sivonen working as an 
009     * employee of Marko Karppinen & Co. LLC.
010     *
011     * GNU JAXP is free software; you can redistribute it and/or modify
012     * it under the terms of the GNU General Public License as published by
013     * the Free Software Foundation; either version 2 of the License, or
014     * (at your option) any later version.
015     * 
016     * GNU JAXP is distributed in the hope that it will be useful,
017     * but WITHOUT ANY WARRANTY; without even the implied warranty of
018     * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
019     * GNU General Public License for more details.
020     * 
021     * You should have received a copy of the GNU General Public License
022     * along with this program; if not, write to the Free Software
023     * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
024     *
025     * Linking this library statically or dynamically with other modules is
026     * making a combined work based on this library.  Thus, the terms and
027     * conditions of the GNU General Public License cover the whole
028     * combination.
029     *
030     * As a special exception, the copyright holders of this library give you
031     * permission to link this library with independent modules to produce an
032     * executable, regardless of the license terms of these independent
033     * modules, and to copy and distribute the resulting executable under
034     * terms of your choice, provided that you also meet, for each linked
035     * independent module, the terms and conditions of the license of that
036     * module.  An independent module is a module which is not derived from
037     * or based on this library.  If you modify this library, you may extend
038     * this exception to your version of the library, but you are not
039     * obliged to do so.  If you do not wish to do so, delete this
040     * exception statement from your version. 
041     */
042    
043    // Package renamed -- 2005-02-11 hsivonen
044    package fi.karppinen.gnu.xml.util;
045    
046    import java.io.BufferedWriter;
047    import java.io.CharConversionException;
048    import java.io.IOException;
049    import java.io.OutputStream;
050    import java.io.OutputStreamWriter;
051    import java.io.Writer;
052    import java.util.Stack;
053    
054    import org.xml.sax.Attributes;
055    import org.xml.sax.ContentHandler;
056    import org.xml.sax.DTDHandler;
057    import org.xml.sax.ErrorHandler;
058    import org.xml.sax.Locator;
059    import org.xml.sax.SAXException;
060    import org.xml.sax.SAXParseException;
061    import org.xml.sax.ext.DeclHandler;
062    import org.xml.sax.ext.LexicalHandler;
063    import org.xml.sax.helpers.LocatorImpl;
064    
065    import fi.karppinen.xml.XmlDeclarationHandler;
066    
067    // doc edited -- 2005-02-11 hsivonen
068    /**
069     * This class is a SAX handler which writes all its input as a well formed XML
070     * or XHTML document. If driven using SAX2 events, this output may include a
071     * recreated document type declaration, subject to limitations of SAX (no
072     * internal subset exposed) or DOM (the important declarations, with their
073     * documentation, are discarded).
074     * 
075     * <p>
076     * By default, text is generated "as-is", but some optional modes are supported.
077     * Pretty-printing is supported, to make life easier for people reading the
078     * output. XHTML (1.0) output can be made particularly pretty. Canonical XML
079     * can also be generated, assuming the input is properly formed.
080     * 
081     * <hr>
082     * 
083     * <p>
084     * Some of the methods on this class are intended for applications to use
085     * directly, rather than as pure SAX2 event callbacks. Some of those methods
086     * access the JavaBeans properties (used to tweak output formats, for example
087     * canonicalization and pretty printing). Subclasses are expected to add new
088     * behaviors, not to modify current behavior, so many such methods are final.
089     * </p>
090     * 
091     * <p>
092     * The <em>write*()</em> methods may be slightly simpler for some applications
093     * to use than direct callbacks. For example, they support a simple policy for
094     * encoding data items as the content of a single element.
095     * 
096     * <p>
097     * To reuse an XMLWriter you must provide it with a new Writer, since this
098     * handler closes the writer it was given as part of its endDocument() handling.
099     * (XML documents have an end of input, and the way to encode that on a stream
100     * is to close it.)
101     * </p>
102     * 
103     * <hr>
104     * 
105     * <p>
106     * Note that any relative URIs in the source document, as found in entity and
107     * notation declarations, ought to have been fully resolved by the parser
108     * providing events to this handler. This means that the output text should only
109     * have fully resolved URIs, which may not be the desired behavior in cases
110     * where later binding is desired.
111     * </p>
112     * 
113     * <p>
114     * <em>Note that due to SAX2 defaults, you may need to manually
115     * ensure that the input events are XML-conformant with respect to namespace
116     * prefixes and declarations.  {@link gnu.xml.pipeline.NSFilter} is
117     * one solution to this problem, in the context of processing pipelines.</em>
118     * Something as simple as connecting this handler to a parser might not generate
119     * the correct output. Another workaround is to ensure that the
120     * <em>namespace-prefixes</em> feature is always set to true, if you're
121     * hooking this directly up to some XMLReader implementation.
122     * 
123     * @see fi.karppinen.gnu.xml.pipeline.TextConsumer
124     * 
125     * @author David Brownell
126     * @author Henri Sivonen
127     */
128    public class XMLWriter implements ContentHandler, LexicalHandler, DTDHandler,
129            DeclHandler, XmlDeclarationHandler {
130        // added XmlDeclarationHandler -- 2005-03-02 hsivonen
131        
132        // text prints/escapes differently depending on context
133        //  CTX_ENTITY ... entity literal value
134        //  CTX_ATTRIBUTE ... attribute literal value
135        //  CTX_CONTENT ... content of an element
136        //  CTX_UNPARSED ... CDATA, comment, PI, names, etc
137        //  CTX_NAME ... name or nmtoken, no escapes possible
138        private static final int CTX_ENTITY = 1;
139    
140        private static final int CTX_ATTRIBUTE = 2;
141    
142        private static final int CTX_CONTENT = 3;
143    
144        private static final int CTX_UNPARSED = 4;
145    
146        private static final int CTX_NAME = 5;
147    
148        //  Removed obsolete comment -- 2005-02-11 hsivonen
149    
150        private Writer out;
151    
152        private boolean inCDATA;
153    
154        private int elementNestLevel;
155    
156        private final static String eol = "\n";
157    
158        //  Made eol independent of the underlying platform -- 2005-02-11 hsivonen
159    
160        //  Removed dangerMask -- 2005-02-11 hsivonen
161    
162        private StringBuilder stringBuf;
163    
164        private Locator locator;
165    
166        private ErrorHandler errHandler;
167    
168        private boolean expandingEntities = false;
169    
170        private int entityNestLevel;
171    
172        private boolean xhtml;
173    
174        private boolean startedDoctype;
175    
176        //  Removed encoding -- 2005-02-11 hsivonen
177    
178        private boolean canonical;
179    
180        private boolean inDoctype;
181    
182        private boolean inEpilogue;
183        
184        // pretty printing controls
185        private boolean prettyPrinting;
186    
187        private int column;
188    
189        private boolean noWrap;
190    
191        private Stack<String> space = new Stack<String>();
192    
193        // this is not a hard'n'fast rule -- longer lines are OK,
194        // but are to be avoided. Here, prettyprinting is more to
195        // show structure "cleanly" than to be precise about it.
196        // better to have ragged layout than one line 24Kb long.
197        private static final int lineLength = 75;
198    
/**
 * Creates a handler that writes its SAX events to {@code System.out},
 * encoded as UTF-8.
 *
 * <p>
 * The underlying stream is closed when the document ends, so only use
 * this constructor when it is safe for {@code System.out} to be closed
 * at that point.
 *
 * @throws IOException
 *             if the UTF-8 writer over {@code System.out} cannot be
 *             created
 */
public XMLWriter() throws IOException {
    this(System.out);
}
207    
/**
 * Creates a handler that encodes all output as UTF-8 onto the given
 * stream; the stream is closed when endDocument is called.
 *
 * @param out
 *            the byte stream that receives the UTF-8 encoded XML text
 * @throws IOException
 *             if a UTF-8 writer cannot be created. This is annoying but
 *             unavoidable: it is barely possible that some JDK exists
 *             which does not know how to emit UTF-8.
 */
public XMLWriter(OutputStream out) throws IOException {
    this(new OutputStreamWriter(out, "UTF8"));
}
218    
219        //  doc edited -- 2005-02-11 hsivonen
/**
 * Creates a handler that sends all of its output to the given writer and
 * closes that writer when the document ends.
 *
 * <p>
 * The character encoding is whatever the supplied writer uses; this
 * class no longer selects one. Note that {@link #startDocument} writes
 * an XML declaration naming UTF-8 (except in XHTML or canonical mode),
 * so the writer should normally encode its output as UTF-8.
 *
 * @param writer
 *            XML text is written to this writer.
 */
public XMLWriter(Writer writer) {
    //  Call to intermediate constructor removed -- 2005-02-11 hsivonen
    setWriter(writer);
}
235    
236        //  Removed constructor taking an encoding -- 2005-02-11 hsivonen
237        //  Removed setter for encoding -- 2005-02-11 hsivonen
238    
239        /**
240         * Resets the handler to write a new text document.
241         * 
242         * @param writer
243         *            XML text is written to this writer.
244         * 
245         * @exception IllegalStateException
246         *                if the current document hasn't yet ended (with
247         *                {@link #endDocument})
248         */
249        final public void setWriter(Writer writer) {
250            //  Removed encoding-related code -- 2005-02-11 hsivonen
251            if (out != null)
252                throw new IllegalStateException("can't change stream in mid course");
253            out = writer;
254            if (!(out instanceof BufferedWriter))
255                out = new BufferedWriter(out);
256            space.push("default");
257        }
258    
259        // Removed setter for eol -- 2005-02-11 hsivonen
260    
261        /**
262         * Assigns the error handler to be used to present most fatal errors.
263         */
264        public void setErrorHandler(ErrorHandler handler) {
265            errHandler = handler;
266        }
267    
268        /**
269         * Used internally and by subclasses, this encapsulates the logic involved
270         * in reporting fatal errors. It uses locator information for good
271         * diagnostics, if available, and gives the application's ErrorHandler the
272         * opportunity to handle the error before throwing an exception.
273         */
274        protected void fatal(String message, Exception e) throws SAXException {
275            SAXParseException x;
276    
277            if (locator == null)
278                x = new SAXParseException(message, null, null, -1, -1, e);
279            else
280                x = new SAXParseException(message, locator, e);
281            if (errHandler != null)
282                errHandler.fatalError(x);
283            throw x;
284        }
285    
286        // JavaBeans properties
287        // JavaDoc comment modified to reflect encoding modifications -- 2005-02-11
288        // hsivonen
289        /**
290         * Controls whether the output should attempt to follow the "transitional"
291         * XHTML rules so that it meets the "HTML Compatibility Guidelines" appendix
292         * in the XHTML specification. XHTML empty
293         * elements are printed specially.
294         * 
295         * <p>
296         * When this option is enabled, it is the caller's responsibility to ensure
297         * that the input is otherwise valid as XHTML. Things to be careful of in
298         * all cases, as described in the appendix referenced above, include:
299         * <ul>
300         * 
301         * <li>Element and attribute names must be in lower case, both in the
302         * document and in any CSS style sheet.
303         * <li>The root element must be "html".
304         * <li>Elements that must be empty (such as <em>&lt;br&gt;</em> must have
305         * no content.
306         * <li>Use both <em>lang</em> and <em>xml:lang</em> attributes when
307         * specifying language.
308         * <li>Similarly, use both <em>id</em> and <em>name</em> attributes
309         * when defining elements that may be referred to through URI fragment
310         * identifiers ... and make sure that the value is a legal NMTOKEN, since
311         * not all such HTML 4.0 identifiers are valid in XML.
312         * <li>Be careful with character encodings; make sure you provide a
313         * <em>&lt;meta http-equiv="Content-type"
314         *          content="text/xml;charset=UTF-8" /&gt;</em>
315         * element in the HTML "head" element.
316         * </ul>
317         * 
318         * <p>
319         * Additionally, some of the oldest browsers have additional quirks, to
320         * address with guidelines such as:
321         * <ul>
322         * 
323         * <li>Processing instructions may be rendered, so avoid them. (Similarly
324         * for an XML declaration.)
325         * <li>Embedded style sheets and scripts should not contain XML markup
326         * delimiters: &amp;, &lt;, and ]]&gt; are trouble.
327         * <li>Attribute values should not have line breaks or multiple consecutive
328         * white space characters.
329         * <li>Use no more than one of the deprecated (transitional)
330         * <em>&lt;isindex&gt;</em> elements.
331         * <li>Some boolean attributes (such as <em>compact, checked,
332         *          disabled, readonly, selected,</em>
333         * and more) confuse some browsers, since they only understand minimized
334         * versions which are illegal in XML.
335         * </ul>
336         * 
337         * <p>
338         * Also, some characteristics of the resulting output may be a function of
339         * whether the document is later given a MIME content type of
340         * <em>text/html</em> rather than one indicating XML (
341         * <em>application/xml</em> or <em>text/xml</em>). Worse, some browsers
342         * ignore MIME content types and prefer to rely URI name suffixes -- so an
343         * "index.xml" could always be XML, never XHTML, no matter its MIME type.
344         */
345        final public void setXhtml(boolean value) {
346            if (locator != null)
347                throw new IllegalStateException("started parsing");
348            xhtml = value;
349            if (xhtml)
350                canonical = false;
351        }
352    
353        /**
354         * Returns true if the output attempts to echo the input following
355         * "transitional" XHTML rules and matching the "HTML Compatibility
356         * Guidelines" so that an HTML version 3 browser can read the output as
357         * HTML; returns false (the default) othewise.
358         */
359        final public boolean isXhtml() {
360            return xhtml;
361        }
362    
363        /**
364         * Controls whether the output text contains references to entities (the
365         * default), or instead contains the expanded values of those entities.
366         */
367        final public void setExpandingEntities(boolean value) {
368            if (locator != null)
369                throw new IllegalStateException("started parsing");
370            expandingEntities = value;
371            if (!expandingEntities)
372                canonical = false;
373        }
374    
375        /**
376         * Returns true if the output will have no entity references; returns false
377         * (the default) otherwise.
378         */
379        final public boolean isExpandingEntities() {
380            return expandingEntities;
381        }
382    
383        /**
384         * Controls pretty-printing, which by default is not enabled (and currently
385         * is most useful for XHTML output). Pretty printing enables structural
386         * indentation, sorting of attributes by name, line wrapping, and
387         * potentially other mechanisms for making output more or less readable.
388         * 
389         * <p>
390         * At this writing, structural indentation and line wrapping are enabled
391         * when pretty printing is enabled and the <em>xml:space</em> attribute
392         * has the value <em>default</em> (its other legal value is
393         * <em>preserve</em>, as defined in the XML specification). The three
394         * XHTML element types which use another value are recognized by their names
395         * (namespaces are ignored).
396         * 
397         * <p>
398         * Also, for the record, the "pretty" aspect of printing here is more to
399         * provide basic structure on outputs that would otherwise risk being a
400         * single long line of text. For now, expect the structure to be ragged ...
401         * unless you'd like to submit a patch to make this be more strictly
402         * formatted!
403         * 
404         * @exception IllegalStateException
405         *                thrown if this method is invoked after output has begun.
406         */
407        final public void setPrettyPrinting(boolean value) {
408            if (locator != null)
409                throw new IllegalStateException("started parsing");
410            prettyPrinting = value;
411            if (prettyPrinting)
412                canonical = false;
413        }
414    
415        /**
416         * Returns value of flag controlling pretty printing.
417         */
418        final public boolean isPrettyPrinting() {
419            return prettyPrinting;
420        }
421    
422        /**
423         * Sets the output style to be canonicalized. Input events must meet
424         * requirements that are slightly more stringent than the basic
425         * well-formedness ones, and include:
426         * <ul>
427         * 
428         * <li>Namespace prefixes must not have been changed from those in the
429         * original document. (This may only be ensured by setting the SAX2
430         * XMLReader <em>namespace-prefixes</em> feature flag; by default, it is
431         * cleared.)
432         * 
433         * <li>Redundant namespace declaration attributes have been removed. (If an
434         * ancestor element defines a namespace prefix and that declaration hasn't
435         * been overriden, an element must not redeclare it.)
436         * 
437         * <li>If comments are not to be included in the canonical output, they
438         * must first be removed from the input event stream; this
439         * <em>Canonical XML with comments</em> by default.
440         * 
441         * <li>If the input character encoding was not UCS-based, the character
442         * data must have been normalized using Unicode Normalization Form C. (UTF-8
443         * and UTF-16 are UCS-based.)
444         * 
445         * <li>Attribute values must have been normalized, as is done by any
446         * conformant XML processor which processes all external parameter entities.
447         * 
448         * <li>Similarly, attribute value defaulting has been performed.
449         * 
450         * </ul>
451         * 
452         * <p>
453         * Note that fragments of XML documents, as specified by an XPath node set,
454         * may be canonicalized. In such cases, elements may need some fixup (for
455         * <em>xml:*</em> attributes and application-specific context).
456         * 
457         * @exception IllegalArgumentException
458         *                if the output encoding is anything other than UTF-8.
459         */
460        final public void setCanonical(boolean value) {
461            //      Removed encoding check -- 2005-02-11 hsivonen
462            canonical = value;
463            if (canonical) {
464                prettyPrinting = xhtml = false;
465                expandingEntities = true;
466                // Removed eol modification -- 2005-02-11 hsivonen
467            }
468        }
469    
470        /**
471         * Returns value of flag controlling canonical output.
472         */
473        final public boolean isCanonical() {
474            return canonical;
475        }
476    
477        /**
478         * Flushes the output stream. When this handler is used in long lived
479         * pipelines, it can be important to flush buffered state, for example so
480         * that it can reach the disk as part of a state checkpoint.
481         */
482        final public void flush() throws IOException {
483            if (out != null)
484                out.flush();
485        }
486    
487        // convenience routines
488    
489        // FIXME: probably want a subclass that holds a lot of these...
490        // and maybe more!
491    
492        /**
493         * Writes the string as if characters() had been called on the contents of
494         * the string. This is particularly useful when applications act as
495         * producers and write data directly to event consumers.
496         */
497        final public void write(String data) throws SAXException {
498            char buf[] = data.toCharArray();
499            characters(buf, 0, buf.length);
500        }
501    
502        /**
503         * Writes an element that has content consisting of a single string.
504         * 
505         * @see #writeEmptyElement
506         * @see #startElement
507         */
508        public void writeElement(String uri, String localName, String qName,
509                Attributes atts, String content) throws SAXException {
510            if (content == null || content.length() == 0) {
511                writeEmptyElement(uri, localName, qName, atts);
512                return;
513            }
514            startElement(uri, localName, qName, atts);
515            char chars[] = content.toCharArray();
516            characters(chars, 0, chars.length);
517            endElement(uri, localName, qName);
518        }
519    
520        /**
521         * Writes an element that has content consisting of a single integer,
522         * encoded as a decimal string.
523         * 
524         * @see #writeEmptyElement
525         * @see #startElement
526         */
527        public void writeElement(String uri, String localName, String qName,
528                Attributes atts, int content) throws SAXException {
529            writeElement(uri, localName, qName, atts, Integer.toString(content));
530        }
531    
532        // SAX1 ContentHandler
533        /** <b>SAX1 </b>: provides parser status information */
534        final public void setDocumentLocator(Locator l) {
535            locator = l;
536        }
537    
538        // Removed transitional DTD URI -- 2005-02-11 hsivonen
539    
540        /**
541         * <b>SAX1 </b>: indicates the beginning of a document parse. If you're
542         * writing (well formed) fragments of XML, neither this nor endDocument
543         * should be called.
544         */
545        // NOT final
546        public void startDocument() throws SAXException {
547            try {
548                if (out == null)
549                    throw new IllegalStateException(
550                            "null Writer given to XMLWriter");
551    
552                // Not all parsers provide the locator we want; this also
553                // flags whether events are being sent to this object yet.
554                // We could only have this one call if we only printed whole
555                // documents ... but we also print fragments, so most of the
556                // callbacks here replicate this test.
557    
558                if (locator == null)
559                    locator = new LocatorImpl();
560    
561                // Unless we're in the XHTML mode or we're canonicalizing, write
562                // the XML declaration.
563                // Hard-coded UTF-8 -- 2005-02-11 hsivonen
564                if (!canonical && !xhtml) {
565                    rawWrite("<?xml version='1.0'");
566                    rawWrite(" encoding='UTF-8'");
567                    rawWrite("?>");
568                    newline();
569                }
570    
571                // Removed hard-coded Transitionl XHTML doctype -- 2005-02-11
572                // hsivonen
573    
574                entityNestLevel = 0;
575    
576            } catch (IOException e) {
577                fatal("can't write", e);
578            }
579        }
580    
581        /**
582         * <b>SAX1 </b>: indicates the completion of a parse. Note that all complete
583         * SAX event streams make this call, even if an error is reported during a
584         * parse.
585         */
586        // NOT final
587        public void endDocument() throws SAXException {
588            try {
589                if (!canonical) {
590                    newline();
591                    newline();
592                }
593                out.close();
594                out = null;
595                locator = null;
596            } catch (IOException e) {
597                fatal("can't write", e);
598            }
599        }
600    
601        // XHTML elements declared as EMPTY print differently
602        final private static boolean isEmptyElementTag(String tag) {
603            switch (tag.charAt(0)) {
604                case 'a':
605                    return "area".equals(tag);
606                case 'b':
607                    return "base".equals(tag) || "basefont".equals(tag)
608                            || "br".equals(tag);
609                case 'c':
610                    return "col".equals(tag);
611                case 'f':
612                    return "frame".equals(tag);
613                case 'h':
614                    return "hr".equals(tag);
615                case 'i':
616                    return "img".equals(tag) || "input".equals(tag)
617                            || "isindex".equals(tag);
618                case 'l':
619                    return "link".equals(tag);
620                case 'm':
621                    return "meta".equals(tag);
622                case 'p':
623                    return "param".equals(tag);
624            }
625            return false;
626        }
627    
628        private static boolean indentBefore(String tag) {
629            // basically indent before block content
630            // and within structure like tables, lists
631            switch (tag.charAt(0)) {
632                case 'a':
633                    return "applet".equals(tag);
634                case 'b':
635                    return "body".equals(tag) || "blockquote".equals(tag);
636                case 'c':
637                    return "center".equals(tag);
638                case 'f':
639                    return "frame".equals(tag) || "frameset".equals(tag);
640                case 'h':
641                    return "head".equals(tag);
642                case 'm':
643                    return "meta".equals(tag);
644                case 'o':
645                    return "object".equals(tag);
646                case 'p':
647                    return "param".equals(tag) || "pre".equals(tag);
648                case 's':
649                    return "style".equals(tag);
650                case 't':
651                    return "title".equals(tag) || "td".equals(tag)
652                            || "th".equals(tag);
653            }
654            // ... but not inline elements like "em", "b", "font"
655            return false;
656        }
657    
658        private static boolean spaceBefore(String tag) {
659            // blank line AND INDENT before certain structural content
660            switch (tag.charAt(0)) {
661                case 'h':
662                    return "h1".equals(tag) || "h2".equals(tag) || "h3".equals(tag)
663                            || "h4".equals(tag) || "h5".equals(tag)
664                            || "h6".equals(tag) || "hr".equals(tag);
665                case 'l':
666                    return "li".equals(tag);
667                case 'o':
668                    return "ol".equals(tag);
669                case 'p':
670                    return "p".equals(tag);
671                case 't':
672                    return "table".equals(tag) || "tr".equals(tag);
673                case 'u':
674                    return "ul".equals(tag);
675            }
676            return false;
677        }
678    
679        // XHTML DTDs say these three have xml:space="preserve"
680        private static boolean spacePreserve(String tag) {
681            return "pre".equals(tag) || "style".equals(tag) || "script".equals(tag);
682        }
683    
684        /**
685         * <b>SAX2 </b>: ignored.
686         */
687        final public void startPrefixMapping(String prefix, String uri) {
688        }
689    
690        /**
691         * <b>SAX2 </b>: ignored.
692         */
693        final public void endPrefixMapping(String prefix) {
694        }
695    
696        private void writeStartTag(String name, Attributes atts, boolean isEmpty)
697                throws SAXException, IOException {
698            rawWrite('<');
699            rawWrite(name);
700    
701            // write out attributes ... sorting is particularly useful
702            // with output that's been heavily defaulted.
703            if (atts != null && atts.getLength() != 0) {
704    
705                // Set up to write, with optional sorting
706                int indices[] = new int[atts.getLength()];
707    
708                for (int i = 0; i < indices.length; i++)
709                    indices[i] = i;
710    
711                // optionally sort
712    
713                // FIXME: canon xml demands xmlns nodes go first,
714                // and sorting by URI first (empty first) then localname
715                // it should maybe use a different sort
716    
717                if (canonical || prettyPrinting) {
718    
719                    // insertion sort by attribute name
720                    for (int i = 1; i < indices.length; i++) {
721                        int n = indices[i], j;
722                        String s = atts.getQName(n);
723    
724                        for (j = i - 1; j >= 0; j--) {
725                            if (s.compareTo(atts.getQName(indices[j])) >= 0)
726                                break;
727                            indices[j + 1] = indices[j];
728                        }
729                        indices[j + 1] = n;
730                    }
731                }
732    
733                // write, sorted or no
734                for (int i = 0; i < indices.length; i++) {
735                    String s = atts.getQName(indices[i]);
736    
737                    if (s == null || "".equals(s))
738                        throw new IllegalArgumentException("no XML name");
739                    rawWrite(" ");
740                    rawWrite(s);
741                    rawWrite("=");
742                    writeQuotedValue(atts.getValue(indices[i]), CTX_ATTRIBUTE);
743                }
744            }
745            if (isEmpty)
746                rawWrite(" /");
747            rawWrite('>');
748        }
749    
750        /**
751         * <b>SAX2 </b>: indicates the start of an element. When XHTML is in use,
752         * avoid attribute values with line breaks or multiple whitespace
753         * characters, since not all user agents handle them correctly.
754         */
755        final public void startElement(String uri, String localName, String qName,
756                Attributes atts) throws SAXException {
757            startedDoctype = false;
758    
759            if (locator == null)
760                locator = new LocatorImpl();
761    
762            if (qName == null || "".equals(qName))
763                throw new IllegalArgumentException("no XML name");
764    
765            try {
766                if (entityNestLevel != 0)
767                    return;
768                if (prettyPrinting) {
769                    String whitespace = null;
770    
771                    if (xhtml && spacePreserve(qName))
772                        whitespace = "preserve";
773                    else if (atts != null)
774                        whitespace = atts.getValue("xml:space");
775                    if (whitespace == null)
776                        whitespace = space.peek();
777                    space.push(whitespace);
778    
779                    if ("default".equals(whitespace)) {
780                        if (xhtml) {
781                            if (spaceBefore(qName)) {
782                                newline();
783                                doIndent();
784                            } else if (indentBefore(qName))
785                                doIndent();
786                            // else it's inlined, modulo line length
787                            // FIXME: incrementing element nest level
788                            // for inlined elements causes ugliness
789                        } else
790                            doIndent();
791                    }
792                }
793                elementNestLevel++;
794                writeStartTag(qName, atts, xhtml && isEmptyElementTag(qName));
795    
796                if (xhtml) {
797                    // FIXME: if this is an XHTML "pre" element, turn
798                    // off automatic wrapping.
799                }
800    
801            } catch (IOException e) {
802                fatal("can't write", e);
803            }
804        }
805    
806        /**
807         * Writes an empty element.
808         * 
809         * @see #startElement
810         */
811        public void writeEmptyElement(String uri, String localName, String qName,
812                Attributes atts) throws SAXException {
813            if (canonical) {
814                startElement(uri, localName, qName, atts);
815                endElement(uri, localName, qName);
816            } else {
817                try {
818                    writeStartTag(qName, atts, true);
819                } catch (IOException e) {
820                    fatal("can't write", e);
821                }
822            }
823        }
824    
825        /** <b>SAX2 </b>: indicates the end of an element */
826        final public void endElement(String uri, String localName, String qName)
827                throws SAXException {
828            if (qName == null || "".equals(qName))
829                throw new IllegalArgumentException("no XML name");
830    
831            try {
832                elementNestLevel--;
833                if (entityNestLevel != 0)
834                    return;
835                if (xhtml && isEmptyElementTag(qName))
836                    return;
837                rawWrite("</");
838                rawWrite(qName);
839                rawWrite('>');
840    
841                if (prettyPrinting) {
842                    if (!space.empty())
843                        space.pop();
844                    else
845                        fatal("stack discipline", null);
846                }
847                if (elementNestLevel == 0)
848                    inEpilogue = true;
849    
850            } catch (IOException e) {
851                fatal("can't write", e);
852            }
853        }
854    
855        /** <b>SAX1 </b>: reports content characters */
856        final public void characters(char ch[], int start, int length)
857                throws SAXException {
858            if (locator == null)
859                locator = new LocatorImpl();
860    
861    
862            try {
863                if (entityNestLevel != 0)
864                    return;
865                if (inCDATA) {
866                    escapeChars(ch, start, length, CTX_UNPARSED);
867                } else {
868                    escapeChars(ch, start, length, CTX_CONTENT);
869                }
870            } catch (IOException e) {
871                fatal("can't write", e);
872            }
873            
874        }
875    
876        /** <b>SAX1 </b>: reports ignorable whitespace */
877        final public void ignorableWhitespace(char ch[], int start, int length)
878                throws SAXException {
879            if (locator == null)
880                locator = new LocatorImpl();
881    
882            try {
883                if (entityNestLevel != 0)
884                    return;
885                // don't forget to map NL to CRLF, CR, etc
886                escapeChars(ch, start, length, CTX_CONTENT);
887            } catch (IOException e) {
888                fatal("can't write", e);
889            }
890        }
891    
892        /**
893         * <b>SAX1 </b>: reports a PI. This doesn't check for illegal target names,
894         * such as "xml" or "XML", or namespace-incompatible ones like "big:dog";
895         * the caller is responsible for ensuring those names are legal.
896         */
897        final public void processingInstruction(String target, String data)
898                throws SAXException {
899            if (locator == null)
900                locator = new LocatorImpl();
901    
902            // don't print internal subset for XHTML
903            if (xhtml && startedDoctype)
904                return;
905    
906            // ancient HTML browsers might render these ... their loss.
907            // to prevent: "if (xhtml) return;".
908    
909            try {
910                if (entityNestLevel != 0)
911                    return;
912                if (canonical && inEpilogue)
913                    newline();
914                rawWrite("<?");
915                rawWrite(target);
916                rawWrite(' ');
917                escapeChars(data.toCharArray(), -1, -1, CTX_UNPARSED);
918                rawWrite("?>");
919                if (elementNestLevel == 0 && !(canonical && inEpilogue))
920                    newline();
921            } catch (IOException e) {
922                fatal("can't write", e);
923            }
924        }
925    
926        /** <b>SAX1 </b>: indicates a non-expanded entity reference */
927        public void skippedEntity(String name) throws SAXException {
928            try {
929                rawWrite("&");
930                rawWrite(name);
931                rawWrite(";");
932            } catch (IOException e) {
933                fatal("can't write", e);
934            }
935        }
936    
937        // SAX2 LexicalHandler
938    
939        /** <b>SAX2 </b>: called before parsing CDATA characters */
940        final public void startCDATA() throws SAXException {
941            if (locator == null)
942                locator = new LocatorImpl();
943    
944            if (canonical || xhtml) // added xhtml check -- 2005-02-12 hsivonen
945                return;
946    
947            try {
948                inCDATA = true;
949                if (entityNestLevel == 0)
950                    rawWrite("<![CDATA[");
951            } catch (IOException e) {
952                fatal("can't write", e);
953            }
954        }
955    
956        /** <b>SAX2 </b>: called after parsing CDATA characters */
957        final public void endCDATA() throws SAXException {
958            if (canonical || xhtml) // added xhtml check -- 2005-02-12 hsivonen
959                return;
960    
961            try {
962                inCDATA = false;
963                if (entityNestLevel == 0)
964                    rawWrite("]]>");
965            } catch (IOException e) {
966                fatal("can't write", e);
967            }
968        }
969    
970        /**
971         * <b>SAX2 </b>: called when the doctype is partially parsed Note that this,
972         * like other doctype related calls, is ignored when XHTML is in use.
973         */
974        final public void startDTD(String name, String publicId, String systemId)
975                throws SAXException {
976            if (locator == null)
977                locator = new LocatorImpl();
978            // Removed xhtml check -- 2005-02-11 hsivonen
979            try {
980                inDoctype = startedDoctype = true;
981                if (canonical)
982                    return;
983                rawWrite("<!DOCTYPE ");
984                rawWrite(name);
985                rawWrite(' ');
986    
987                if (!expandingEntities) {
988                    // use double quotes  -- 2005-02-12 hsivonen
989                    if (publicId != null)
990                        rawWrite("PUBLIC \"" + publicId + "\" \"" + systemId + "\"");
991                    else if (systemId != null)
992                        rawWrite("SYSTEM \"" + systemId + "\"");
993                }
994                //    Added xhtml check -- 2005-02-11 hsivonen
995                if (!xhtml) {
996                    rawWrite(" [");
997                    newline();
998                }
999            } catch (IOException e) {
1000                fatal("can't write", e);
1001            }
1002        }
1003    
1004        /** <b>SAX2 </b>: called after the doctype is parsed */
1005        final public void endDTD() throws SAXException {
1006            inDoctype = false;
1007            //  Removed xhtml check -- 2005-02-11 hsivonen
1008            if (canonical)
1009                return;
1010            try {
1011                //      Added xhtml check -- 2005-02-11 hsivonen
1012                if (!xhtml)
1013                    rawWrite("]");
1014                rawWrite(">");
1015                newline();
1016            } catch (IOException e) {
1017                fatal("can't write", e);
1018            }
1019        }
1020    
1021        /**
1022         * <b>SAX2 </b>: called before parsing a general entity in content
1023         */
1024        final public void startEntity(String name) throws SAXException {
1025            try {
1026                boolean writeEOL = true;
1027    
1028                // Predefined XHTML entities (for characters) will get
1029                // mapped back later.
1030                if (xhtml || expandingEntities)
1031                    return;
1032    
1033                entityNestLevel++;
1034                if (name.equals("[dtd]"))
1035                    return;
1036                if (entityNestLevel != 1)
1037                    return;
1038                if (!name.startsWith("%")) {
1039                    writeEOL = false;
1040                    rawWrite('&');
1041                }
1042                rawWrite(name);
1043                rawWrite(';');
1044                if (writeEOL)
1045                    newline();
1046            } catch (IOException e) {
1047                fatal("can't write", e);
1048            }
1049        }
1050    
1051        /**
1052         * <b>SAX2 </b>: called after parsing a general entity in content
1053         */
1054        final public void endEntity(String name) throws SAXException {
1055            if (xhtml || expandingEntities)
1056                return;
1057            entityNestLevel--;
1058        }
1059    
1060        /**
1061         * <b>SAX2 </b>: called when comments are parsed. When XHTML is used, the
1062         * old HTML tradition of using comments to for inline CSS, or for JavaScript
1063         * code is discouraged. This is because XML processors are encouraged to
1064         * discard, on the grounds that comments are for users (and perhaps text
1065         * editors) not programs. Instead, use external scripts
1066         */
1067        final public void comment(char ch[], int start, int length)
1068                throws SAXException {
1069            if (locator == null)
1070                locator = new LocatorImpl();
1071    
1072            // don't print internal subset for XHTML
1073            if (xhtml && inDoctype) // changed check to match canon -- 2005-02-11
1074                // hsivonen
1075                return;
1076            // don't print comment in doctype for canon xml
1077            if (canonical && inDoctype)
1078                return;
1079    
1080            try {
1081                boolean indent;
1082    
1083                if (prettyPrinting && space.empty())
1084                    fatal("stack discipline", null);
1085                indent = prettyPrinting && "default".equals(space.peek());
1086                if (entityNestLevel != 0)
1087                    return;
1088                if (indent)
1089                    doIndent();
1090                if (canonical && inEpilogue)
1091                    newline();
1092                rawWrite("<!--");
1093                escapeChars(ch, start, length, CTX_UNPARSED);
1094                rawWrite("-->");
1095                if (indent)
1096                    doIndent();
1097                if (elementNestLevel == 0 && !(canonical && inEpilogue))
1098                    newline();
1099            } catch (IOException e) {
1100                fatal("can't write", e);
1101            }
1102        }
1103    
1104        // SAX1 DTDHandler
1105    
1106        /** <b>SAX1 </b>: called on notation declarations */
1107        final public void notationDecl(String name, String publicId, String systemId)
1108                throws SAXException {
1109            if (xhtml)
1110                return;
1111            try {
1112                // At this time, only SAX2 callbacks start these.
1113                if (!startedDoctype)
1114                    return;
1115    
1116                if (entityNestLevel != 0)
1117                    return;
1118                rawWrite("<!NOTATION " + name + " ");
1119                if (publicId != null)
1120                    rawWrite("PUBLIC \"" + publicId + '"');
1121                else
1122                    rawWrite("SYSTEM ");
1123                if (systemId != null)
1124                    rawWrite('"' + systemId + '"');
1125                rawWrite(">");
1126                newline();
1127            } catch (IOException e) {
1128                fatal("can't write", e);
1129            }
1130        }
1131    
1132        /** <b>SAX1 </b>: called on unparsed entity declarations */
1133        final public void unparsedEntityDecl(String name, String publicId,
1134                String systemId, String notationName) throws SAXException {
1135            if (xhtml)
1136                return;
1137            try {
1138                // At this time, only SAX2 callbacks start these.
1139                if (!startedDoctype) {
1140                    // FIXME: write to temporary buffer, and make the start
1141                    // of the root element write these declarations.
1142                    return;
1143                }
1144    
1145                if (entityNestLevel != 0)
1146                    return;
1147                rawWrite("<!ENTITY " + name + " ");
1148                if (publicId != null)
1149                    rawWrite("PUBLIC \"" + publicId + '"');
1150                else
1151                    rawWrite("SYSTEM ");
1152                rawWrite('"' + systemId + '"');
1153                rawWrite(" NDATA " + notationName + ">");
1154                newline();
1155            } catch (IOException e) {
1156                fatal("can't write", e);
1157            }
1158        }
1159    
1160        // SAX2 DeclHandler
1161    
1162        /** <b>SAX2 </b>: called on attribute declarations */
1163        final public void attributeDecl(String eName, String aName, String type,
1164                String mode, String value) throws SAXException {
1165            if (xhtml)
1166                return;
1167            try {
1168                // At this time, only SAX2 callbacks start these.
1169                if (!startedDoctype)
1170                    return;
1171                if (entityNestLevel != 0)
1172                    return;
1173                rawWrite("<!ATTLIST " + eName + ' ' + aName + ' ');
1174                rawWrite(type);
1175                rawWrite(' ');
1176                if (mode != null)
1177                    rawWrite(mode + ' ');
1178                if (value != null)
1179                    writeQuotedValue(value, CTX_ATTRIBUTE);
1180                rawWrite('>');
1181                newline();
1182            } catch (IOException e) {
1183                fatal("can't write", e);
1184            }
1185        }
1186    
1187        /** <b>SAX2 </b>: called on element declarations */
1188        final public void elementDecl(String name, String model)
1189                throws SAXException {
1190            if (xhtml)
1191                return;
1192            try {
1193                // At this time, only SAX2 callbacks start these.
1194                if (!startedDoctype)
1195                    return;
1196                if (entityNestLevel != 0)
1197                    return;
1198                rawWrite("<!ELEMENT " + name + ' ' + model + '>');
1199                newline();
1200            } catch (IOException e) {
1201                fatal("can't write", e);
1202            }
1203        }
1204    
1205        /** <b>SAX2 </b>: called on external entity declarations */
1206        final public void externalEntityDecl(String name, String publicId,
1207                String systemId) throws SAXException {
1208            if (xhtml)
1209                return;
1210            try {
1211                // At this time, only SAX2 callbacks start these.
1212                if (!startedDoctype)
1213                    return;
1214                if (entityNestLevel != 0)
1215                    return;
1216                rawWrite("<!ENTITY ");
1217                if (name.startsWith("%")) {
1218                    rawWrite("% ");
1219                    rawWrite(name.substring(1));
1220                } else
1221                    rawWrite(name);
1222                if (publicId != null)
1223                    rawWrite(" PUBLIC \"" + publicId + '"');
1224                else
1225                    rawWrite(" SYSTEM ");
1226                rawWrite('"' + systemId + "\">");
1227                newline();
1228            } catch (IOException e) {
1229                fatal("can't write", e);
1230            }
1231        }
1232    
1233        /** <b>SAX2 </b>: called on internal entity declarations */
1234        final public void internalEntityDecl(String name, String value)
1235                throws SAXException {
1236            if (xhtml)
1237                return;
1238            try {
1239                // At this time, only SAX2 callbacks start these.
1240                if (!startedDoctype)
1241                    return;
1242                if (entityNestLevel != 0)
1243                    return;
1244                rawWrite("<!ENTITY ");
1245                if (name.startsWith("%")) {
1246                    rawWrite("% ");
1247                    rawWrite(name.substring(1));
1248                } else
1249                    rawWrite(name);
1250                rawWrite(' ');
1251                writeQuotedValue(value, CTX_ENTITY);
1252                rawWrite('>');
1253                newline();
1254            } catch (IOException e) {
1255                fatal("can't write", e);
1256            }
1257        }
1258    
1259        // added xmlDecl() -- 2005-03-02 hsivonen
1260        /**
1261         * @see fi.karppinen.xml.XmlDeclarationHandler#xmlDecl(java.lang.String, java.lang.String, java.lang.String)
1262         */
1263        public void xmlDecl(String version, String encoding, String standalone)
1264                throws SAXException {
1265            try {
1266                if (xhtml) {
1267                    rawWrite("<?xml version='1.0'");
1268                    rawWrite(" encoding='UTF-8'");
1269                    rawWrite("?>");
1270                    newline();
1271                }
1272            } catch (IOException e) {
1273                fatal("can't write", e);
1274            }
1275        }
1276    
1277        private void writeQuotedValue(String value, int code) throws SAXException,
1278                IOException {
1279            char buf[] = value.toCharArray();
1280            int off = 0, len = buf.length;
1281    
1282            // we can't add line breaks to attribute/entity/... values
1283            noWrap = true;
1284            rawWrite('"');
1285            escapeChars(buf, off, len, code);
1286            rawWrite('"');
1287            noWrap = false;
1288        }
1289    
1290        // removed entity tables -- 2005-02-11 hsivonen
1291    
1292        // General routine to write text and substitute predefined
1293        // entities (XML, and a special case for XHTML) as needed.
1294        private void escapeChars(char buf[], int off, int len, int code)
1295                throws SAXException, IOException {
1296            int first = 0;
1297    
1298            if (off < 0) {
1299                off = 0;
1300                len = buf.length;
1301            }
1302            for (int i = 0; i < len; i++) {
1303                String esc = null; // init variable -- 2005-02-11 hsivonen
1304                char c = buf[off + i];
1305    
1306                switch (c) {
1307                    // Note that CTX_ATTRIBUTE isn't explicitly tested here;
1308                    // all syntax delimiters are escaped in CTX_ATTRIBUTE,
1309                    // otherwise it's similar to CTX_CONTENT
1310    
1311                    // ampersand flags entity references; entity replacement
1312                    // text has unexpanded references, other text doesn't.
1313                    case '&':
1314                        if (code == CTX_ENTITY || code == CTX_UNPARSED)
1315                            continue;
1316                        esc = "amp";
1317                        break;
1318    
1319                    // attributes and text may NOT have literal '<', but
1320                    // entities may have markup constructs
1321                    case '<':
1322                        if (code == CTX_ENTITY || code == CTX_UNPARSED)
1323                            continue;
1324                        esc = "lt";
1325                        break;
1326    
1327                    // as above re markup constructs; but otherwise
1328                    // except when canonicalizing, this is for consistency
1329                    case '>':
1330                        if (code == CTX_ENTITY || code == CTX_UNPARSED)
1331                            continue;
1332                        esc = "gt";
1333                        break;
1334                    case '\'':
1335                        if (code == CTX_CONTENT || code == CTX_UNPARSED)
1336                            continue;
1337                        if (canonical || xhtml) // added xhtml check -- 2005-02-11 hsivonen
1338                            continue;
1339                        esc = "apos";
1340                        break;
1341    
1342                    // needed when printing quoted attribute/entity values
1343                    case '"':
1344                        if (code == CTX_CONTENT || code == CTX_UNPARSED)
1345                            continue;
1346                        esc = "quot";
1347                        break;
1348    
1349                    // make line ends work per host OS convention
1350                    case '\n':
1351                        esc = eol;
1352                        break;
1353    
1354                    // removed obsolete comments -- 2005-02-11 hsivonen
1355    
1356                    default:
1357                        //
1358                        // There are characters we can never write safely; getting
1359                        // them is an error.
1360                        //
1361                        //   (a) They're never legal in XML ... detected by range
1362                        //  checks, and (eventually) by remerging surrogate
1363                        //  pairs on output. (Easy error for apps to prevent.)
1364                        //
1365    
1366                        // removed obsolete comments -- 2005-02-11 hsivonen
1367    
1368                        // FIXME: CR in CDATA is an error; in text, turn to a char
1369                        // ref
1370    
1371                        // FIXME: CR/LF/TAB in attributes should become char refs
1372    
1373                        if ((c > 0xfffd)
1374                                || ((c < 0x0020) && !((c == 0x0009)
1375                                        || (c == 0x000A) || (c == 0x000D)))) {
1376    
1377                            //                     removed danger mask check -- 2005-02-11 hsivonen
1378                            throw new CharConversionException(
1379                                    "Illegal or non-writable character: U+"
1380                                            + Integer.toHexString(c));
1381                        }
1382                        continue;
1383                        //                     removed entity handling -- 2005-02-12 hsivonen
1384                }
1385    
1386                if (i != first)
1387                    rawWrite(buf, off + first, i - first);
1388                first = i + 1;
1389                if (esc == eol)
1390                    newline();
1391                else {
1392                    rawWrite('&');
1393                    rawWrite(esc);
1394                    rawWrite(';');
1395                }
1396            }
1397            if (first < len)
1398                rawWrite(buf, off + first, len - first);
1399        }
1400    
1401        private void newline() throws SAXException, IOException {
1402            out.write(eol);
1403            column = 0;
1404        }
1405    
1406        private void doIndent() throws SAXException, IOException {
1407            int space = elementNestLevel * 2;
1408    
1409            newline();
1410            column = space;
1411            // track tabs only at line starts
1412            while (space > 8) {
1413                out.write("\t");
1414                space -= 8;
1415            }
1416            while (space > 0) {
1417                out.write("  ");
1418                space -= 2;
1419            }
1420        }
1421    
1422        private void rawWrite(char c) throws IOException {
1423            out.write(c);
1424            column++;
1425        }
1426    
1427        private void rawWrite(String s) throws SAXException, IOException {
1428            if (prettyPrinting && "default".equals(space.peek())) {
1429                char data[] = s.toCharArray();
1430                rawWrite(data, 0, data.length);
1431            } else {
1432                out.write(s);
1433                column += s.length();
1434            }
1435        }
1436    
1437        // NOTE: if xhtml, the REC gives some rules about whitespace
1438        // which we could follow ... notably, many places where conformant
1439        // agents "must" consolidate/normalize whitespace. Line ends can
1440        // be removed there, etc. This may not be the right place to do
1441        // such mappings though.
1442    
1443        // Line buffering may help clarify algorithms and improve results.
1444    
1445        // It's likely xml:space needs more attention.
1446    
1447        private void rawWrite(char buf[], int offset, int length)
1448                throws SAXException, IOException {
1449            boolean wrap;
1450    
1451            if (prettyPrinting && space.empty())
1452                fatal("stack discipline", null);
1453    
1454            wrap = prettyPrinting && "default".equals(space.peek());
1455            if (!wrap) {
1456                out.write(buf, offset, length);
1457                column += length;
1458                return;
1459            }
1460    
1461            // we're pretty printing and want to fill lines out only
1462            // to the desired line length.
1463            while (length > 0) {
1464                int target = lineLength - column;
1465                boolean wrote = false;
1466    
1467                // Do we even have a problem?
1468                if (target > length || noWrap) {
1469                    out.write(buf, offset, length);
1470                    column += length;
1471                    return;
1472                }
1473    
1474                // break the line at a space character, trying to fill
1475                // as much of the line as possible.
1476                char c;
1477    
1478                for (int i = target - 1; i >= 0; i--) {
1479                    if ((c = buf[offset + i]) == ' ' || c == '\t') {
1480                        i++;
1481                        out.write(buf, offset, i);
1482                        doIndent();
1483                        offset += i;
1484                        length -= i;
1485                        wrote = true;
1486                        break;
1487                    }
1488                }
1489                if (wrote)
1490                    continue;
1491    
1492                // no space character permitting break before target
1493                // line length is filled. So, take the next one.
1494                if (target < 0)
1495                    target = 0;
1496                for (int i = target; i < length; i++)
1497                    if ((c = buf[offset + i]) == ' ' || c == '\t') {
1498                        i++;
1499                        out.write(buf, offset, i);
1500                        doIndent();
1501                        offset += i;
1502                        length -= i;
1503                        wrote = true;
1504                        break;
1505                    }
1506                if (wrote)
1507                    continue;
1508    
1509                // no such luck.
1510                out.write(buf, offset, length);
1511                column += length;
1512                break;
1513            }
1514        }
1515    }