001 /*
002 * XMLWriter.java
003 * Copyright (C) 1999,2000,2001 The Free Software Foundation
004 * Portions Copyright 2005 Marko Karppinen & Co. LLC
005 *
006 * This file is part of GNU JAXP, a library.
007 * This version has been modified from the original GNU JAXP distribution
008 * on 2005-02-11, 2005-02-12 and 2005-03-02 by Henri Sivonen working as an
009 * employee of Marko Karppinen & Co. LLC.
010 *
011 * GNU JAXP is free software; you can redistribute it and/or modify
012 * it under the terms of the GNU General Public License as published by
013 * the Free Software Foundation; either version 2 of the License, or
014 * (at your option) any later version.
015 *
016 * GNU JAXP is distributed in the hope that it will be useful,
017 * but WITHOUT ANY WARRANTY; without even the implied warranty of
018 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
019 * GNU General Public License for more details.
020 *
021 * You should have received a copy of the GNU General Public License
022 * along with this program; if not, write to the Free Software
023 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
024 *
025 * Linking this library statically or dynamically with other modules is
026 * making a combined work based on this library. Thus, the terms and
027 * conditions of the GNU General Public License cover the whole
028 * combination.
029 *
030 * As a special exception, the copyright holders of this library give you
031 * permission to link this library with independent modules to produce an
032 * executable, regardless of the license terms of these independent
033 * modules, and to copy and distribute the resulting executable under
034 * terms of your choice, provided that you also meet, for each linked
035 * independent module, the terms and conditions of the license of that
036 * module. An independent module is a module which is not derived from
037 * or based on this library. If you modify this library, you may extend
038 * this exception to your version of the library, but you are not
039 * obliged to do so. If you do not wish to do so, delete this
040 * exception statement from your version.
041 */
042
043 // Package renamed -- 2005-02-11 hsivonen
044 package fi.karppinen.gnu.xml.util;
045
046 import java.io.BufferedWriter;
047 import java.io.CharConversionException;
048 import java.io.IOException;
049 import java.io.OutputStream;
050 import java.io.OutputStreamWriter;
051 import java.io.Writer;
052 import java.util.Stack;
053
054 import org.xml.sax.Attributes;
055 import org.xml.sax.ContentHandler;
056 import org.xml.sax.DTDHandler;
057 import org.xml.sax.ErrorHandler;
058 import org.xml.sax.Locator;
059 import org.xml.sax.SAXException;
060 import org.xml.sax.SAXParseException;
061 import org.xml.sax.ext.DeclHandler;
062 import org.xml.sax.ext.LexicalHandler;
063 import org.xml.sax.helpers.LocatorImpl;
064
065 import fi.karppinen.xml.XmlDeclarationHandler;
066
067 // doc edited -- 2005-02-11 hsivonen
068 /**
069 * This class is a SAX handler which writes all its input as a well formed XML
070 * or XHTML document. If driven using SAX2 events, this output may include a
071 * recreated document type declaration, subject to limitations of SAX (no
072 * internal subset exposed) or DOM (the important declarations, with their
073 * documentation, are discarded).
074 *
075 * <p>
076 * By default, text is generated "as-is", but some optional modes are supported.
077 * Pretty-printing is supported, to make life easier for people reading the
078 * output. XHTML (1.0) output can be made particularly pretty. Canonical XML
079 * can also be generated, assuming the input is properly formed.
080 *
081 * <hr>
082 *
083 * <p>
084 * Some of the methods on this class are intended for applications to use
085 * directly, rather than as pure SAX2 event callbacks. Some of those methods
086 * access the JavaBeans properties (used to tweak output formats, for example
087 * canonicalization and pretty printing). Subclasses are expected to add new
088 * behaviors, not to modify current behavior, so many such methods are final.
089 * </p>
090 *
091 * <p>
092 * The <em>write*()</em> methods may be slightly simpler for some applications
093 * to use than direct callbacks. For example, they support a simple policy for
094 * encoding data items as the content of a single element.
095 *
096 * <p>
097 * To reuse an XMLWriter you must provide it with a new Writer, since this
098 * handler closes the writer it was given as part of its endDocument() handling.
099 * (XML documents have an end of input, and the way to encode that on a stream
100 * is to close it.)
101 * </p>
102 *
103 * <hr>
104 *
105 * <p>
106 * Note that any relative URIs in the source document, as found in entity and
107 * notation declarations, ought to have been fully resolved by the parser
108 * providing events to this handler. This means that the output text should only
109 * have fully resolved URIs, which may not be the desired behavior in cases
110 * where later binding is desired.
111 * </p>
112 *
113 * <p>
114 * <em>Note that due to SAX2 defaults, you may need to manually
115 * ensure that the input events are XML-conformant with respect to namespace
116 * prefixes and declarations. {@link gnu.xml.pipeline.NSFilter} is
117 * one solution to this problem, in the context of processing pipelines.</em>
118 * Something as simple as connecting this handler to a parser might not generate
119 * the correct output. Another workaround is to ensure that the
120 * <em>namespace-prefixes</em> feature is always set to true, if you're
121 * hooking this directly up to some XMLReader implementation.
122 *
123 * @see fi.karppinen.gnu.xml.pipeline.TextConsumer
124 *
125 * @author David Brownell
126 * @author Henri Sivonen
127 */
128 public class XMLWriter implements ContentHandler, LexicalHandler, DTDHandler,
129 DeclHandler, XmlDeclarationHandler {
130 // added XmlDeclarationHandler -- 2005-03-02 hsivonen
131
132 // text prints/escapes differently depending on context
133 // CTX_ENTITY ... entity literal value
134 // CTX_ATTRIBUTE ... attribute literal value
135 // CTX_CONTENT ... content of an element
136 // CTX_UNPARSED ... CDATA, comment, PI, names, etc
137 // CTX_NAME ... name or nmtoken, no escapes possible
138 private static final int CTX_ENTITY = 1;
139
140 private static final int CTX_ATTRIBUTE = 2;
141
142 private static final int CTX_CONTENT = 3;
143
144 private static final int CTX_UNPARSED = 4;
145
146 private static final int CTX_NAME = 5;
147
148 // Removed obsolete comment -- 2005-02-11 hsivonen
149
150 private Writer out;
151
152 private boolean inCDATA;
153
154 private int elementNestLevel;
155
156 private final static String eol = "\n";
157
158 // Made eol independent of the underlying platform -- 2005-02-11 hsivonen
159
160 // Removed dangerMask -- 2005-02-11 hsivonen
161
162 private StringBuilder stringBuf;
163
164 private Locator locator;
165
166 private ErrorHandler errHandler;
167
168 private boolean expandingEntities = false;
169
170 private int entityNestLevel;
171
172 private boolean xhtml;
173
174 private boolean startedDoctype;
175
176 // Removed encoding -- 2005-02-11 hsivonen
177
178 private boolean canonical;
179
180 private boolean inDoctype;
181
182 private boolean inEpilogue;
183
184 // pretty printing controls
185 private boolean prettyPrinting;
186
187 private int column;
188
189 private boolean noWrap;
190
191 private Stack<String> space = new Stack<String>();
192
193 // this is not a hard'n'fast rule -- longer lines are OK,
194 // but are to be avoided. Here, prettyprinting is more to
195 // show structure "cleanly" than to be precise about it.
196 // better to have ragged layout than one line 24Kb long.
197 private static final int lineLength = 75;
198
199 /**
200 * Constructs this handler with System.out used to write SAX events using
201 * the UTF-8 encoding. Avoid using this except when you know it's safe to
202 * close System.out at the end of the document.
203 */
public XMLWriter() throws IOException {
    // Standard output is handed to the OutputStream constructor, so it
    // is treated like any other stream passed in by a caller.
    this(System.out);
}
207
208 /**
209 * Constructs a handler which writes all input to the output stream in the
210 * UTF-8 encoding, and closes it when endDocument is called. (Yes it's
211 * annoying that this throws an exception -- but there's really no way
212 * around it, since it's barely possible a JDK may exist somewhere that
213 * doesn't know how to emit UTF-8.)
214 */
public XMLWriter(OutputStream out) throws IOException {
    // Wrap the byte stream in a character writer using the "UTF8" charset
    // name, then continue with the Writer-based constructor.
    this(new OutputStreamWriter(out, "UTF8"));
}
218
219 // doc edited -- 2005-02-11 hsivonen
220 /**
221 * Constructs a handler which writes all input to the writer, and then
222 * closes the writer when the document ends.
223 *
224 * <P>
225 * When an XML declaration is written by {@link #startDocument} it always
226 * names UTF-8, so the writer should encode its output as UTF-8.
227 *
228 * @param writer
229 * XML text is written to this writer.
230 */
public XMLWriter(Writer writer) {
    // Call to intermediate constructor removed -- 2005-02-11 hsivonen
    // All writer installation logic lives in setWriter().
    setWriter(writer);
}
235
236 // Removed constructor taking an encoding -- 2005-02-11 hsivonen
237 // Removed setter for encoding -- 2005-02-11 hsivonen
238
239 /**
240 * Resets the handler to write a new text document.
241 *
242 * @param writer
243 * XML text is written to this writer.
244 *
245 * @exception IllegalStateException
246 * if the current document hasn't yet ended (with
247 * {@link #endDocument})
248 */
249 final public void setWriter(Writer writer) {
250 // Removed encoding-related code -- 2005-02-11 hsivonen
251 if (out != null)
252 throw new IllegalStateException("can't change stream in mid course");
253 out = writer;
254 if (!(out instanceof BufferedWriter))
255 out = new BufferedWriter(out);
256 space.push("default");
257 }
258
259 // Removed setter for eol -- 2005-02-11 hsivonen
260
261 /**
262 * Assigns the error handler to be used to present most fatal errors.
263 */
264 public void setErrorHandler(ErrorHandler handler) {
265 errHandler = handler;
266 }
267
268 /**
269 * Used internally and by subclasses, this encapsulates the logic involved
270 * in reporting fatal errors. It uses locator information for good
271 * diagnostics, if available, and gives the application's ErrorHandler the
272 * opportunity to handle the error before throwing an exception.
273 */
274 protected void fatal(String message, Exception e) throws SAXException {
275 SAXParseException x;
276
277 if (locator == null)
278 x = new SAXParseException(message, null, null, -1, -1, e);
279 else
280 x = new SAXParseException(message, locator, e);
281 if (errHandler != null)
282 errHandler.fatalError(x);
283 throw x;
284 }
285
286 // JavaBeans properties
287 // JavaDoc comment modified to reflect encoding modifications -- 2005-02-11
288 // hsivonen
289 /**
290 * Controls whether the output should attempt to follow the "transitional"
291 * XHTML rules so that it meets the "HTML Compatibility Guidelines" appendix
292 * in the XHTML specification. XHTML empty
293 * elements are printed specially.
294 *
295 * <p>
296 * When this option is enabled, it is the caller's responsibility to ensure
297 * that the input is otherwise valid as XHTML. Things to be careful of in
298 * all cases, as described in the appendix referenced above, include:
299 * <ul>
300 *
301 * <li>Element and attribute names must be in lower case, both in the
302 * document and in any CSS style sheet.
303 * <li>The root element must be "html".
304 * <li>Elements that must be empty (such as <em>&lt;br&gt;</em>) must have
305 * no content.
306 * <li>Use both <em>lang</em> and <em>xml:lang</em> attributes when
307 * specifying language.
308 * <li>Similarly, use both <em>id</em> and <em>name</em> attributes
309 * when defining elements that may be referred to through URI fragment
310 * identifiers ... and make sure that the value is a legal NMTOKEN, since
311 * not all such HTML 4.0 identifiers are valid in XML.
312 * <li>Be careful with character encodings; make sure you provide a
313 * <em>&lt;meta http-equiv="Content-type"
314 * content="text/xml;charset=UTF-8" /&gt;</em>
315 * element in the HTML "head" element.
316 * </ul>
317 *
318 * <p>
319 * Additionally, some of the oldest browsers have additional quirks, to
320 * address with guidelines such as:
321 * <ul>
322 *
323 * <li>Processing instructions may be rendered, so avoid them. (Similarly
324 * for an XML declaration.)
325 * <li>Embedded style sheets and scripts should not contain XML markup
326 * delimiters: &amp;, &lt;, and ]]&gt; are trouble.
327 * <li>Attribute values should not have line breaks or multiple consecutive
328 * white space characters.
329 * <li>Use no more than one of the deprecated (transitional)
330 * <em>&lt;isindex&gt;</em> elements.
331 * <li>Some boolean attributes (such as <em>compact, checked,
332 * disabled, readonly, selected,</em>
333 * and more) confuse some browsers, since they only understand minimized
334 * versions which are illegal in XML.
335 * </ul>
336 *
337 * <p>
338 * Also, some characteristics of the resulting output may be a function of
339 * whether the document is later given a MIME content type of
340 * <em>text/html</em> rather than one indicating XML (
341 * <em>application/xml</em> or <em>text/xml</em>). Worse, some browsers
342 * ignore MIME content types and prefer to rely on URI name suffixes -- so an
343 * "index.xml" could always be XML, never XHTML, no matter its MIME type.
344 */
345 final public void setXhtml(boolean value) {
346 if (locator != null)
347 throw new IllegalStateException("started parsing");
348 xhtml = value;
349 if (xhtml)
350 canonical = false;
351 }
352
353 /**
354 * Returns true if the output attempts to echo the input following
355 * "transitional" XHTML rules and matching the "HTML Compatibility
356 * Guidelines" so that an HTML version 3 browser can read the output as
357 * HTML; returns false (the default) otherwise.
358 */
359 final public boolean isXhtml() {
360 return xhtml;
361 }
362
363 /**
364 * Controls whether the output text contains references to entities (the
365 * default), or instead contains the expanded values of those entities.
366 */
367 final public void setExpandingEntities(boolean value) {
368 if (locator != null)
369 throw new IllegalStateException("started parsing");
370 expandingEntities = value;
371 if (!expandingEntities)
372 canonical = false;
373 }
374
375 /**
376 * Returns true if the output will have no entity references; returns false
377 * (the default) otherwise.
378 */
379 final public boolean isExpandingEntities() {
380 return expandingEntities;
381 }
382
383 /**
384 * Controls pretty-printing, which by default is not enabled (and currently
385 * is most useful for XHTML output). Pretty printing enables structural
386 * indentation, sorting of attributes by name, line wrapping, and
387 * potentially other mechanisms for making output more or less readable.
388 *
389 * <p>
390 * At this writing, structural indentation and line wrapping are enabled
391 * when pretty printing is enabled and the <em>xml:space</em> attribute
392 * has the value <em>default</em> (its other legal value is
393 * <em>preserve</em>, as defined in the XML specification). The three
394 * XHTML element types which use another value are recognized by their names
395 * (namespaces are ignored).
396 *
397 * <p>
398 * Also, for the record, the "pretty" aspect of printing here is more to
399 * provide basic structure on outputs that would otherwise risk being a
400 * single long line of text. For now, expect the structure to be ragged ...
401 * unless you'd like to submit a patch to make this be more strictly
402 * formatted!
403 *
404 * @exception IllegalStateException
405 * thrown if this method is invoked after output has begun.
406 */
407 final public void setPrettyPrinting(boolean value) {
408 if (locator != null)
409 throw new IllegalStateException("started parsing");
410 prettyPrinting = value;
411 if (prettyPrinting)
412 canonical = false;
413 }
414
415 /**
416 * Returns value of flag controlling pretty printing.
417 */
418 final public boolean isPrettyPrinting() {
419 return prettyPrinting;
420 }
421
422 /**
423 * Sets the output style to be canonicalized. Input events must meet
424 * requirements that are slightly more stringent than the basic
425 * well-formedness ones, and include:
426 * <ul>
427 *
428 * <li>Namespace prefixes must not have been changed from those in the
429 * original document. (This may only be ensured by setting the SAX2
430 * XMLReader <em>namespace-prefixes</em> feature flag; by default, it is
431 * cleared.)
432 *
433 * <li>Redundant namespace declaration attributes have been removed. (If an
434 * ancestor element defines a namespace prefix and that declaration hasn't
435 * been overridden, an element must not redeclare it.)
436 *
437 * <li>If comments are not to be included in the canonical output, they
438 * must first be removed from the input event stream; this is
439 * <em>Canonical XML with comments</em> by default.
440 *
441 * <li>If the input character encoding was not UCS-based, the character
442 * data must have been normalized using Unicode Normalization Form C. (UTF-8
443 * and UTF-16 are UCS-based.)
444 *
445 * <li>Attribute values must have been normalized, as is done by any
446 * conformant XML processor which processes all external parameter entities.
447 *
448 * <li>Similarly, attribute value defaulting has been performed.
449 *
450 * </ul>
451 *
452 * <p>
453 * Note that fragments of XML documents, as specified by an XPath node set,
454 * may be canonicalized. In such cases, elements may need some fixup (for
455 * <em>xml:*</em> attributes and application-specific context).
456 *
457 * <p>This method does not verify the output encoding; supplying a writer
458 * that encodes as UTF-8 is the caller's responsibility.
459 */
460 final public void setCanonical(boolean value) {
461 // Removed encoding check -- 2005-02-11 hsivonen
462 canonical = value;
463 if (canonical) {
464 prettyPrinting = xhtml = false;
465 expandingEntities = true;
466 // Removed eol modification -- 2005-02-11 hsivonen
467 }
468 }
469
470 /**
471 * Returns value of flag controlling canonical output.
472 */
473 final public boolean isCanonical() {
474 return canonical;
475 }
476
477 /**
478 * Flushes the output stream. When this handler is used in long lived
479 * pipelines, it can be important to flush buffered state, for example so
480 * that it can reach the disk as part of a state checkpoint.
481 */
482 final public void flush() throws IOException {
483 if (out != null)
484 out.flush();
485 }
486
487 // convenience routines
488
489 // FIXME: probably want a subclass that holds a lot of these...
490 // and maybe more!
491
492 /**
493 * Writes the string as if characters() had been called on the contents of
494 * the string. This is particularly useful when applications act as
495 * producers and write data directly to event consumers.
496 */
497 final public void write(String data) throws SAXException {
498 char buf[] = data.toCharArray();
499 characters(buf, 0, buf.length);
500 }
501
502 /**
503 * Writes an element that has content consisting of a single string.
504 *
505 * @see #writeEmptyElement
506 * @see #startElement
507 */
508 public void writeElement(String uri, String localName, String qName,
509 Attributes atts, String content) throws SAXException {
510 if (content == null || content.length() == 0) {
511 writeEmptyElement(uri, localName, qName, atts);
512 return;
513 }
514 startElement(uri, localName, qName, atts);
515 char chars[] = content.toCharArray();
516 characters(chars, 0, chars.length);
517 endElement(uri, localName, qName);
518 }
519
520 /**
521 * Writes an element that has content consisting of a single integer,
522 * encoded as a decimal string.
523 *
524 * @see #writeEmptyElement
525 * @see #startElement
526 */
527 public void writeElement(String uri, String localName, String qName,
528 Attributes atts, int content) throws SAXException {
529 writeElement(uri, localName, qName, atts, Integer.toString(content));
530 }
531
532 // SAX1 ContentHandler
533 /** <b>SAX1 </b>: provides parser status information */
534 final public void setDocumentLocator(Locator l) {
535 locator = l;
536 }
537
538 // Removed transitional DTD URI -- 2005-02-11 hsivonen
539
540 /**
541 * <b>SAX1 </b>: indicates the beginning of a document parse. If you're
542 * writing (well formed) fragments of XML, neither this nor endDocument
543 * should be called.
544 */
545 // NOT final
546 public void startDocument() throws SAXException {
547 try {
548 if (out == null)
549 throw new IllegalStateException(
550 "null Writer given to XMLWriter");
551
552 // Not all parsers provide the locator we want; this also
553 // flags whether events are being sent to this object yet.
554 // We could only have this one call if we only printed whole
555 // documents ... but we also print fragments, so most of the
556 // callbacks here replicate this test.
557
558 if (locator == null)
559 locator = new LocatorImpl();
560
561 // Unless we're in the XHTML mode or we're canonicalizing, write
562 // the XML declaration.
563 // Hard-coded UTF-8 -- 2005-02-11 hsivonen
564 if (!canonical && !xhtml) {
565 rawWrite("<?xml version='1.0'");
566 rawWrite(" encoding='UTF-8'");
567 rawWrite("?>");
568 newline();
569 }
570
571 // Removed hard-coded Transitionl XHTML doctype -- 2005-02-11
572 // hsivonen
573
574 entityNestLevel = 0;
575
576 } catch (IOException e) {
577 fatal("can't write", e);
578 }
579 }
580
581 /**
582 * <b>SAX1 </b>: indicates the completion of a parse. Note that all complete
583 * SAX event streams make this call, even if an error is reported during a
584 * parse.
585 */
586 // NOT final
587 public void endDocument() throws SAXException {
588 try {
589 if (!canonical) {
590 newline();
591 newline();
592 }
593 out.close();
594 out = null;
595 locator = null;
596 } catch (IOException e) {
597 fatal("can't write", e);
598 }
599 }
600
601 // XHTML elements declared as EMPTY print differently
602 final private static boolean isEmptyElementTag(String tag) {
603 switch (tag.charAt(0)) {
604 case 'a':
605 return "area".equals(tag);
606 case 'b':
607 return "base".equals(tag) || "basefont".equals(tag)
608 || "br".equals(tag);
609 case 'c':
610 return "col".equals(tag);
611 case 'f':
612 return "frame".equals(tag);
613 case 'h':
614 return "hr".equals(tag);
615 case 'i':
616 return "img".equals(tag) || "input".equals(tag)
617 || "isindex".equals(tag);
618 case 'l':
619 return "link".equals(tag);
620 case 'm':
621 return "meta".equals(tag);
622 case 'p':
623 return "param".equals(tag);
624 }
625 return false;
626 }
627
628 private static boolean indentBefore(String tag) {
629 // basically indent before block content
630 // and within structure like tables, lists
631 switch (tag.charAt(0)) {
632 case 'a':
633 return "applet".equals(tag);
634 case 'b':
635 return "body".equals(tag) || "blockquote".equals(tag);
636 case 'c':
637 return "center".equals(tag);
638 case 'f':
639 return "frame".equals(tag) || "frameset".equals(tag);
640 case 'h':
641 return "head".equals(tag);
642 case 'm':
643 return "meta".equals(tag);
644 case 'o':
645 return "object".equals(tag);
646 case 'p':
647 return "param".equals(tag) || "pre".equals(tag);
648 case 's':
649 return "style".equals(tag);
650 case 't':
651 return "title".equals(tag) || "td".equals(tag)
652 || "th".equals(tag);
653 }
654 // ... but not inline elements like "em", "b", "font"
655 return false;
656 }
657
658 private static boolean spaceBefore(String tag) {
659 // blank line AND INDENT before certain structural content
660 switch (tag.charAt(0)) {
661 case 'h':
662 return "h1".equals(tag) || "h2".equals(tag) || "h3".equals(tag)
663 || "h4".equals(tag) || "h5".equals(tag)
664 || "h6".equals(tag) || "hr".equals(tag);
665 case 'l':
666 return "li".equals(tag);
667 case 'o':
668 return "ol".equals(tag);
669 case 'p':
670 return "p".equals(tag);
671 case 't':
672 return "table".equals(tag) || "tr".equals(tag);
673 case 'u':
674 return "ul".equals(tag);
675 }
676 return false;
677 }
678
679 // XHTML DTDs say these three have xml:space="preserve"
680 private static boolean spacePreserve(String tag) {
681 return "pre".equals(tag) || "style".equals(tag) || "script".equals(tag);
682 }
683
684 /**
685 * <b>SAX2 </b>: ignored.
686 */
687 final public void startPrefixMapping(String prefix, String uri) {
688 }
689
690 /**
691 * <b>SAX2 </b>: ignored.
692 */
693 final public void endPrefixMapping(String prefix) {
694 }
695
696 private void writeStartTag(String name, Attributes atts, boolean isEmpty)
697 throws SAXException, IOException {
698 rawWrite('<');
699 rawWrite(name);
700
701 // write out attributes ... sorting is particularly useful
702 // with output that's been heavily defaulted.
703 if (atts != null && atts.getLength() != 0) {
704
705 // Set up to write, with optional sorting
706 int indices[] = new int[atts.getLength()];
707
708 for (int i = 0; i < indices.length; i++)
709 indices[i] = i;
710
711 // optionally sort
712
713 // FIXME: canon xml demands xmlns nodes go first,
714 // and sorting by URI first (empty first) then localname
715 // it should maybe use a different sort
716
717 if (canonical || prettyPrinting) {
718
719 // insertion sort by attribute name
720 for (int i = 1; i < indices.length; i++) {
721 int n = indices[i], j;
722 String s = atts.getQName(n);
723
724 for (j = i - 1; j >= 0; j--) {
725 if (s.compareTo(atts.getQName(indices[j])) >= 0)
726 break;
727 indices[j + 1] = indices[j];
728 }
729 indices[j + 1] = n;
730 }
731 }
732
733 // write, sorted or no
734 for (int i = 0; i < indices.length; i++) {
735 String s = atts.getQName(indices[i]);
736
737 if (s == null || "".equals(s))
738 throw new IllegalArgumentException("no XML name");
739 rawWrite(" ");
740 rawWrite(s);
741 rawWrite("=");
742 writeQuotedValue(atts.getValue(indices[i]), CTX_ATTRIBUTE);
743 }
744 }
745 if (isEmpty)
746 rawWrite(" /");
747 rawWrite('>');
748 }
749
750 /**
751 * <b>SAX2 </b>: indicates the start of an element. When XHTML is in use,
752 * avoid attribute values with line breaks or multiple whitespace
753 * characters, since not all user agents handle them correctly.
754 */
755 final public void startElement(String uri, String localName, String qName,
756 Attributes atts) throws SAXException {
757 startedDoctype = false;
758
759 if (locator == null)
760 locator = new LocatorImpl();
761
762 if (qName == null || "".equals(qName))
763 throw new IllegalArgumentException("no XML name");
764
765 try {
766 if (entityNestLevel != 0)
767 return;
768 if (prettyPrinting) {
769 String whitespace = null;
770
771 if (xhtml && spacePreserve(qName))
772 whitespace = "preserve";
773 else if (atts != null)
774 whitespace = atts.getValue("xml:space");
775 if (whitespace == null)
776 whitespace = space.peek();
777 space.push(whitespace);
778
779 if ("default".equals(whitespace)) {
780 if (xhtml) {
781 if (spaceBefore(qName)) {
782 newline();
783 doIndent();
784 } else if (indentBefore(qName))
785 doIndent();
786 // else it's inlined, modulo line length
787 // FIXME: incrementing element nest level
788 // for inlined elements causes ugliness
789 } else
790 doIndent();
791 }
792 }
793 elementNestLevel++;
794 writeStartTag(qName, atts, xhtml && isEmptyElementTag(qName));
795
796 if (xhtml) {
797 // FIXME: if this is an XHTML "pre" element, turn
798 // off automatic wrapping.
799 }
800
801 } catch (IOException e) {
802 fatal("can't write", e);
803 }
804 }
805
806 /**
807 * Writes an empty element.
808 *
809 * @see #startElement
810 */
811 public void writeEmptyElement(String uri, String localName, String qName,
812 Attributes atts) throws SAXException {
813 if (canonical) {
814 startElement(uri, localName, qName, atts);
815 endElement(uri, localName, qName);
816 } else {
817 try {
818 writeStartTag(qName, atts, true);
819 } catch (IOException e) {
820 fatal("can't write", e);
821 }
822 }
823 }
824
825 /** <b>SAX2 </b>: indicates the end of an element */
826 final public void endElement(String uri, String localName, String qName)
827 throws SAXException {
828 if (qName == null || "".equals(qName))
829 throw new IllegalArgumentException("no XML name");
830
831 try {
832 elementNestLevel--;
833 if (entityNestLevel != 0)
834 return;
835 if (xhtml && isEmptyElementTag(qName))
836 return;
837 rawWrite("</");
838 rawWrite(qName);
839 rawWrite('>');
840
841 if (prettyPrinting) {
842 if (!space.empty())
843 space.pop();
844 else
845 fatal("stack discipline", null);
846 }
847 if (elementNestLevel == 0)
848 inEpilogue = true;
849
850 } catch (IOException e) {
851 fatal("can't write", e);
852 }
853 }
854
855 /** <b>SAX1 </b>: reports content characters */
856 final public void characters(char ch[], int start, int length)
857 throws SAXException {
858 if (locator == null)
859 locator = new LocatorImpl();
860
861
862 try {
863 if (entityNestLevel != 0)
864 return;
865 if (inCDATA) {
866 escapeChars(ch, start, length, CTX_UNPARSED);
867 } else {
868 escapeChars(ch, start, length, CTX_CONTENT);
869 }
870 } catch (IOException e) {
871 fatal("can't write", e);
872 }
873
874 }
875
876 /** <b>SAX1 </b>: reports ignorable whitespace */
877 final public void ignorableWhitespace(char ch[], int start, int length)
878 throws SAXException {
879 if (locator == null)
880 locator = new LocatorImpl();
881
882 try {
883 if (entityNestLevel != 0)
884 return;
885 // don't forget to map NL to CRLF, CR, etc
886 escapeChars(ch, start, length, CTX_CONTENT);
887 } catch (IOException e) {
888 fatal("can't write", e);
889 }
890 }
891
892 /**
893 * <b>SAX1 </b>: reports a PI. This doesn't check for illegal target names,
894 * such as "xml" or "XML", or namespace-incompatible ones like "big:dog";
895 * the caller is responsible for ensuring those names are legal.
896 */
897 final public void processingInstruction(String target, String data)
898 throws SAXException {
899 if (locator == null)
900 locator = new LocatorImpl();
901
902 // don't print internal subset for XHTML
903 if (xhtml && startedDoctype)
904 return;
905
906 // ancient HTML browsers might render these ... their loss.
907 // to prevent: "if (xhtml) return;".
908
909 try {
910 if (entityNestLevel != 0)
911 return;
912 if (canonical && inEpilogue)
913 newline();
914 rawWrite("<?");
915 rawWrite(target);
916 rawWrite(' ');
917 escapeChars(data.toCharArray(), -1, -1, CTX_UNPARSED);
918 rawWrite("?>");
919 if (elementNestLevel == 0 && !(canonical && inEpilogue))
920 newline();
921 } catch (IOException e) {
922 fatal("can't write", e);
923 }
924 }
925
926 /** <b>SAX1 </b>: indicates a non-expanded entity reference */
927 public void skippedEntity(String name) throws SAXException {
928 try {
929 rawWrite("&");
930 rawWrite(name);
931 rawWrite(";");
932 } catch (IOException e) {
933 fatal("can't write", e);
934 }
935 }
936
937 // SAX2 LexicalHandler
938
939 /** <b>SAX2 </b>: called before parsing CDATA characters */
940 final public void startCDATA() throws SAXException {
941 if (locator == null)
942 locator = new LocatorImpl();
943
944 if (canonical || xhtml) // added xhtml check -- 2005-02-12 hsivonen
945 return;
946
947 try {
948 inCDATA = true;
949 if (entityNestLevel == 0)
950 rawWrite("<![CDATA[");
951 } catch (IOException e) {
952 fatal("can't write", e);
953 }
954 }
955
956 /** <b>SAX2 </b>: called after parsing CDATA characters */
957 final public void endCDATA() throws SAXException {
958 if (canonical || xhtml) // added xhtml check -- 2005-02-12 hsivonen
959 return;
960
961 try {
962 inCDATA = false;
963 if (entityNestLevel == 0)
964 rawWrite("]]>");
965 } catch (IOException e) {
966 fatal("can't write", e);
967 }
968 }
969
970 /**
971 * <b>SAX2 </b>: called when the doctype is partially parsed. In XHTML mode the
972 * declaration is written without an internal subset; in canonical mode it is omitted.
973 */
974 final public void startDTD(String name, String publicId, String systemId)
975 throws SAXException {
976 if (locator == null)
977 locator = new LocatorImpl();
978 // Removed xhtml check -- 2005-02-11 hsivonen
979 try {
980 inDoctype = startedDoctype = true;
981 if (canonical)
982 return;
983 rawWrite("<!DOCTYPE ");
984 rawWrite(name);
985 rawWrite(' ');
986
987 if (!expandingEntities) {
988 // use double quotes -- 2005-02-12 hsivonen
989 if (publicId != null)
990 rawWrite("PUBLIC \"" + publicId + "\" \"" + systemId + "\"");
991 else if (systemId != null)
992 rawWrite("SYSTEM \"" + systemId + "\"");
993 }
994 // Added xhtml check -- 2005-02-11 hsivonen
995 if (!xhtml) {
996 rawWrite(" [");
997 newline();
998 }
999 } catch (IOException e) {
1000 fatal("can't write", e);
1001 }
1002 }
1003
1004 /** <b>SAX2 </b>: called after the doctype is parsed */
1005 final public void endDTD() throws SAXException {
1006 inDoctype = false;
1007 // Removed xhtml check -- 2005-02-11 hsivonen
1008 if (canonical)
1009 return;
1010 try {
1011 // Added xhtml check -- 2005-02-11 hsivonen
1012 if (!xhtml)
1013 rawWrite("]");
1014 rawWrite(">");
1015 newline();
1016 } catch (IOException e) {
1017 fatal("can't write", e);
1018 }
1019 }
1020
1021 /**
1022 * <b>SAX2 </b>: called before parsing a general entity in content
1023 */
1024 final public void startEntity(String name) throws SAXException {
1025 try {
1026 boolean writeEOL = true;
1027
1028 // Predefined XHTML entities (for characters) will get
1029 // mapped back later.
1030 if (xhtml || expandingEntities)
1031 return;
1032
1033 entityNestLevel++;
1034 if (name.equals("[dtd]"))
1035 return;
1036 if (entityNestLevel != 1)
1037 return;
1038 if (!name.startsWith("%")) {
1039 writeEOL = false;
1040 rawWrite('&');
1041 }
1042 rawWrite(name);
1043 rawWrite(';');
1044 if (writeEOL)
1045 newline();
1046 } catch (IOException e) {
1047 fatal("can't write", e);
1048 }
1049 }
1050
1051 /**
1052 * <b>SAX2 </b>: called after parsing a general entity in content
1053 */
1054 final public void endEntity(String name) throws SAXException {
1055 if (xhtml || expandingEntities)
1056 return;
1057 entityNestLevel--;
1058 }
1059
1060 /**
1061 * <b>SAX2 </b>: called when comments are parsed. When XHTML is used, the
1062 * old HTML tradition of using comments to for inline CSS, or for JavaScript
1063 * code is discouraged. This is because XML processors are encouraged to
1064 * discard, on the grounds that comments are for users (and perhaps text
1065 * editors) not programs. Instead, use external scripts
1066 */
1067 final public void comment(char ch[], int start, int length)
1068 throws SAXException {
1069 if (locator == null)
1070 locator = new LocatorImpl();
1071
1072 // don't print internal subset for XHTML
1073 if (xhtml && inDoctype) // changed check to match canon -- 2005-02-11
1074 // hsivonen
1075 return;
1076 // don't print comment in doctype for canon xml
1077 if (canonical && inDoctype)
1078 return;
1079
1080 try {
1081 boolean indent;
1082
1083 if (prettyPrinting && space.empty())
1084 fatal("stack discipline", null);
1085 indent = prettyPrinting && "default".equals(space.peek());
1086 if (entityNestLevel != 0)
1087 return;
1088 if (indent)
1089 doIndent();
1090 if (canonical && inEpilogue)
1091 newline();
1092 rawWrite("<!--");
1093 escapeChars(ch, start, length, CTX_UNPARSED);
1094 rawWrite("-->");
1095 if (indent)
1096 doIndent();
1097 if (elementNestLevel == 0 && !(canonical && inEpilogue))
1098 newline();
1099 } catch (IOException e) {
1100 fatal("can't write", e);
1101 }
1102 }
1103
1104 // SAX1 DTDHandler
1105
1106 /** <b>SAX1 </b>: called on notation declarations */
1107 final public void notationDecl(String name, String publicId, String systemId)
1108 throws SAXException {
1109 if (xhtml)
1110 return;
1111 try {
1112 // At this time, only SAX2 callbacks start these.
1113 if (!startedDoctype)
1114 return;
1115
1116 if (entityNestLevel != 0)
1117 return;
1118 rawWrite("<!NOTATION " + name + " ");
1119 if (publicId != null)
1120 rawWrite("PUBLIC \"" + publicId + '"');
1121 else
1122 rawWrite("SYSTEM ");
1123 if (systemId != null)
1124 rawWrite('"' + systemId + '"');
1125 rawWrite(">");
1126 newline();
1127 } catch (IOException e) {
1128 fatal("can't write", e);
1129 }
1130 }
1131
1132 /** <b>SAX1 </b>: called on unparsed entity declarations */
1133 final public void unparsedEntityDecl(String name, String publicId,
1134 String systemId, String notationName) throws SAXException {
1135 if (xhtml)
1136 return;
1137 try {
1138 // At this time, only SAX2 callbacks start these.
1139 if (!startedDoctype) {
1140 // FIXME: write to temporary buffer, and make the start
1141 // of the root element write these declarations.
1142 return;
1143 }
1144
1145 if (entityNestLevel != 0)
1146 return;
1147 rawWrite("<!ENTITY " + name + " ");
1148 if (publicId != null)
1149 rawWrite("PUBLIC \"" + publicId + '"');
1150 else
1151 rawWrite("SYSTEM ");
1152 rawWrite('"' + systemId + '"');
1153 rawWrite(" NDATA " + notationName + ">");
1154 newline();
1155 } catch (IOException e) {
1156 fatal("can't write", e);
1157 }
1158 }
1159
1160 // SAX2 DeclHandler
1161
1162 /** <b>SAX2 </b>: called on attribute declarations */
1163 final public void attributeDecl(String eName, String aName, String type,
1164 String mode, String value) throws SAXException {
1165 if (xhtml)
1166 return;
1167 try {
1168 // At this time, only SAX2 callbacks start these.
1169 if (!startedDoctype)
1170 return;
1171 if (entityNestLevel != 0)
1172 return;
1173 rawWrite("<!ATTLIST " + eName + ' ' + aName + ' ');
1174 rawWrite(type);
1175 rawWrite(' ');
1176 if (mode != null)
1177 rawWrite(mode + ' ');
1178 if (value != null)
1179 writeQuotedValue(value, CTX_ATTRIBUTE);
1180 rawWrite('>');
1181 newline();
1182 } catch (IOException e) {
1183 fatal("can't write", e);
1184 }
1185 }
1186
1187 /** <b>SAX2 </b>: called on element declarations */
1188 final public void elementDecl(String name, String model)
1189 throws SAXException {
1190 if (xhtml)
1191 return;
1192 try {
1193 // At this time, only SAX2 callbacks start these.
1194 if (!startedDoctype)
1195 return;
1196 if (entityNestLevel != 0)
1197 return;
1198 rawWrite("<!ELEMENT " + name + ' ' + model + '>');
1199 newline();
1200 } catch (IOException e) {
1201 fatal("can't write", e);
1202 }
1203 }
1204
1205 /** <b>SAX2 </b>: called on external entity declarations */
1206 final public void externalEntityDecl(String name, String publicId,
1207 String systemId) throws SAXException {
1208 if (xhtml)
1209 return;
1210 try {
1211 // At this time, only SAX2 callbacks start these.
1212 if (!startedDoctype)
1213 return;
1214 if (entityNestLevel != 0)
1215 return;
1216 rawWrite("<!ENTITY ");
1217 if (name.startsWith("%")) {
1218 rawWrite("% ");
1219 rawWrite(name.substring(1));
1220 } else
1221 rawWrite(name);
1222 if (publicId != null)
1223 rawWrite(" PUBLIC \"" + publicId + '"');
1224 else
1225 rawWrite(" SYSTEM ");
1226 rawWrite('"' + systemId + "\">");
1227 newline();
1228 } catch (IOException e) {
1229 fatal("can't write", e);
1230 }
1231 }
1232
1233 /** <b>SAX2 </b>: called on internal entity declarations */
1234 final public void internalEntityDecl(String name, String value)
1235 throws SAXException {
1236 if (xhtml)
1237 return;
1238 try {
1239 // At this time, only SAX2 callbacks start these.
1240 if (!startedDoctype)
1241 return;
1242 if (entityNestLevel != 0)
1243 return;
1244 rawWrite("<!ENTITY ");
1245 if (name.startsWith("%")) {
1246 rawWrite("% ");
1247 rawWrite(name.substring(1));
1248 } else
1249 rawWrite(name);
1250 rawWrite(' ');
1251 writeQuotedValue(value, CTX_ENTITY);
1252 rawWrite('>');
1253 newline();
1254 } catch (IOException e) {
1255 fatal("can't write", e);
1256 }
1257 }
1258
1259 // added xmlDecl() -- 2005-03-02 hsivonen
1260 /**
1261 * @see fi.karppinen.xml.XmlDeclarationHandler#xmlDecl(java.lang.String, java.lang.String, java.lang.String)
1262 */
1263 public void xmlDecl(String version, String encoding, String standalone)
1264 throws SAXException {
1265 try {
1266 if (xhtml) {
1267 rawWrite("<?xml version='1.0'");
1268 rawWrite(" encoding='UTF-8'");
1269 rawWrite("?>");
1270 newline();
1271 }
1272 } catch (IOException e) {
1273 fatal("can't write", e);
1274 }
1275 }
1276
1277 private void writeQuotedValue(String value, int code) throws SAXException,
1278 IOException {
1279 char buf[] = value.toCharArray();
1280 int off = 0, len = buf.length;
1281
1282 // we can't add line breaks to attribute/entity/... values
1283 noWrap = true;
1284 rawWrite('"');
1285 escapeChars(buf, off, len, code);
1286 rawWrite('"');
1287 noWrap = false;
1288 }
1289
1290 // removed entity tables -- 2005-02-11 hsivonen
1291
1292 // General routine to write text and substitute predefined
1293 // entities (XML, and a special case for XHTML) as needed.
1294 private void escapeChars(char buf[], int off, int len, int code)
1295 throws SAXException, IOException {
1296 int first = 0;
1297
1298 if (off < 0) {
1299 off = 0;
1300 len = buf.length;
1301 }
1302 for (int i = 0; i < len; i++) {
1303 String esc = null; // init variable -- 2005-02-11 hsivonen
1304 char c = buf[off + i];
1305
1306 switch (c) {
1307 // Note that CTX_ATTRIBUTE isn't explicitly tested here;
1308 // all syntax delimiters are escaped in CTX_ATTRIBUTE,
1309 // otherwise it's similar to CTX_CONTENT
1310
1311 // ampersand flags entity references; entity replacement
1312 // text has unexpanded references, other text doesn't.
1313 case '&':
1314 if (code == CTX_ENTITY || code == CTX_UNPARSED)
1315 continue;
1316 esc = "amp";
1317 break;
1318
1319 // attributes and text may NOT have literal '<', but
1320 // entities may have markup constructs
1321 case '<':
1322 if (code == CTX_ENTITY || code == CTX_UNPARSED)
1323 continue;
1324 esc = "lt";
1325 break;
1326
1327 // as above re markup constructs; but otherwise
1328 // except when canonicalizing, this is for consistency
1329 case '>':
1330 if (code == CTX_ENTITY || code == CTX_UNPARSED)
1331 continue;
1332 esc = "gt";
1333 break;
1334 case '\'':
1335 if (code == CTX_CONTENT || code == CTX_UNPARSED)
1336 continue;
1337 if (canonical || xhtml) // added xhtml check -- 2005-02-11 hsivonen
1338 continue;
1339 esc = "apos";
1340 break;
1341
1342 // needed when printing quoted attribute/entity values
1343 case '"':
1344 if (code == CTX_CONTENT || code == CTX_UNPARSED)
1345 continue;
1346 esc = "quot";
1347 break;
1348
1349 // make line ends work per host OS convention
1350 case '\n':
1351 esc = eol;
1352 break;
1353
1354 // removed obsolete comments -- 2005-02-11 hsivonen
1355
1356 default:
1357 //
1358 // There are characters we can never write safely; getting
1359 // them is an error.
1360 //
1361 // (a) They're never legal in XML ... detected by range
1362 // checks, and (eventually) by remerging surrogate
1363 // pairs on output. (Easy error for apps to prevent.)
1364 //
1365
1366 // removed obsolete comments -- 2005-02-11 hsivonen
1367
1368 // FIXME: CR in CDATA is an error; in text, turn to a char
1369 // ref
1370
1371 // FIXME: CR/LF/TAB in attributes should become char refs
1372
1373 if ((c > 0xfffd)
1374 || ((c < 0x0020) && !((c == 0x0009)
1375 || (c == 0x000A) || (c == 0x000D)))) {
1376
1377 // removed danger mask check -- 2005-02-11 hsivonen
1378 throw new CharConversionException(
1379 "Illegal or non-writable character: U+"
1380 + Integer.toHexString(c));
1381 }
1382 continue;
1383 // removed entity handling -- 2005-02-12 hsivonen
1384 }
1385
1386 if (i != first)
1387 rawWrite(buf, off + first, i - first);
1388 first = i + 1;
1389 if (esc == eol)
1390 newline();
1391 else {
1392 rawWrite('&');
1393 rawWrite(esc);
1394 rawWrite(';');
1395 }
1396 }
1397 if (first < len)
1398 rawWrite(buf, off + first, len - first);
1399 }
1400
1401 private void newline() throws SAXException, IOException {
1402 out.write(eol);
1403 column = 0;
1404 }
1405
1406 private void doIndent() throws SAXException, IOException {
1407 int space = elementNestLevel * 2;
1408
1409 newline();
1410 column = space;
1411 // track tabs only at line starts
1412 while (space > 8) {
1413 out.write("\t");
1414 space -= 8;
1415 }
1416 while (space > 0) {
1417 out.write(" ");
1418 space -= 2;
1419 }
1420 }
1421
1422 private void rawWrite(char c) throws IOException {
1423 out.write(c);
1424 column++;
1425 }
1426
1427 private void rawWrite(String s) throws SAXException, IOException {
1428 if (prettyPrinting && "default".equals(space.peek())) {
1429 char data[] = s.toCharArray();
1430 rawWrite(data, 0, data.length);
1431 } else {
1432 out.write(s);
1433 column += s.length();
1434 }
1435 }
1436
1437 // NOTE: if xhtml, the REC gives some rules about whitespace
1438 // which we could follow ... notably, many places where conformant
1439 // agents "must" consolidate/normalize whitespace. Line ends can
1440 // be removed there, etc. This may not be the right place to do
1441 // such mappings though.
1442
1443 // Line buffering may help clarify algorithms and improve results.
1444
1445 // It's likely xml:space needs more attention.
1446
1447 private void rawWrite(char buf[], int offset, int length)
1448 throws SAXException, IOException {
1449 boolean wrap;
1450
1451 if (prettyPrinting && space.empty())
1452 fatal("stack discipline", null);
1453
1454 wrap = prettyPrinting && "default".equals(space.peek());
1455 if (!wrap) {
1456 out.write(buf, offset, length);
1457 column += length;
1458 return;
1459 }
1460
1461 // we're pretty printing and want to fill lines out only
1462 // to the desired line length.
1463 while (length > 0) {
1464 int target = lineLength - column;
1465 boolean wrote = false;
1466
1467 // Do we even have a problem?
1468 if (target > length || noWrap) {
1469 out.write(buf, offset, length);
1470 column += length;
1471 return;
1472 }
1473
1474 // break the line at a space character, trying to fill
1475 // as much of the line as possible.
1476 char c;
1477
1478 for (int i = target - 1; i >= 0; i--) {
1479 if ((c = buf[offset + i]) == ' ' || c == '\t') {
1480 i++;
1481 out.write(buf, offset, i);
1482 doIndent();
1483 offset += i;
1484 length -= i;
1485 wrote = true;
1486 break;
1487 }
1488 }
1489 if (wrote)
1490 continue;
1491
1492 // no space character permitting break before target
1493 // line length is filled. So, take the next one.
1494 if (target < 0)
1495 target = 0;
1496 for (int i = target; i < length; i++)
1497 if ((c = buf[offset + i]) == ' ' || c == '\t') {
1498 i++;
1499 out.write(buf, offset, i);
1500 doIndent();
1501 offset += i;
1502 length -= i;
1503 wrote = true;
1504 break;
1505 }
1506 if (wrote)
1507 continue;
1508
1509 // no such luck.
1510 out.write(buf, offset, length);
1511 column += length;
1512 break;
1513 }
1514 }
1515 }