001 /* 002 * XMLWriter.java 003 * Copyright (C) 1999,2000,2001 The Free Software Foundation 004 * Portions Copyright 2005 Marko Karppinen & Co. LLC 005 * 006 * This file is part of GNU JAXP, a library. 007 * This version has been modified from the original GNU JAXP distribution 008 * on 2005-02-11, 2005-02-12 and 2005-03-02 by Henri Sivonen working as an 009 * employee of Marko Karppinen & Co. LLC. 010 * 011 * GNU JAXP is free software; you can redistribute it and/or modify 012 * it under the terms of the GNU General Public License as published by 013 * the Free Software Foundation; either version 2 of the License, or 014 * (at your option) any later version. 015 * 016 * GNU JAXP is distributed in the hope that it will be useful, 017 * but WITHOUT ANY WARRANTY; without even the implied warranty of 018 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 019 * GNU General Public License for more details. 020 * 021 * You should have received a copy of the GNU General Public License 022 * along with this program; if not, write to the Free Software 023 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 024 * 025 * Linking this library statically or dynamically with other modules is 026 * making a combined work based on this library. Thus, the terms and 027 * conditions of the GNU General Public License cover the whole 028 * combination. 029 * 030 * As a special exception, the copyright holders of this library give you 031 * permission to link this library with independent modules to produce an 032 * executable, regardless of the license terms of these independent 033 * modules, and to copy and distribute the resulting executable under 034 * terms of your choice, provided that you also meet, for each linked 035 * independent module, the terms and conditions of the license of that 036 * module. An independent module is a module which is not derived from 037 * or based on this library. 
If you modify this library, you may extend 038 * this exception to your version of the library, but you are not 039 * obliged to do so. If you do not wish to do so, delete this 040 * exception statement from your version. 041 */ 042 043 // Package renamed -- 2005-02-11 hsivonen 044 package fi.karppinen.gnu.xml.util; 045 046 import java.io.BufferedWriter; 047 import java.io.CharConversionException; 048 import java.io.IOException; 049 import java.io.OutputStream; 050 import java.io.OutputStreamWriter; 051 import java.io.Writer; 052 import java.util.Stack; 053 054 import org.xml.sax.Attributes; 055 import org.xml.sax.ContentHandler; 056 import org.xml.sax.DTDHandler; 057 import org.xml.sax.ErrorHandler; 058 import org.xml.sax.Locator; 059 import org.xml.sax.SAXException; 060 import org.xml.sax.SAXParseException; 061 import org.xml.sax.ext.DeclHandler; 062 import org.xml.sax.ext.LexicalHandler; 063 import org.xml.sax.helpers.LocatorImpl; 064 065 import fi.karppinen.xml.XmlDeclarationHandler; 066 067 // doc edited -- 2005-02-11 hsivonen 068 /** 069 * This class is a SAX handler which writes all its input as a well formed XML 070 * or XHTML document. If driven using SAX2 events, this output may include a 071 * recreated document type declaration, subject to limitations of SAX (no 072 * internal subset exposed) or DOM (the important declarations, with their 073 * documentation, are discarded). 074 * 075 * <p> 076 * By default, text is generated "as-is", but some optional modes are supported. 077 * Pretty-printing is supported, to make life easier for people reading the 078 * output. XHTML (1.0) output has can be made particularly pretty. Canonical XML 079 * can also be generated, assuming the input is properly formed. 080 * 081 * <hr> 082 * 083 * <p> 084 * Some of the methods on this class are intended for applications to use 085 * directly, rather than as pure SAX2 event callbacks. 
Some of those methods 086 * access the JavaBeans properties (used to tweak output formats, for example 087 * canonicalization and pretty printing). Subclasses are expected to add new 088 * behaviors, not to modify current behavior, so many such methods are final. 089 * </p> 090 * 091 * <p> 092 * The <em>write*()</em> methods may be slightly simpler for some applications 093 * to use than direct callbacks. For example, they support a simple policy for 094 * encoding data items as the content of a single element. 095 * 096 * <p> 097 * To reuse an XMLWriter you must provide it with a new Writer, since this 098 * handler closes the writer it was given as part of its endDocument() handling. 099 * (XML documents have an end of input, and the way to encode that on a stream 100 * is to close it.) 101 * </p> 102 * 103 * <hr> 104 * 105 * <p> 106 * Note that any relative URIs in the source document, as found in entity and 107 * notation declarations, ought to have been fully resolved by the parser 108 * providing events to this handler. This means that the output text should only 109 * have fully resolved URIs, which may not be the desired behavior in cases 110 * where later binding is desired. 111 * </p> 112 * 113 * <p> 114 * <em>Note that due to SAX2 defaults, you may need to manually 115 * ensure that the input events are XML-conformant with respect to namespace 116 * prefixes and declarations. {@link gnu.xml.pipeline.NSFilter} is 117 * one solution to this problem, in the context of processing pipelines.</em> 118 * Something as simple as connecting this handler to a parser might not generate 119 * the correct output. Another workaround is to ensure that the 120 * <em>namespace-prefixes</em> feature is always set to true, if you're 121 * hooking this directly up to some XMLReader implementation. 
*
 * @see fi.karppinen.gnu.xml.pipeline.TextConsumer
 *
 * @author David Brownell
 * @author Henri Sivonen
 */
public class XMLWriter implements ContentHandler, LexicalHandler, DTDHandler,
        DeclHandler, XmlDeclarationHandler {
    // added XmlDeclarationHandler -- 2005-03-02 hsivonen

    // text prints/escapes differently depending on context
    // CTX_ENTITY ... entity literal value
    // CTX_ATTRIBUTE ... attribute literal value
    // CTX_CONTENT ... content of an element
    // CTX_UNPARSED ... CDATA, comment, PI, names, etc
    // CTX_NAME ... name or nmtoken, no escapes possible
    private static final int CTX_ENTITY = 1;

    private static final int CTX_ATTRIBUTE = 2;

    private static final int CTX_CONTENT = 3;

    private static final int CTX_UNPARSED = 4;

    private static final int CTX_NAME = 5;

    // Removed obsolete comment -- 2005-02-11 hsivonen

    // Destination of all output; installed by setWriter(), closed and set
    // back to null by endDocument().
    private Writer out;

    // True between startCDATA() and endCDATA() (never set in canonical or
    // XHTML mode); characters() then escapes with CTX_UNPARSED.
    private boolean inCDATA;

    // Depth of currently open elements: incremented by startElement(),
    // decremented by endElement().
    private int elementNestLevel;

    private final static String eol = "\n";

    // Made eol independent of the underlying platform -- 2005-02-11 hsivonen

    // Removed dangerMask -- 2005-02-11 hsivonen

    // NOTE(review): not referenced in the part of the file reviewed here;
    // presumably scratch space for the escaping code -- confirm.
    private StringBuilder stringBuf;

    // Position information for diagnostics. Also doubles as the "events
    // have started" flag: the property setters refuse to run once this is
    // non-null, and most callbacks install a LocatorImpl if it is still null.
    private Locator locator;

    // Optional application handler that fatal() notifies before throwing.
    private ErrorHandler errHandler;

    private boolean expandingEntities = false;

    // Depth of entities being skipped: startEntity() increments it and
    // endEntity() decrements it (only when neither xhtml nor
    // expandingEntities is set). Most callbacks write nothing while it is
    // non-zero.
    private int entityNestLevel;

    private boolean xhtml;

    // Set by startDTD(), cleared by the next startElement().
    private boolean startedDoctype;

    // Removed encoding -- 2005-02-11 hsivonen

    private boolean canonical;

    // True between startDTD() and endDTD().
    private boolean inDoctype;

    // Set once endElement() brings elementNestLevel back to zero, i.e. the
    // root element has been closed.
    private boolean inEpilogue;

    // pretty printing controls
    private boolean prettyPrinting;

    // NOTE(review): column and noWrap are not touched in the part of the
    // file reviewed here; presumably maintained by the line-wrapping code.
    private int column;

    private boolean noWrap;

    // Stack of effective xml:space modes ("default" / "preserve"), one entry
    // per open element plus the initial "default" pushed by setWriter().
    private Stack<String> space = new Stack<String>();

    // this is not a hard'n'fast rule -- longer lines are OK,
    // but are to be avoided.
Here, prettyprinting is more to 195 // show structure "cleanly" than to be precise about it. 196 // better to have ragged layout than one line 24Kb long. 197 private static final int lineLength = 75; 198 199 /** 200 * Constructs this handler with System.out used to write SAX events using 201 * the UTF-8 encoding. Avoid using this except when you know it's safe to 202 * close System.out at the end of the document. 203 */ 204 public XMLWriter() throws IOException { 205 this(System.out); 206 } 207 208 /** 209 * Constructs a handler which writes all input to the output stream in the 210 * UTF-8 encoding, and closes it when endDocument is called. (Yes it's 211 * annoying that this throws an exception -- but there's really no way 212 * around it, since it's barely possible a JDK may exist somewhere that 213 * doesn't know how to emit UTF-8.) 214 */ 215 public XMLWriter(OutputStream out) throws IOException { 216 this(new OutputStreamWriter(out, "UTF8")); 217 } 218 219 // doc edited -- 2005-02-11 hsivonen 220 /** 221 * Constructs a handler which writes all input to the writer, and then 222 * closes the writer when the document ends. 223 * 224 * <P> 225 * See the description of the constructor which takes an encoding name for 226 * imporant information about selection of encodings. 227 * 228 * @param writer 229 * XML text is written to this writer. 230 */ 231 public XMLWriter(Writer writer) { 232 // Call to intermediate constructor removed -- 2005-02-11 hsivonen 233 this.setWriter(writer); 234 } 235 236 // Removed constructor taking an encoding -- 2005-02-11 hsivonen 237 // Removed setter for encoding -- 2005-02-11 hsivonen 238 239 /** 240 * Resets the handler to write a new text document. 241 * 242 * @param writer 243 * XML text is written to this writer. 
244 * 245 * @exception IllegalStateException 246 * if the current document hasn't yet ended (with 247 * {@link #endDocument}) 248 */ 249 final public void setWriter(Writer writer) { 250 // Removed encoding-related code -- 2005-02-11 hsivonen 251 if (out != null) 252 throw new IllegalStateException("can't change stream in mid course"); 253 out = writer; 254 if (!(out instanceof BufferedWriter)) 255 out = new BufferedWriter(out); 256 space.push("default"); 257 } 258 259 // Removed setter for eol -- 2005-02-11 hsivonen 260 261 /** 262 * Assigns the error handler to be used to present most fatal errors. 263 */ 264 public void setErrorHandler(ErrorHandler handler) { 265 errHandler = handler; 266 } 267 268 /** 269 * Used internally and by subclasses, this encapsulates the logic involved 270 * in reporting fatal errors. It uses locator information for good 271 * diagnostics, if available, and gives the application's ErrorHandler the 272 * opportunity to handle the error before throwing an exception. 273 */ 274 protected void fatal(String message, Exception e) throws SAXException { 275 SAXParseException x; 276 277 if (locator == null) 278 x = new SAXParseException(message, null, null, -1, -1, e); 279 else 280 x = new SAXParseException(message, locator, e); 281 if (errHandler != null) 282 errHandler.fatalError(x); 283 throw x; 284 } 285 286 // JavaBeans properties 287 // JavaDoc comment modified to reflet encoding modifications -- 2005-02-11 288 // hsivonen 289 /** 290 * Controls whether the output should attempt to follow the "transitional" 291 * XHTML rules so that it meets the "HTML Compatibility Guidelines" appendix 292 * in the XHTML specification. XHTML empty 293 * elements are printed specially. 294 * 295 * <p> 296 * When this option is enabled, it is the caller's responsibility to ensure 297 * that the input is otherwise valid as XHTML. 
* Things to be careful of in
     * all cases, as described in the appendix referenced above, include:
     * <ul>
     *
     * <li>Element and attribute names must be in lower case, both in the
     * document and in any CSS style sheet.
     * <li>The root element must be "html".
     * <li>Elements that must be empty (such as <em>&lt;br&gt;</em>) must have
     * no content.
     * <li>Use both <em>lang</em> and <em>xml:lang</em> attributes when
     * specifying language.
     * <li>Similarly, use both <em>id</em> and <em>name</em> attributes
     * when defining elements that may be referred to through URI fragment
     * identifiers ... and make sure that the value is a legal NMTOKEN, since
     * not all such HTML 4.0 identifiers are valid in XML.
     * <li>Be careful with character encodings; make sure you provide a
     * <em>&lt;meta http-equiv="Content-type"
     * content="text/xml;charset=UTF-8" /&gt;</em>
     * element in the HTML "head" element.
     * </ul>
     *
     * <p>
     * Additionally, some of the oldest browsers have additional quirks, to
     * address with guidelines such as:
     * <ul>
     *
     * <li>Processing instructions may be rendered, so avoid them. (Similarly
     * for an XML declaration.)
     * <li>Embedded style sheets and scripts should not contain XML markup
     * delimiters: &amp;, &lt;, and ]]&gt; are trouble.
     * <li>Attribute values should not have line breaks or multiple consecutive
     * white space characters.
     * <li>Use no more than one of the deprecated (transitional)
     * <em>&lt;isindex&gt;</em> elements.
     * <li>Some boolean attributes (such as <em>compact, checked,
     * disabled, readonly, selected,</em>
     * and more) confuse some browsers, since they only understand minimized
     * versions which are illegal in XML.
335 * </ul> 336 * 337 * <p> 338 * Also, some characteristics of the resulting output may be a function of 339 * whether the document is later given a MIME content type of 340 * <em>text/html</em> rather than one indicating XML ( 341 * <em>application/xml</em> or <em>text/xml</em>). Worse, some browsers 342 * ignore MIME content types and prefer to rely URI name suffixes -- so an 343 * "index.xml" could always be XML, never XHTML, no matter its MIME type. 344 */ 345 final public void setXhtml(boolean value) { 346 if (locator != null) 347 throw new IllegalStateException("started parsing"); 348 xhtml = value; 349 if (xhtml) 350 canonical = false; 351 } 352 353 /** 354 * Returns true if the output attempts to echo the input following 355 * "transitional" XHTML rules and matching the "HTML Compatibility 356 * Guidelines" so that an HTML version 3 browser can read the output as 357 * HTML; returns false (the default) othewise. 358 */ 359 final public boolean isXhtml() { 360 return xhtml; 361 } 362 363 /** 364 * Controls whether the output text contains references to entities (the 365 * default), or instead contains the expanded values of those entities. 366 */ 367 final public void setExpandingEntities(boolean value) { 368 if (locator != null) 369 throw new IllegalStateException("started parsing"); 370 expandingEntities = value; 371 if (!expandingEntities) 372 canonical = false; 373 } 374 375 /** 376 * Returns true if the output will have no entity references; returns false 377 * (the default) otherwise. 378 */ 379 final public boolean isExpandingEntities() { 380 return expandingEntities; 381 } 382 383 /** 384 * Controls pretty-printing, which by default is not enabled (and currently 385 * is most useful for XHTML output). Pretty printing enables structural 386 * indentation, sorting of attributes by name, line wrapping, and 387 * potentially other mechanisms for making output more or less readable. 
388 * 389 * <p> 390 * At this writing, structural indentation and line wrapping are enabled 391 * when pretty printing is enabled and the <em>xml:space</em> attribute 392 * has the value <em>default</em> (its other legal value is 393 * <em>preserve</em>, as defined in the XML specification). The three 394 * XHTML element types which use another value are recognized by their names 395 * (namespaces are ignored). 396 * 397 * <p> 398 * Also, for the record, the "pretty" aspect of printing here is more to 399 * provide basic structure on outputs that would otherwise risk being a 400 * single long line of text. For now, expect the structure to be ragged ... 401 * unless you'd like to submit a patch to make this be more strictly 402 * formatted! 403 * 404 * @exception IllegalStateException 405 * thrown if this method is invoked after output has begun. 406 */ 407 final public void setPrettyPrinting(boolean value) { 408 if (locator != null) 409 throw new IllegalStateException("started parsing"); 410 prettyPrinting = value; 411 if (prettyPrinting) 412 canonical = false; 413 } 414 415 /** 416 * Returns value of flag controlling pretty printing. 417 */ 418 final public boolean isPrettyPrinting() { 419 return prettyPrinting; 420 } 421 422 /** 423 * Sets the output style to be canonicalized. Input events must meet 424 * requirements that are slightly more stringent than the basic 425 * well-formedness ones, and include: 426 * <ul> 427 * 428 * <li>Namespace prefixes must not have been changed from those in the 429 * original document. (This may only be ensured by setting the SAX2 430 * XMLReader <em>namespace-prefixes</em> feature flag; by default, it is 431 * cleared.) 432 * 433 * <li>Redundant namespace declaration attributes have been removed. (If an 434 * ancestor element defines a namespace prefix and that declaration hasn't 435 * been overriden, an element must not redeclare it.) 
436 * 437 * <li>If comments are not to be included in the canonical output, they 438 * must first be removed from the input event stream; this 439 * <em>Canonical XML with comments</em> by default. 440 * 441 * <li>If the input character encoding was not UCS-based, the character 442 * data must have been normalized using Unicode Normalization Form C. (UTF-8 443 * and UTF-16 are UCS-based.) 444 * 445 * <li>Attribute values must have been normalized, as is done by any 446 * conformant XML processor which processes all external parameter entities. 447 * 448 * <li>Similarly, attribute value defaulting has been performed. 449 * 450 * </ul> 451 * 452 * <p> 453 * Note that fragments of XML documents, as specified by an XPath node set, 454 * may be canonicalized. In such cases, elements may need some fixup (for 455 * <em>xml:*</em> attributes and application-specific context). 456 * 457 * @exception IllegalArgumentException 458 * if the output encoding is anything other than UTF-8. 459 */ 460 final public void setCanonical(boolean value) { 461 // Removed encoding check -- 2005-02-11 hsivonen 462 canonical = value; 463 if (canonical) { 464 prettyPrinting = xhtml = false; 465 expandingEntities = true; 466 // Removed eol modification -- 2005-02-11 hsivonen 467 } 468 } 469 470 /** 471 * Returns value of flag controlling canonical output. 472 */ 473 final public boolean isCanonical() { 474 return canonical; 475 } 476 477 /** 478 * Flushes the output stream. When this handler is used in long lived 479 * pipelines, it can be important to flush buffered state, for example so 480 * that it can reach the disk as part of a state checkpoint. 481 */ 482 final public void flush() throws IOException { 483 if (out != null) 484 out.flush(); 485 } 486 487 // convenience routines 488 489 // FIXME: probably want a subclass that holds a lot of these... 490 // and maybe more! 491 492 /** 493 * Writes the string as if characters() had been called on the contents of 494 * the string. 
This is particularly useful when applications act as 495 * producers and write data directly to event consumers. 496 */ 497 final public void write(String data) throws SAXException { 498 char buf[] = data.toCharArray(); 499 characters(buf, 0, buf.length); 500 } 501 502 /** 503 * Writes an element that has content consisting of a single string. 504 * 505 * @see #writeEmptyElement 506 * @see #startElement 507 */ 508 public void writeElement(String uri, String localName, String qName, 509 Attributes atts, String content) throws SAXException { 510 if (content == null || content.length() == 0) { 511 writeEmptyElement(uri, localName, qName, atts); 512 return; 513 } 514 startElement(uri, localName, qName, atts); 515 char chars[] = content.toCharArray(); 516 characters(chars, 0, chars.length); 517 endElement(uri, localName, qName); 518 } 519 520 /** 521 * Writes an element that has content consisting of a single integer, 522 * encoded as a decimal string. 523 * 524 * @see #writeEmptyElement 525 * @see #startElement 526 */ 527 public void writeElement(String uri, String localName, String qName, 528 Attributes atts, int content) throws SAXException { 529 writeElement(uri, localName, qName, atts, Integer.toString(content)); 530 } 531 532 // SAX1 ContentHandler 533 /** <b>SAX1 </b>: provides parser status information */ 534 final public void setDocumentLocator(Locator l) { 535 locator = l; 536 } 537 538 // Removed transitional DTD URI -- 2005-02-11 hsivonen 539 540 /** 541 * <b>SAX1 </b>: indicates the beginning of a document parse. If you're 542 * writing (well formed) fragments of XML, neither this nor endDocument 543 * should be called. 544 */ 545 // NOT final 546 public void startDocument() throws SAXException { 547 try { 548 if (out == null) 549 throw new IllegalStateException( 550 "null Writer given to XMLWriter"); 551 552 // Not all parsers provide the locator we want; this also 553 // flags whether events are being sent to this object yet. 
554 // We could only have this one call if we only printed whole 555 // documents ... but we also print fragments, so most of the 556 // callbacks here replicate this test. 557 558 if (locator == null) 559 locator = new LocatorImpl(); 560 561 // Unless we're in the XHTML mode or we're canonicalizing, write 562 // the XML declaration. 563 // Hard-coded UTF-8 -- 2005-02-11 hsivonen 564 if (!canonical && !xhtml) { 565 rawWrite("<?xml version='1.0'"); 566 rawWrite(" encoding='UTF-8'"); 567 rawWrite("?>"); 568 newline(); 569 } 570 571 // Removed hard-coded Transitionl XHTML doctype -- 2005-02-11 572 // hsivonen 573 574 entityNestLevel = 0; 575 576 } catch (IOException e) { 577 fatal("can't write", e); 578 } 579 } 580 581 /** 582 * <b>SAX1 </b>: indicates the completion of a parse. Note that all complete 583 * SAX event streams make this call, even if an error is reported during a 584 * parse. 585 */ 586 // NOT final 587 public void endDocument() throws SAXException { 588 try { 589 if (!canonical) { 590 newline(); 591 newline(); 592 } 593 out.close(); 594 out = null; 595 locator = null; 596 } catch (IOException e) { 597 fatal("can't write", e); 598 } 599 } 600 601 // XHTML elements declared as EMPTY print differently 602 final private static boolean isEmptyElementTag(String tag) { 603 switch (tag.charAt(0)) { 604 case 'a': 605 return "area".equals(tag); 606 case 'b': 607 return "base".equals(tag) || "basefont".equals(tag) 608 || "br".equals(tag); 609 case 'c': 610 return "col".equals(tag); 611 case 'f': 612 return "frame".equals(tag); 613 case 'h': 614 return "hr".equals(tag); 615 case 'i': 616 return "img".equals(tag) || "input".equals(tag) 617 || "isindex".equals(tag); 618 case 'l': 619 return "link".equals(tag); 620 case 'm': 621 return "meta".equals(tag); 622 case 'p': 623 return "param".equals(tag); 624 } 625 return false; 626 } 627 628 private static boolean indentBefore(String tag) { 629 // basically indent before block content 630 // and within structure like 
tables, lists 631 switch (tag.charAt(0)) { 632 case 'a': 633 return "applet".equals(tag); 634 case 'b': 635 return "body".equals(tag) || "blockquote".equals(tag); 636 case 'c': 637 return "center".equals(tag); 638 case 'f': 639 return "frame".equals(tag) || "frameset".equals(tag); 640 case 'h': 641 return "head".equals(tag); 642 case 'm': 643 return "meta".equals(tag); 644 case 'o': 645 return "object".equals(tag); 646 case 'p': 647 return "param".equals(tag) || "pre".equals(tag); 648 case 's': 649 return "style".equals(tag); 650 case 't': 651 return "title".equals(tag) || "td".equals(tag) 652 || "th".equals(tag); 653 } 654 // ... but not inline elements like "em", "b", "font" 655 return false; 656 } 657 658 private static boolean spaceBefore(String tag) { 659 // blank line AND INDENT before certain structural content 660 switch (tag.charAt(0)) { 661 case 'h': 662 return "h1".equals(tag) || "h2".equals(tag) || "h3".equals(tag) 663 || "h4".equals(tag) || "h5".equals(tag) 664 || "h6".equals(tag) || "hr".equals(tag); 665 case 'l': 666 return "li".equals(tag); 667 case 'o': 668 return "ol".equals(tag); 669 case 'p': 670 return "p".equals(tag); 671 case 't': 672 return "table".equals(tag) || "tr".equals(tag); 673 case 'u': 674 return "ul".equals(tag); 675 } 676 return false; 677 } 678 679 // XHTML DTDs say these three have xml:space="preserve" 680 private static boolean spacePreserve(String tag) { 681 return "pre".equals(tag) || "style".equals(tag) || "script".equals(tag); 682 } 683 684 /** 685 * <b>SAX2 </b>: ignored. 686 */ 687 final public void startPrefixMapping(String prefix, String uri) { 688 } 689 690 /** 691 * <b>SAX2 </b>: ignored. 692 */ 693 final public void endPrefixMapping(String prefix) { 694 } 695 696 private void writeStartTag(String name, Attributes atts, boolean isEmpty) 697 throws SAXException, IOException { 698 rawWrite('<'); 699 rawWrite(name); 700 701 // write out attributes ... 
sorting is particularly useful 702 // with output that's been heavily defaulted. 703 if (atts != null && atts.getLength() != 0) { 704 705 // Set up to write, with optional sorting 706 int indices[] = new int[atts.getLength()]; 707 708 for (int i = 0; i < indices.length; i++) 709 indices[i] = i; 710 711 // optionally sort 712 713 // FIXME: canon xml demands xmlns nodes go first, 714 // and sorting by URI first (empty first) then localname 715 // it should maybe use a different sort 716 717 if (canonical || prettyPrinting) { 718 719 // insertion sort by attribute name 720 for (int i = 1; i < indices.length; i++) { 721 int n = indices[i], j; 722 String s = atts.getQName(n); 723 724 for (j = i - 1; j >= 0; j--) { 725 if (s.compareTo(atts.getQName(indices[j])) >= 0) 726 break; 727 indices[j + 1] = indices[j]; 728 } 729 indices[j + 1] = n; 730 } 731 } 732 733 // write, sorted or no 734 for (int i = 0; i < indices.length; i++) { 735 String s = atts.getQName(indices[i]); 736 737 if (s == null || "".equals(s)) 738 throw new IllegalArgumentException("no XML name"); 739 rawWrite(" "); 740 rawWrite(s); 741 rawWrite("="); 742 writeQuotedValue(atts.getValue(indices[i]), CTX_ATTRIBUTE); 743 } 744 } 745 if (isEmpty) 746 rawWrite(" /"); 747 rawWrite('>'); 748 } 749 750 /** 751 * <b>SAX2 </b>: indicates the start of an element. When XHTML is in use, 752 * avoid attribute values with line breaks or multiple whitespace 753 * characters, since not all user agents handle them correctly. 
754 */ 755 final public void startElement(String uri, String localName, String qName, 756 Attributes atts) throws SAXException { 757 startedDoctype = false; 758 759 if (locator == null) 760 locator = new LocatorImpl(); 761 762 if (qName == null || "".equals(qName)) 763 throw new IllegalArgumentException("no XML name"); 764 765 try { 766 if (entityNestLevel != 0) 767 return; 768 if (prettyPrinting) { 769 String whitespace = null; 770 771 if (xhtml && spacePreserve(qName)) 772 whitespace = "preserve"; 773 else if (atts != null) 774 whitespace = atts.getValue("xml:space"); 775 if (whitespace == null) 776 whitespace = space.peek(); 777 space.push(whitespace); 778 779 if ("default".equals(whitespace)) { 780 if (xhtml) { 781 if (spaceBefore(qName)) { 782 newline(); 783 doIndent(); 784 } else if (indentBefore(qName)) 785 doIndent(); 786 // else it's inlined, modulo line length 787 // FIXME: incrementing element nest level 788 // for inlined elements causes ugliness 789 } else 790 doIndent(); 791 } 792 } 793 elementNestLevel++; 794 writeStartTag(qName, atts, xhtml && isEmptyElementTag(qName)); 795 796 if (xhtml) { 797 // FIXME: if this is an XHTML "pre" element, turn 798 // off automatic wrapping. 799 } 800 801 } catch (IOException e) { 802 fatal("can't write", e); 803 } 804 } 805 806 /** 807 * Writes an empty element. 
808 * 809 * @see #startElement 810 */ 811 public void writeEmptyElement(String uri, String localName, String qName, 812 Attributes atts) throws SAXException { 813 if (canonical) { 814 startElement(uri, localName, qName, atts); 815 endElement(uri, localName, qName); 816 } else { 817 try { 818 writeStartTag(qName, atts, true); 819 } catch (IOException e) { 820 fatal("can't write", e); 821 } 822 } 823 } 824 825 /** <b>SAX2 </b>: indicates the end of an element */ 826 final public void endElement(String uri, String localName, String qName) 827 throws SAXException { 828 if (qName == null || "".equals(qName)) 829 throw new IllegalArgumentException("no XML name"); 830 831 try { 832 elementNestLevel--; 833 if (entityNestLevel != 0) 834 return; 835 if (xhtml && isEmptyElementTag(qName)) 836 return; 837 rawWrite("</"); 838 rawWrite(qName); 839 rawWrite('>'); 840 841 if (prettyPrinting) { 842 if (!space.empty()) 843 space.pop(); 844 else 845 fatal("stack discipline", null); 846 } 847 if (elementNestLevel == 0) 848 inEpilogue = true; 849 850 } catch (IOException e) { 851 fatal("can't write", e); 852 } 853 } 854 855 /** <b>SAX1 </b>: reports content characters */ 856 final public void characters(char ch[], int start, int length) 857 throws SAXException { 858 if (locator == null) 859 locator = new LocatorImpl(); 860 861 862 try { 863 if (entityNestLevel != 0) 864 return; 865 if (inCDATA) { 866 escapeChars(ch, start, length, CTX_UNPARSED); 867 } else { 868 escapeChars(ch, start, length, CTX_CONTENT); 869 } 870 } catch (IOException e) { 871 fatal("can't write", e); 872 } 873 874 } 875 876 /** <b>SAX1 </b>: reports ignorable whitespace */ 877 final public void ignorableWhitespace(char ch[], int start, int length) 878 throws SAXException { 879 if (locator == null) 880 locator = new LocatorImpl(); 881 882 try { 883 if (entityNestLevel != 0) 884 return; 885 // don't forget to map NL to CRLF, CR, etc 886 escapeChars(ch, start, length, CTX_CONTENT); 887 } catch (IOException e) { 888 
fatal("can't write", e); 889 } 890 } 891 892 /** 893 * <b>SAX1 </b>: reports a PI. This doesn't check for illegal target names, 894 * such as "xml" or "XML", or namespace-incompatible ones like "big:dog"; 895 * the caller is responsible for ensuring those names are legal. 896 */ 897 final public void processingInstruction(String target, String data) 898 throws SAXException { 899 if (locator == null) 900 locator = new LocatorImpl(); 901 902 // don't print internal subset for XHTML 903 if (xhtml && startedDoctype) 904 return; 905 906 // ancient HTML browsers might render these ... their loss. 907 // to prevent: "if (xhtml) return;". 908 909 try { 910 if (entityNestLevel != 0) 911 return; 912 if (canonical && inEpilogue) 913 newline(); 914 rawWrite("<?"); 915 rawWrite(target); 916 rawWrite(' '); 917 escapeChars(data.toCharArray(), -1, -1, CTX_UNPARSED); 918 rawWrite("?>"); 919 if (elementNestLevel == 0 && !(canonical && inEpilogue)) 920 newline(); 921 } catch (IOException e) { 922 fatal("can't write", e); 923 } 924 } 925 926 /** <b>SAX1 </b>: indicates a non-expanded entity reference */ 927 public void skippedEntity(String name) throws SAXException { 928 try { 929 rawWrite("&"); 930 rawWrite(name); 931 rawWrite(";"); 932 } catch (IOException e) { 933 fatal("can't write", e); 934 } 935 } 936 937 // SAX2 LexicalHandler 938 939 /** <b>SAX2 </b>: called before parsing CDATA characters */ 940 final public void startCDATA() throws SAXException { 941 if (locator == null) 942 locator = new LocatorImpl(); 943 944 if (canonical || xhtml) // added xhtml check -- 2005-02-12 hsivonen 945 return; 946 947 try { 948 inCDATA = true; 949 if (entityNestLevel == 0) 950 rawWrite("<![CDATA["); 951 } catch (IOException e) { 952 fatal("can't write", e); 953 } 954 } 955 956 /** <b>SAX2 </b>: called after parsing CDATA characters */ 957 final public void endCDATA() throws SAXException { 958 if (canonical || xhtml) // added xhtml check -- 2005-02-12 hsivonen 959 return; 960 961 try { 962 
inCDATA = false; 963 if (entityNestLevel == 0) 964 rawWrite("]]>"); 965 } catch (IOException e) { 966 fatal("can't write", e); 967 } 968 } 969 970 /** 971 * <b>SAX2 </b>: called when the doctype is partially parsed Note that this, 972 * like other doctype related calls, is ignored when XHTML is in use. 973 */ 974 final public void startDTD(String name, String publicId, String systemId) 975 throws SAXException { 976 if (locator == null) 977 locator = new LocatorImpl(); 978 // Removed xhtml check -- 2005-02-11 hsivonen 979 try { 980 inDoctype = startedDoctype = true; 981 if (canonical) 982 return; 983 rawWrite("<!DOCTYPE "); 984 rawWrite(name); 985 rawWrite(' '); 986 987 if (!expandingEntities) { 988 // use double quotes -- 2005-02-12 hsivonen 989 if (publicId != null) 990 rawWrite("PUBLIC \"" + publicId + "\" \"" + systemId + "\""); 991 else if (systemId != null) 992 rawWrite("SYSTEM \"" + systemId + "\""); 993 } 994 // Added xhtml check -- 2005-02-11 hsivonen 995 if (!xhtml) { 996 rawWrite(" ["); 997 newline(); 998 } 999 } catch (IOException e) { 1000 fatal("can't write", e); 1001 } 1002 } 1003 1004 /** <b>SAX2 </b>: called after the doctype is parsed */ 1005 final public void endDTD() throws SAXException { 1006 inDoctype = false; 1007 // Removed xhtml check -- 2005-02-11 hsivonen 1008 if (canonical) 1009 return; 1010 try { 1011 // Added xhtml check -- 2005-02-11 hsivonen 1012 if (!xhtml) 1013 rawWrite("]"); 1014 rawWrite(">"); 1015 newline(); 1016 } catch (IOException e) { 1017 fatal("can't write", e); 1018 } 1019 } 1020 1021 /** 1022 * <b>SAX2 </b>: called before parsing a general entity in content 1023 */ 1024 final public void startEntity(String name) throws SAXException { 1025 try { 1026 boolean writeEOL = true; 1027 1028 // Predefined XHTML entities (for characters) will get 1029 // mapped back later. 
1030 if (xhtml || expandingEntities) 1031 return; 1032 1033 entityNestLevel++; 1034 if (name.equals("[dtd]")) 1035 return; 1036 if (entityNestLevel != 1) 1037 return; 1038 if (!name.startsWith("%")) { 1039 writeEOL = false; 1040 rawWrite('&'); 1041 } 1042 rawWrite(name); 1043 rawWrite(';'); 1044 if (writeEOL) 1045 newline(); 1046 } catch (IOException e) { 1047 fatal("can't write", e); 1048 } 1049 } 1050 1051 /** 1052 * <b>SAX2 </b>: called after parsing a general entity in content 1053 */ 1054 final public void endEntity(String name) throws SAXException { 1055 if (xhtml || expandingEntities) 1056 return; 1057 entityNestLevel--; 1058 } 1059 1060 /** 1061 * <b>SAX2 </b>: called when comments are parsed. When XHTML is used, the 1062 * old HTML tradition of using comments to for inline CSS, or for JavaScript 1063 * code is discouraged. This is because XML processors are encouraged to 1064 * discard, on the grounds that comments are for users (and perhaps text 1065 * editors) not programs. Instead, use external scripts 1066 */ 1067 final public void comment(char ch[], int start, int length) 1068 throws SAXException { 1069 if (locator == null) 1070 locator = new LocatorImpl(); 1071 1072 // don't print internal subset for XHTML 1073 if (xhtml && inDoctype) // changed check to match canon -- 2005-02-11 1074 // hsivonen 1075 return; 1076 // don't print comment in doctype for canon xml 1077 if (canonical && inDoctype) 1078 return; 1079 1080 try { 1081 boolean indent; 1082 1083 if (prettyPrinting && space.empty()) 1084 fatal("stack discipline", null); 1085 indent = prettyPrinting && "default".equals(space.peek()); 1086 if (entityNestLevel != 0) 1087 return; 1088 if (indent) 1089 doIndent(); 1090 if (canonical && inEpilogue) 1091 newline(); 1092 rawWrite("<!--"); 1093 escapeChars(ch, start, length, CTX_UNPARSED); 1094 rawWrite("-->"); 1095 if (indent) 1096 doIndent(); 1097 if (elementNestLevel == 0 && !(canonical && inEpilogue)) 1098 newline(); 1099 } catch (IOException e) { 
1100 fatal("can't write", e); 1101 } 1102 } 1103 1104 // SAX1 DTDHandler 1105 1106 /** <b>SAX1 </b>: called on notation declarations */ 1107 final public void notationDecl(String name, String publicId, String systemId) 1108 throws SAXException { 1109 if (xhtml) 1110 return; 1111 try { 1112 // At this time, only SAX2 callbacks start these. 1113 if (!startedDoctype) 1114 return; 1115 1116 if (entityNestLevel != 0) 1117 return; 1118 rawWrite("<!NOTATION " + name + " "); 1119 if (publicId != null) 1120 rawWrite("PUBLIC \"" + publicId + '"'); 1121 else 1122 rawWrite("SYSTEM "); 1123 if (systemId != null) 1124 rawWrite('"' + systemId + '"'); 1125 rawWrite(">"); 1126 newline(); 1127 } catch (IOException e) { 1128 fatal("can't write", e); 1129 } 1130 } 1131 1132 /** <b>SAX1 </b>: called on unparsed entity declarations */ 1133 final public void unparsedEntityDecl(String name, String publicId, 1134 String systemId, String notationName) throws SAXException { 1135 if (xhtml) 1136 return; 1137 try { 1138 // At this time, only SAX2 callbacks start these. 1139 if (!startedDoctype) { 1140 // FIXME: write to temporary buffer, and make the start 1141 // of the root element write these declarations. 1142 return; 1143 } 1144 1145 if (entityNestLevel != 0) 1146 return; 1147 rawWrite("<!ENTITY " + name + " "); 1148 if (publicId != null) 1149 rawWrite("PUBLIC \"" + publicId + '"'); 1150 else 1151 rawWrite("SYSTEM "); 1152 rawWrite('"' + systemId + '"'); 1153 rawWrite(" NDATA " + notationName + ">"); 1154 newline(); 1155 } catch (IOException e) { 1156 fatal("can't write", e); 1157 } 1158 } 1159 1160 // SAX2 DeclHandler 1161 1162 /** <b>SAX2 </b>: called on attribute declarations */ 1163 final public void attributeDecl(String eName, String aName, String type, 1164 String mode, String value) throws SAXException { 1165 if (xhtml) 1166 return; 1167 try { 1168 // At this time, only SAX2 callbacks start these. 
1169 if (!startedDoctype) 1170 return; 1171 if (entityNestLevel != 0) 1172 return; 1173 rawWrite("<!ATTLIST " + eName + ' ' + aName + ' '); 1174 rawWrite(type); 1175 rawWrite(' '); 1176 if (mode != null) 1177 rawWrite(mode + ' '); 1178 if (value != null) 1179 writeQuotedValue(value, CTX_ATTRIBUTE); 1180 rawWrite('>'); 1181 newline(); 1182 } catch (IOException e) { 1183 fatal("can't write", e); 1184 } 1185 } 1186 1187 /** <b>SAX2 </b>: called on element declarations */ 1188 final public void elementDecl(String name, String model) 1189 throws SAXException { 1190 if (xhtml) 1191 return; 1192 try { 1193 // At this time, only SAX2 callbacks start these. 1194 if (!startedDoctype) 1195 return; 1196 if (entityNestLevel != 0) 1197 return; 1198 rawWrite("<!ELEMENT " + name + ' ' + model + '>'); 1199 newline(); 1200 } catch (IOException e) { 1201 fatal("can't write", e); 1202 } 1203 } 1204 1205 /** <b>SAX2 </b>: called on external entity declarations */ 1206 final public void externalEntityDecl(String name, String publicId, 1207 String systemId) throws SAXException { 1208 if (xhtml) 1209 return; 1210 try { 1211 // At this time, only SAX2 callbacks start these. 1212 if (!startedDoctype) 1213 return; 1214 if (entityNestLevel != 0) 1215 return; 1216 rawWrite("<!ENTITY "); 1217 if (name.startsWith("%")) { 1218 rawWrite("% "); 1219 rawWrite(name.substring(1)); 1220 } else 1221 rawWrite(name); 1222 if (publicId != null) 1223 rawWrite(" PUBLIC \"" + publicId + '"'); 1224 else 1225 rawWrite(" SYSTEM "); 1226 rawWrite('"' + systemId + "\">"); 1227 newline(); 1228 } catch (IOException e) { 1229 fatal("can't write", e); 1230 } 1231 } 1232 1233 /** <b>SAX2 </b>: called on internal entity declarations */ 1234 final public void internalEntityDecl(String name, String value) 1235 throws SAXException { 1236 if (xhtml) 1237 return; 1238 try { 1239 // At this time, only SAX2 callbacks start these. 
1240 if (!startedDoctype) 1241 return; 1242 if (entityNestLevel != 0) 1243 return; 1244 rawWrite("<!ENTITY "); 1245 if (name.startsWith("%")) { 1246 rawWrite("% "); 1247 rawWrite(name.substring(1)); 1248 } else 1249 rawWrite(name); 1250 rawWrite(' '); 1251 writeQuotedValue(value, CTX_ENTITY); 1252 rawWrite('>'); 1253 newline(); 1254 } catch (IOException e) { 1255 fatal("can't write", e); 1256 } 1257 } 1258 1259 // added xmlDecl() -- 2005-03-02 hsivonen 1260 /** 1261 * @see fi.karppinen.xml.XmlDeclarationHandler#xmlDecl(java.lang.String, java.lang.String, java.lang.String) 1262 */ 1263 public void xmlDecl(String version, String encoding, String standalone) 1264 throws SAXException { 1265 try { 1266 if (xhtml) { 1267 rawWrite("<?xml version='1.0'"); 1268 rawWrite(" encoding='UTF-8'"); 1269 rawWrite("?>"); 1270 newline(); 1271 } 1272 } catch (IOException e) { 1273 fatal("can't write", e); 1274 } 1275 } 1276 1277 private void writeQuotedValue(String value, int code) throws SAXException, 1278 IOException { 1279 char buf[] = value.toCharArray(); 1280 int off = 0, len = buf.length; 1281 1282 // we can't add line breaks to attribute/entity/... values 1283 noWrap = true; 1284 rawWrite('"'); 1285 escapeChars(buf, off, len, code); 1286 rawWrite('"'); 1287 noWrap = false; 1288 } 1289 1290 // removed entity tables -- 2005-02-11 hsivonen 1291 1292 // General routine to write text and substitute predefined 1293 // entities (XML, and a special case for XHTML) as needed. 
1294 private void escapeChars(char buf[], int off, int len, int code) 1295 throws SAXException, IOException { 1296 int first = 0; 1297 1298 if (off < 0) { 1299 off = 0; 1300 len = buf.length; 1301 } 1302 for (int i = 0; i < len; i++) { 1303 String esc = null; // init variable -- 2005-02-11 hsivonen 1304 char c = buf[off + i]; 1305 1306 switch (c) { 1307 // Note that CTX_ATTRIBUTE isn't explicitly tested here; 1308 // all syntax delimiters are escaped in CTX_ATTRIBUTE, 1309 // otherwise it's similar to CTX_CONTENT 1310 1311 // ampersand flags entity references; entity replacement 1312 // text has unexpanded references, other text doesn't. 1313 case '&': 1314 if (code == CTX_ENTITY || code == CTX_UNPARSED) 1315 continue; 1316 esc = "amp"; 1317 break; 1318 1319 // attributes and text may NOT have literal '<', but 1320 // entities may have markup constructs 1321 case '<': 1322 if (code == CTX_ENTITY || code == CTX_UNPARSED) 1323 continue; 1324 esc = "lt"; 1325 break; 1326 1327 // as above re markup constructs; but otherwise 1328 // except when canonicalizing, this is for consistency 1329 case '>': 1330 if (code == CTX_ENTITY || code == CTX_UNPARSED) 1331 continue; 1332 esc = "gt"; 1333 break; 1334 case '\'': 1335 if (code == CTX_CONTENT || code == CTX_UNPARSED) 1336 continue; 1337 if (canonical || xhtml) // added xhtml check -- 2005-02-11 hsivonen 1338 continue; 1339 esc = "apos"; 1340 break; 1341 1342 // needed when printing quoted attribute/entity values 1343 case '"': 1344 if (code == CTX_CONTENT || code == CTX_UNPARSED) 1345 continue; 1346 esc = "quot"; 1347 break; 1348 1349 // make line ends work per host OS convention 1350 case '\n': 1351 esc = eol; 1352 break; 1353 1354 // removed obsolete comments -- 2005-02-11 hsivonen 1355 1356 default: 1357 // 1358 // There are characters we can never write safely; getting 1359 // them is an error. 1360 // 1361 // (a) They're never legal in XML ... 
detected by range 1362 // checks, and (eventually) by remerging surrogate 1363 // pairs on output. (Easy error for apps to prevent.) 1364 // 1365 1366 // removed obsolete comments -- 2005-02-11 hsivonen 1367 1368 // FIXME: CR in CDATA is an error; in text, turn to a char 1369 // ref 1370 1371 // FIXME: CR/LF/TAB in attributes should become char refs 1372 1373 if ((c > 0xfffd) 1374 || ((c < 0x0020) && !((c == 0x0009) 1375 || (c == 0x000A) || (c == 0x000D)))) { 1376 1377 // removed danger mask check -- 2005-02-11 hsivonen 1378 throw new CharConversionException( 1379 "Illegal or non-writable character: U+" 1380 + Integer.toHexString(c)); 1381 } 1382 continue; 1383 // removed entity handling -- 2005-02-12 hsivonen 1384 } 1385 1386 if (i != first) 1387 rawWrite(buf, off + first, i - first); 1388 first = i + 1; 1389 if (esc == eol) 1390 newline(); 1391 else { 1392 rawWrite('&'); 1393 rawWrite(esc); 1394 rawWrite(';'); 1395 } 1396 } 1397 if (first < len) 1398 rawWrite(buf, off + first, len - first); 1399 } 1400 1401 private void newline() throws SAXException, IOException { 1402 out.write(eol); 1403 column = 0; 1404 } 1405 1406 private void doIndent() throws SAXException, IOException { 1407 int space = elementNestLevel * 2; 1408 1409 newline(); 1410 column = space; 1411 // track tabs only at line starts 1412 while (space > 8) { 1413 out.write("\t"); 1414 space -= 8; 1415 } 1416 while (space > 0) { 1417 out.write(" "); 1418 space -= 2; 1419 } 1420 } 1421 1422 private void rawWrite(char c) throws IOException { 1423 out.write(c); 1424 column++; 1425 } 1426 1427 private void rawWrite(String s) throws SAXException, IOException { 1428 if (prettyPrinting && "default".equals(space.peek())) { 1429 char data[] = s.toCharArray(); 1430 rawWrite(data, 0, data.length); 1431 } else { 1432 out.write(s); 1433 column += s.length(); 1434 } 1435 } 1436 1437 // NOTE: if xhtml, the REC gives some rules about whitespace 1438 // which we could follow ... 
notably, many places where conformant 1439 // agents "must" consolidate/normalize whitespace. Line ends can 1440 // be removed there, etc. This may not be the right place to do 1441 // such mappings though. 1442 1443 // Line buffering may help clarify algorithms and improve results. 1444 1445 // It's likely xml:space needs more attention. 1446 1447 private void rawWrite(char buf[], int offset, int length) 1448 throws SAXException, IOException { 1449 boolean wrap; 1450 1451 if (prettyPrinting && space.empty()) 1452 fatal("stack discipline", null); 1453 1454 wrap = prettyPrinting && "default".equals(space.peek()); 1455 if (!wrap) { 1456 out.write(buf, offset, length); 1457 column += length; 1458 return; 1459 } 1460 1461 // we're pretty printing and want to fill lines out only 1462 // to the desired line length. 1463 while (length > 0) { 1464 int target = lineLength - column; 1465 boolean wrote = false; 1466 1467 // Do we even have a problem? 1468 if (target > length || noWrap) { 1469 out.write(buf, offset, length); 1470 column += length; 1471 return; 1472 } 1473 1474 // break the line at a space character, trying to fill 1475 // as much of the line as possible. 1476 char c; 1477 1478 for (int i = target - 1; i >= 0; i--) { 1479 if ((c = buf[offset + i]) == ' ' || c == '\t') { 1480 i++; 1481 out.write(buf, offset, i); 1482 doIndent(); 1483 offset += i; 1484 length -= i; 1485 wrote = true; 1486 break; 1487 } 1488 } 1489 if (wrote) 1490 continue; 1491 1492 // no space character permitting break before target 1493 // line length is filled. So, take the next one. 1494 if (target < 0) 1495 target = 0; 1496 for (int i = target; i < length; i++) 1497 if ((c = buf[offset + i]) == ' ' || c == '\t') { 1498 i++; 1499 out.write(buf, offset, i); 1500 doIndent(); 1501 offset += i; 1502 length -= i; 1503 wrote = true; 1504 break; 1505 } 1506 if (wrote) 1507 continue; 1508 1509 // no such luck. 
1510 out.write(buf, offset, length); 1511 column += length; 1512 break; 1513 } 1514 } 1515 }