/*
 * Copyright (c) 2003, 2004 Taavi Hupponen and 2004 Henri Sivonen
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

package fi.iki.hsivonen.xml;

import java.io.IOException;

import org.xml.sax.Attributes;
import org.xml.sax.ContentHandler;
import org.xml.sax.ErrorHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

import fi.iki.hsivonen.webcms.application.Globals;

/**
 * Scans a simple wiki-style text and reports it to a SAX
 * {@link ContentHandler} as a {@code body} element containing {@code p} and
 * {@code a} elements.
 *
 * TODO Warning: Not tested after refactoring. Need to check for permissible characters.
 *
 * NOTE(review): within this class {@link #parse()} is still an empty stub,
 * the scanning routine is not called from anywhere, and the {@code source}
 * field is never assigned.
 *
 * @version $Id: SimpleWikiParser.java,v 1.2 2006/11/18 00:05:24 hsivonen Exp $
 * @author taavi
 * @author hsivonen
 */
public class SimpleWikiParser {

    /** Receiver of the generated SAX events; when null, events are dropped. */
    private ContentHandler contentHandler;

    /** Stored and returned by the accessors; not consulted by the scanner. */
    private ErrorHandler errorHandler;

    /** The wiki text to scan. Never assigned inside this class. */
    private String source;

    /** Not read or written anywhere in this class. */
    private String title;

    /**
     * @see org.xml.sax.XMLReader#setContentHandler(org.xml.sax.ContentHandler)
     */
    public void setContentHandler(ContentHandler arg0) {
        this.contentHandler = arg0;
    }

    /**
     * @see org.xml.sax.XMLReader#getContentHandler()
     */
    public ContentHandler getContentHandler() {
        return this.contentHandler;
    }

    /**
     * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
     */
    public void setErrorHandler(ErrorHandler arg0) {
        this.errorHandler = arg0;
    }

    /**
     * @see org.xml.sax.XMLReader#getErrorHandler()
     */
    public ErrorHandler getErrorHandler() {
        return this.errorHandler;
    }

    /**
     * Currently does nothing; the body is an unimplemented stub.
     *
     * @throws IOException
     *             declared only; never thrown by the current body
     * @throws SAXException
     *             declared only; never thrown by the current body
     */
    public void parse() throws IOException, SAXException {
        // TODO Auto-generated method stub

    }

    /**
     * Scans {@code source} one character at a time and emits SAX events to
     * the content handler.
     *
     * The root element of the event stream is {@code body}.
     *
     * A line feed here means {@code \n}, {@code \r} or {@code \r\n}. A single
     * line feed between text is passed on as {@code "\n"}; two or more
     * consecutive line feeds close the current {@code p} and open a new one.
     * The first character that is neither whitespace nor a line feed opens
     * the first {@code p}, so a body with such content always contains at
     * least one {@code p} element. Spaces and tabs that directly follow a
     * line feed are dropped.
     *
     * {@code [linkTarget link title]} is turned into an {@code a} element
     * whose {@code href} attribute is {@code linkTarget} and whose text is
     * {@code link title}. The target ends at the first space after the
     * {@code '['}. If the closing {@code ']'} or the space is missing, or the
     * target or title contains only whitespace, the {@code '['} is emitted as
     * ordinary text. {@code [[} inserts a single {@code '['}.
     *
     * @throws SAXException
     *             if the content handler rejects an event
     * @throws IOException
     *             declared for callers; nothing in this method raises it
     */
    private void foo() throws SAXException, IOException {

        try {
            // Text collected since the last flush; sent as one characters()
            // event.
            // TODO change to char[]
            StringBuilder characters = new StringBuilder();

            boolean pOpen = false;
            int lineFeedCount = 0;

            int i = 0;
            int sourceLength = source.length();

            // start parsing the string, one char at a time
            this.startDocument();
            this.startElement(Globals.XHTML_NS, "body", "body",
                    new AttributesImpl());
            while (i < sourceLength) {
                char c = source.charAt(i);

                // Surrogate code units get no special treatment: they take
                // the ordinary-character path below, so both halves of a
                // pair reach the buffer in their original order.
                // TODO still need to check for permissible characters.

                if (isWhiteSpace(c)) {
                    // Whitespace right after a line feed is dropped; other
                    // whitespace is kept.
                    if (lineFeedCount < 1) {
                        characters.append(c);
                    }
                    i++;
                } else if (c == '\n' || c == '\r') {
                    // A line feed ends the current run of text.
                    flushCharacters(characters);
                    lineFeedCount++;

                    // Treat \r\n as a single line feed.
                    if (c == '\r' && i < sourceLength - 1
                            && source.charAt(i + 1) == '\n') {
                        i += 2;
                    } else {
                        i++;
                    }
                } else {
                    // Content character. Settle the paragraph state first so
                    // that text and links are never emitted outside a p,
                    // even when the source starts with line feeds.
                    if (!pOpen) {
                        this.startElement(Globals.XHTML_NS, "p", "p",
                                new AttributesImpl());
                        pOpen = true;
                    } else if (lineFeedCount > 1) {
                        // The buffer was flushed at the line feed, so the
                        // paragraph break lands after the preceding text.
                        this.endElement(Globals.XHTML_NS, "p", "p");
                        this.startElement(Globals.XHTML_NS, "p", "p",
                                new AttributesImpl());
                    }
                    if (lineFeedCount == 1) {
                        // A single line feed is passed through as text.
                        characters.append('\n');
                    }
                    lineFeedCount = 0;

                    if (c != '[') {
                        characters.append(c);
                        i++;
                    } else if (i < sourceLength - 1
                            && source.charAt(i + 1) == '[') {
                        // "[[" is the escape for a literal '['.
                        characters.append('[');
                        i += 2;
                    } else {
                        int linkEndIndex = source.indexOf("]", i);
                        int linkDelimIndex = source.indexOf(" ", i);

                        // A link needs a closing ']' and a space before it,
                        // with something other than whitespace on both
                        // sides of that space.
                        boolean isLink = linkEndIndex >= 0
                                && linkDelimIndex >= 0
                                && linkDelimIndex <= linkEndIndex
                                && hasVisibleCharacter(i + 1, linkDelimIndex)
                                && hasVisibleCharacter(linkDelimIndex,
                                        linkEndIndex);

                        if (!isLink) {
                            characters.append('[');
                            i++;
                        } else {
                            String linkTarget = source.substring(i + 1,
                                    linkDelimIndex);
                            String linkTitle = source.substring(
                                    linkDelimIndex + 1, linkEndIndex);

                            // Pending text must precede the link element.
                            flushCharacters(characters);

                            AttributesImpl attributes = new AttributesImpl();
                            // namespace removed -- hsivonen
                            attributes.addAttribute("", "href", "href",
                                    "CDATA", linkTarget);
                            this.startElement(Globals.XHTML_NS, "a", "a",
                                    attributes);
                            this.characters(linkTitle.toCharArray(), 0,
                                    linkTitle.length());
                            this.endElement(Globals.XHTML_NS, "a", "a");

                            // skip forward past the link
                            i = linkEndIndex + 1;
                        }
                    }
                }
            }

            // finish up: emit remaining text, close the open paragraph and
            // the body
            flushCharacters(characters);
            if (pOpen) {
                this.endElement(Globals.XHTML_NS, "p", "p");
            }
            this.endElement(Globals.XHTML_NS, "body", "body");
        } finally {
            this.endDocument();
        }
    }

    /**
     * Tells whether {@code source} has a character in {@code [from, to)}
     * that is not a space, tab, {@code \n} or {@code \r}.
     *
     * @param from
     *            first index to inspect
     * @param to
     *            index after the last one to inspect
     * @return true if such a character exists; false for an empty range
     */
    private boolean hasVisibleCharacter(int from, int to) {
        for (int k = from; k < to; k++) {
            char candidate = source.charAt(k);
            if (!isWhiteSpace(candidate) && candidate != '\n'
                    && candidate != '\r') {
                return true;
            }
        }
        return false;
    }

    /**
     * Emits the buffered text as a single characters event and empties the
     * buffer. Does nothing when the buffer is empty.
     *
     * @param buffer
     *            the pending text
     * @throws SAXException
     *             if the content handler rejects the event
     */
    private void flushCharacters(StringBuilder buffer) throws SAXException {
        if (buffer.length() > 0) {
            this.characters(buffer.toString().toCharArray(), 0,
                    buffer.length());
            buffer.setLength(0);
        }
    }

    /**
     * Checks if the given char is whitespace.
     *
     * For now whitespace means ' ' or '\t'.
     *
     * @param c
     *            the character to test
     * @return true if the given char is whitespace
     */
    private boolean isWhiteSpace(char c) {
        return c == ' ' || c == '\t';
    }

    /** Forwards a characters event to the content handler, if one is set. */
    private void characters(char[] a, int b, int c) throws SAXException {
        if (this.contentHandler != null) {
            this.contentHandler.characters(a, b, c);
        }
    }

    /** Forwards a startElement event to the content handler, if one is set. */
    private void startElement(String a, String b, String c, Attributes attrs)
            throws SAXException {
        if (this.contentHandler != null) {
            this.contentHandler.startElement(a, b, c, attrs);
        }
    }

    /** Forwards an endElement event to the content handler, if one is set. */
    private void endElement(String a, String b, String c) throws SAXException {
        if (this.contentHandler != null) {
            this.contentHandler.endElement(a, b, c);
        }
    }

    /** Forwards a startDocument event to the content handler, if one is set. */
    private void startDocument() throws SAXException {
        if (this.contentHandler != null) {
            this.contentHandler.startDocument();
        }
    }

    /** Forwards an endDocument event to the content handler, if one is set. */
    private void endDocument() throws SAXException {
        if (this.contentHandler != null) {
            this.contentHandler.endDocument();
        }
    }

}