001 /*
002 * Copyright (c) 2003, 2004 Taavi Hupponen and 2004 Henri Sivonen
003 *
004 * Permission is hereby granted, free of charge, to any person obtaining a
005 * copy of this software and associated documentation files (the "Software"),
006 * to deal in the Software without restriction, including without limitation
007 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
008 * and/or sell copies of the Software, and to permit persons to whom the
009 * Software is furnished to do so, subject to the following conditions:
010 *
011 * The above copyright notice and this permission notice shall be included in
012 * all copies or substantial portions of the Software.
013 *
014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
020 * DEALINGS IN THE SOFTWARE.
021 */
022
023 package fi.iki.hsivonen.xml;
024
025 import java.io.IOException;
026
027 import org.xml.sax.Attributes;
028 import org.xml.sax.ContentHandler;
029 import org.xml.sax.ErrorHandler;
030 import org.xml.sax.SAXException;
031 import org.xml.sax.helpers.AttributesImpl;
032
033 import fi.iki.hsivonen.webcms.application.Globals;
034
035 /**
036 * TODO Warning: Not tested after refactoring. Need to check for permissible characters.
037 *
038 *
039 * @version $Id: SimpleWikiParser.java,v 1.2 2006/11/18 00:05:24 hsivonen Exp $
040 * @author taavi
041 * @author hsivonen
042 */
043 public class SimpleWikiParser {
044
045 private ContentHandler contentHandler;
046
047 private ErrorHandler errorHandler;
048
049 private String source;
050
051 private String title;
052
053 /**
054 * @see org.xml.sax.XMLReader#setContentHandler(org.xml.sax.ContentHandler)
055 */
056 public void setContentHandler(ContentHandler arg0) {
057 this.contentHandler = arg0;
058 }
059
060 /**
061 * @see org.xml.sax.XMLReader#getContentHandler()
062 */
063 public ContentHandler getContentHandler() {
064 return this.contentHandler;
065 }
066
067 /**
068 * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
069 */
070 public void setErrorHandler(ErrorHandler arg0) {
071 this.errorHandler = arg0;
072 }
073
074 /**
075 * @see org.xml.sax.XMLReader#getErrorHandler()
076 */
077 public ErrorHandler getErrorHandler() {
078 return this.errorHandler;
079 }
080
081 public void parse() throws IOException, SAXException {
082 // TODO Auto-generated method stub
083
084 }
085
086 /**
087 * Takes a string and builds DOM Document of it by scanning the string and
088 * emitting SAX Events for DOMConsumer.
089 *
090 * The root element for the document will be <body>.
091 *
092 * Multiple linefeeds (1 linefeed here means \n, \r or \r\n) invoke
093 * </p>
094 * <p>. Non-empty body always contains at least one p element.
095 *
096 * [linkTarget link title] is turned into <a href="linkTarget">link title
097 * </a>. Link title
098 * [[ can be used to insert a single '['.
099 *
100 * @param source
101 * the string of which the document is built
102 * @return the Document built from the source
103 * @throws SAXException
104 * @throws IOException
105 * if there are problems when loading the DOM class
106 */
107 private void foo() throws SAXException, IOException {
108
109 try {
110 // buffer for characters to be emitted
111 // TODO change to char[]
112 StringBuilder characters = new StringBuilder();
113
114 boolean pOpen = false;
115 int lineFeedCount = 0;
116
117 String linkTarget, linkTitle;
118 int linkEndIndex = -1;
119 int linkDelimIndex = -1;
120 char linkChar;
121 boolean linkTargetValid, linkTitleValid;
122 char highSurrogate = '\u0000';
123
124 char c;
125 int i = 0;
126 int sourceLength = source.length();
127
128 // start parsing the string, one char at a time
129 this.startDocument();
130 this.startElement(Globals.XHTML_NS, "body", "body",
131 new AttributesImpl());
132 while (i < sourceLength) {
133 c = source.charAt(i);
134
135 // high surrogate
136 if('\uD800' <= c && c <= '\uDBFF') {
137 if(highSurrogate != '\u0000') {
138
139 }
140 highSurrogate = c;
141 i++;
142 continue;
143 }
144 // low surrogate
145 // else if('\uDC00' <= c && c <= '\uDFFF') {
146 //
147 //
148 // }
149 // // forbidden
150 // else if() {
151 //
152 // }
153 // // BMP
154 // else {
155 //
156 // }
157 // whitespace
158 if (isWhiteSpace(c)) {
159 // if not looking for additional lineFeed, add this
160 // whiteSpace
161 // to characters to be emitted
162 if (lineFeedCount < 1) {
163 characters.append(c);
164 }
165 i++;
166 }
167
168 // line feed
169 else if (c == '\n' || c == '\r') {
170 // emit characters this far
171 if (characters.length() > 0) {
172 this.characters(characters.toString().toCharArray(), 0,
173 characters.length());
174 characters.delete(0, characters.length());
175 }
176
177 // increase lineFeedCount
178 lineFeedCount++;
179
180 // peek for \r\n and step by 2 if found
181 if (c == '\r' && (i < (source.length() - 1))
182 && (source.charAt(i + 1) == '\n')) {
183 i += 2;
184 } else {
185 i++;
186 }
187 }
188
189 // other than whitespace or linefeeds
190 // if lineFeedCount > 0, characters shoud be empty here
191 else {
192 // emit line feed if one line feed before, reset
193 // lineFeedCount
194 if (lineFeedCount == 1) {
195 lineFeedCount = 0;
196 characters.append("\n");
197 }
198
199 // emit </p><p> if more than one line feeds before, reset
200 // lineFeedCount
201 else if (lineFeedCount > 1) {
202 if (pOpen) {
203 this.endElement(Globals.XHTML_NS, "p", "p");
204 this.startElement(Globals.XHTML_NS, "p", "p",
205 new AttributesImpl());
206 } else {
207 // TODO poista chekki
208 System.err.println("KERRO TAAVILLE 2");
209 }
210 lineFeedCount = 0;
211 }
212
213 // no lineFeed previously
214 // open p, if not yet open (at the beginning of the string)
215 else if (!pOpen) {
216 // TODO poista chekki
217 if (characters.length() > 0) {
218 System.err.println("KERRO TAAVILLE!");
219 }
220
221 this.startElement(Globals.XHTML_NS, "p", "p",
222 new AttributesImpl());
223 pOpen = true;
224 }
225
226 // now handle this character
227
228 // a link or a possible link i.e. '['
229 if (c == '[') {
230
231 // peek for [[
232 if (i < (sourceLength - 1)
233 && source.charAt(i + 1) == '[') {
234 // add [ to characters, and skip the other [
235 characters.append('[');
236 i += 2;
237
238 }
239
240 // single [, try to make a link
241 else {
242 linkEndIndex = source.indexOf("]", i);
243 linkDelimIndex = source.indexOf(" ", i);
244
245 // either no ending ] or no delim --> not a link
246 if (linkEndIndex < 0 || linkDelimIndex < 0
247 || linkEndIndex < linkDelimIndex) {
248 characters.append('[');
249 i++;
250 }
251
252 // ending ] and delim exist
253 else {
254 linkTargetValid = false;
255 linkTitleValid = false;
256 // check that link url contains something else
257 // than whitespace
258 for (int k = i + 1; k < linkDelimIndex - 1
259 && !linkTargetValid; k++) {
260 linkChar = source.charAt(k);
261 if (!isWhiteSpace(linkChar)
262 && (linkChar != '\n')
263 && (linkChar != '\r')) {
264 linkTargetValid = true;
265 }
266 }
267
268 // check that link title contains something else
269 // than whitespace
270 for (int k = linkDelimIndex; k < linkEndIndex
271 && linkTargetValid && !linkTitleValid; k++) {
272 linkChar = source.charAt(k);
273 if (!isWhiteSpace(linkChar)
274 && (linkChar != '\n')
275 && (linkChar != '\r')) {
276 linkTitleValid = true;
277 }
278 }
279
280 // either url or title is only whitespace -->
281 // not a link
282 if (!linkTargetValid || !linkTitleValid) {
283 characters.append('[');
284 i++;
285 }
286
287 // actual link
288 else {
289 // read target and title
290 linkTarget = source.substring(i + 1,
291 linkDelimIndex);
292 linkTitle = source.substring(
293 linkDelimIndex + 1, linkEndIndex);
294
295 // emit characters this far
296 if (characters.length() > 0) {
297 this.characters(characters.toString()
298 .toCharArray(), 0, characters
299 .length());
300 characters.delete(0, characters
301 .length());
302 }
303
304 // emit link element
305 AttributesImpl attributes = new AttributesImpl();
306 attributes.addAttribute("", "href", "href",
307 "CDATA", linkTarget); // namespace
308 // removed --
309 // hsivonen
310 this.startElement(Globals.XHTML_NS, "a",
311 "a", attributes);
312 this.characters(linkTitle.toCharArray(), 0,
313 linkTitle.length());
314 this.endElement(Globals.XHTML_NS, "a", "a");
315
316 // skip forward past the link
317 i = linkEndIndex + 1;
318 } // actual link
319 } // looks like a link
320 } // single '['
321 } // '[' seen
322
323 // other characters
324 else {
325 characters.append(c);
326 i++;
327 }
328 }
329 }
330
331 // finish up
332
333 // emit possible characters not yet emitted
334 if (characters.length() > 0) {
335 this.characters(characters.toString().toCharArray(), 0,
336 characters.length());
337 characters.delete(0, characters.length());
338 }
339
340 // close possible ending p
341 if (pOpen) {
342 this.endElement(Globals.XHTML_NS, "p", "p");
343 }
344 // close body and end document
345 this.endElement(Globals.XHTML_NS, "body", "body");
346 } finally {
347 this.endDocument();
348 }
349 }
350
351 /**
352 * Checks if the given char is whitespace.
353 *
354 * For now whitespace means ' ' or '\t'.
355 * @param c
356 * @return true if give char is whitespace
357 */
358 private boolean isWhiteSpace(char c) {
359 if (c == ' ' || c == '\t') {
360 return true;
361 } else {
362 return false;
363 }
364 }
365
366 private void characters(char[] a, int b, int c) throws SAXException {
367 if (this.contentHandler != null) {
368 this.contentHandler.characters(a, b, c);
369 }
370 }
371
372 private void startElement(String a, String b, String c, Attributes attrs)
373 throws SAXException {
374 if (this.contentHandler != null) {
375 this.contentHandler.startElement(a, b, c, attrs);
376 }
377 }
378
379 private void endElement(String a, String b, String c) throws SAXException {
380 if (this.contentHandler != null) {
381 this.contentHandler.endElement(a, b, c);
382 }
383 }
384
385 private void startDocument() throws SAXException {
386 if (this.contentHandler != null) {
387 this.contentHandler.startDocument();
388 }
389 }
390
391 private void endDocument() throws SAXException {
392 if (this.contentHandler != null) {
393 this.contentHandler.endDocument();
394 }
395 }
396
397 }