001    /*
002     * Copyright (c) 2003, 2004 Taavi Hupponen and 2004 Henri Sivonen
003     *
004     * Permission is hereby granted, free of charge, to any person obtaining a 
005     * copy of this software and associated documentation files (the "Software"), 
006     * to deal in the Software without restriction, including without limitation 
007     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
008     * and/or sell copies of the Software, and to permit persons to whom the 
009     * Software is furnished to do so, subject to the following conditions:
010     *
011     * The above copyright notice and this permission notice shall be included in 
012     * all copies or substantial portions of the Software.
013     *
014     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
015     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
016     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
017     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
018     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
019     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
020     * DEALINGS IN THE SOFTWARE.
021     */
022    
023    package fi.iki.hsivonen.xml;
024    
025    import java.io.IOException;
026    
027    import org.xml.sax.Attributes;
028    import org.xml.sax.ContentHandler;
029    import org.xml.sax.ErrorHandler;
030    import org.xml.sax.SAXException;
031    import org.xml.sax.helpers.AttributesImpl;
032    
033    import fi.iki.hsivonen.webcms.application.Globals;
034    
035    /**
036     * TODO Warning: Not tested after refactoring. Need to check for permissible characters.
037     * 
038     * 
039     * @version $Id: SimpleWikiParser.java,v 1.2 2006/11/18 00:05:24 hsivonen Exp $
040     * @author taavi
041     * @author hsivonen
042     */
043    public class SimpleWikiParser {
044    
045        private ContentHandler contentHandler;
046    
047        private ErrorHandler errorHandler;
048    
049        private String source;
050        
051        private String title;
052    
053        /**
054         * @see org.xml.sax.XMLReader#setContentHandler(org.xml.sax.ContentHandler)
055         */
056        public void setContentHandler(ContentHandler arg0) {
057            this.contentHandler = arg0;
058        }
059    
060        /**
061         * @see org.xml.sax.XMLReader#getContentHandler()
062         */
063        public ContentHandler getContentHandler() {
064            return this.contentHandler;
065        }
066    
067        /**
068         * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
069         */
070        public void setErrorHandler(ErrorHandler arg0) {
071            this.errorHandler = arg0;
072        }
073    
074        /**
075         * @see org.xml.sax.XMLReader#getErrorHandler()
076         */
077        public ErrorHandler getErrorHandler() {
078            return this.errorHandler;
079        }
080    
081        public void parse() throws IOException, SAXException {
082            // TODO Auto-generated method stub
083    
084        }
085    
086        /**
087         * Takes a string and builds DOM Document of it by scanning the string and
088         * emitting SAX Events for DOMConsumer.
089         * 
090         * The root element for the document will be <body>.
091         * 
092         * Multiple linefeeds (1 linefeed here means \n, \r or \r\n) invoke
093         * </p>
094         * <p>. Non-empty body always contains at least one p element.
095         * 
096         * [linkTarget link title] is turned into <a href="linkTarget">link title
097         * </a>. Link title
098         *  [[ can be used to insert a single '['.
099         * 
100         * @param source
101         *            the string of which the document is built
102         * @return the Document built from the source
103         * @throws SAXException
104         * @throws IOException
105         *             if there are problems when loading the DOM class
106         */
107        private void foo() throws SAXException, IOException {
108    
109            try {
110                // buffer for characters to be emitted
111                // TODO change to char[]
112                StringBuilder characters = new StringBuilder();
113    
114                boolean pOpen = false;
115                int lineFeedCount = 0;
116    
117                String linkTarget, linkTitle;
118                int linkEndIndex = -1;
119                int linkDelimIndex = -1;
120                char linkChar;
121                boolean linkTargetValid, linkTitleValid;
122                char highSurrogate = '\u0000';
123    
124                char c;
125                int i = 0;
126                int sourceLength = source.length();
127    
128                // start parsing the string, one char at a time
129                this.startDocument();
130                this.startElement(Globals.XHTML_NS, "body", "body",
131                        new AttributesImpl());
132                while (i < sourceLength) {
133                    c = source.charAt(i);
134    
135                    // high surrogate
136                    if('\uD800' <= c && c <= '\uDBFF') {
137                        if(highSurrogate != '\u0000') {
138                            
139                        }
140                       highSurrogate = c;
141                       i++;
142                       continue;
143                    } 
144                    // low surrogate
145    //                else if('\uDC00' <= c && c <= '\uDFFF') {
146    //                    
147    //                    
148    //                } 
149    //                // forbidden 
150    //                else if() {
151    //                    
152    //                }
153    //                // BMP
154    //                else {
155    //                    
156    //                }
157                    // whitespace
158                    if (isWhiteSpace(c)) {
159                        // if not looking for additional lineFeed, add this
160                        // whiteSpace
161                        // to characters to be emitted
162                        if (lineFeedCount < 1) {
163                            characters.append(c);
164                        }
165                        i++;
166                    }
167    
168                    // line feed
169                    else if (c == '\n' || c == '\r') {
170                        // emit characters this far
171                        if (characters.length() > 0) {
172                            this.characters(characters.toString().toCharArray(), 0,
173                                    characters.length());
174                            characters.delete(0, characters.length());
175                        }
176    
177                        // increase lineFeedCount
178                        lineFeedCount++;
179    
180                        // peek for \r\n and step by 2 if found
181                        if (c == '\r' && (i < (source.length() - 1))
182                                && (source.charAt(i + 1) == '\n')) {
183                            i += 2;
184                        } else {
185                            i++;
186                        }
187                    }
188    
189                    // other than whitespace or linefeeds
190                    // if lineFeedCount > 0, characters shoud be empty here
191                    else {
192                        // emit line feed if one line feed before, reset
193                        // lineFeedCount
194                        if (lineFeedCount == 1) {
195                            lineFeedCount = 0;
196                            characters.append("\n");
197                        }
198    
199                        // emit </p><p> if more than one line feeds before, reset
200                        // lineFeedCount
201                        else if (lineFeedCount > 1) {
202                            if (pOpen) {
203                                this.endElement(Globals.XHTML_NS, "p", "p");
204                                this.startElement(Globals.XHTML_NS, "p", "p",
205                                        new AttributesImpl());
206                            } else {
207                                // TODO poista chekki
208                                System.err.println("KERRO TAAVILLE 2");
209                            }
210                            lineFeedCount = 0;
211                        }
212    
213                        // no lineFeed previously
214                        // open p, if not yet open (at the beginning of the string)
215                        else if (!pOpen) {
216                            // TODO poista chekki
217                            if (characters.length() > 0) {
218                                System.err.println("KERRO TAAVILLE!");
219                            }
220    
221                            this.startElement(Globals.XHTML_NS, "p", "p",
222                                    new AttributesImpl());
223                            pOpen = true;
224                        }
225    
226                        // now handle this character
227    
228                        // a link or a possible link i.e. '['
229                        if (c == '[') {
230    
231                            // peek for [[
232                            if (i < (sourceLength - 1)
233                                    && source.charAt(i + 1) == '[') {
234                                // add [ to characters, and skip the other [
235                                characters.append('[');
236                                i += 2;
237    
238                            }
239    
240                            // single [, try to make a link
241                            else {
242                                linkEndIndex = source.indexOf("]", i);
243                                linkDelimIndex = source.indexOf(" ", i);
244    
245                                // either no ending ] or no delim --> not a link
246                                if (linkEndIndex < 0 || linkDelimIndex < 0
247                                        || linkEndIndex < linkDelimIndex) {
248                                    characters.append('[');
249                                    i++;
250                                }
251    
252                                // ending ] and delim exist
253                                else {
254                                    linkTargetValid = false;
255                                    linkTitleValid = false;
256                                    // check that link url contains something else
257                                    // than whitespace
258                                    for (int k = i + 1; k < linkDelimIndex - 1
259                                            && !linkTargetValid; k++) {
260                                        linkChar = source.charAt(k);
261                                        if (!isWhiteSpace(linkChar)
262                                                && (linkChar != '\n')
263                                                && (linkChar != '\r')) {
264                                            linkTargetValid = true;
265                                        }
266                                    }
267    
268                                    // check that link title contains something else
269                                    // than whitespace
270                                    for (int k = linkDelimIndex; k < linkEndIndex
271                                            && linkTargetValid && !linkTitleValid; k++) {
272                                        linkChar = source.charAt(k);
273                                        if (!isWhiteSpace(linkChar)
274                                                && (linkChar != '\n')
275                                                && (linkChar != '\r')) {
276                                            linkTitleValid = true;
277                                        }
278                                    }
279    
280                                    // either url or title is only whitespace -->
281                                    // not a link
282                                    if (!linkTargetValid || !linkTitleValid) {
283                                        characters.append('[');
284                                        i++;
285                                    }
286    
287                                    // actual link
288                                    else {
289                                        // read target and title
290                                        linkTarget = source.substring(i + 1,
291                                                linkDelimIndex);
292                                        linkTitle = source.substring(
293                                                linkDelimIndex + 1, linkEndIndex);
294    
295                                        // emit characters this far
296                                        if (characters.length() > 0) {
297                                            this.characters(characters.toString()
298                                                    .toCharArray(), 0, characters
299                                                    .length());
300                                            characters.delete(0, characters
301                                                    .length());
302                                        }
303    
304                                        // emit link element
305                                        AttributesImpl attributes = new AttributesImpl();
306                                        attributes.addAttribute("", "href", "href",
307                                                "CDATA", linkTarget); // namespace
308                                        // removed --
309                                        // hsivonen
310                                        this.startElement(Globals.XHTML_NS, "a",
311                                                "a", attributes);
312                                        this.characters(linkTitle.toCharArray(), 0,
313                                                linkTitle.length());
314                                        this.endElement(Globals.XHTML_NS, "a", "a");
315    
316                                        // skip forward past the link
317                                        i = linkEndIndex + 1;
318                                    } // actual link
319                                } // looks like a link
320                            } // single '['
321                        } // '[' seen
322    
323                        // other characters
324                        else {
325                            characters.append(c);
326                            i++;
327                        }
328                    }
329                }
330    
331                // finish up
332    
333                // emit possible characters not yet emitted
334                if (characters.length() > 0) {
335                    this.characters(characters.toString().toCharArray(), 0,
336                            characters.length());
337                    characters.delete(0, characters.length());
338                }
339    
340                // close possible ending p
341                if (pOpen) {
342                    this.endElement(Globals.XHTML_NS, "p", "p");
343                }
344                // close body and end document
345                this.endElement(Globals.XHTML_NS, "body", "body");
346            } finally {
347                this.endDocument();
348            }
349        }
350    
351        /**
352         * Checks if the given char is whitespace.
353         * 
354         * For now whitespace means ' ' or '\t'.
355         * @param c
356         * @return true if give char is whitespace
357         */
358        private boolean isWhiteSpace(char c) {
359            if (c == ' ' || c == '\t') {
360                return true;
361            } else {
362                return false;
363            }
364        }
365    
366        private void characters(char[] a, int b, int c) throws SAXException {
367            if (this.contentHandler != null) {
368                this.contentHandler.characters(a, b, c);
369            }
370        }
371    
372        private void startElement(String a, String b, String c, Attributes attrs)
373                throws SAXException {
374            if (this.contentHandler != null) {
375                this.contentHandler.startElement(a, b, c, attrs);
376            }
377        }
378    
379        private void endElement(String a, String b, String c) throws SAXException {
380            if (this.contentHandler != null) {
381                this.contentHandler.endElement(a, b, c);
382            }
383        }
384    
385        private void startDocument() throws SAXException {
386            if (this.contentHandler != null) {
387                this.contentHandler.startDocument();
388            }
389        }
390    
391        private void endDocument() throws SAXException {
392            if (this.contentHandler != null) {
393                this.contentHandler.endDocument();
394            }
395        }
396    
397    }