001    /*
002     * Copyright (c) 2006 Henri Sivonen
003     *
004     * Permission is hereby granted, free of charge, to any person obtaining a 
005     * copy of this software and associated documentation files (the "Software"), 
006     * to deal in the Software without restriction, including without limitation 
007     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
008     * and/or sell copies of the Software, and to permit persons to whom the 
009     * Software is furnished to do so, subject to the following conditions:
010     *
011     * The above copyright notice and this permission notice shall be included in 
012     * all copies or substantial portions of the Software.
013     *
014     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
015     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
016     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
017     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
018     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
019     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
020     * DEALINGS IN THE SOFTWARE.
021     */
022    
023    package fi.iki.hsivonen.xml.checker;
024    
025    import org.xml.sax.Attributes;
026    import org.xml.sax.SAXException;
027    
028    import com.ibm.icu.lang.UCharacter;
029    import com.ibm.icu.text.Normalizer;
030    import com.ibm.icu.text.UnicodeSet;
031    
032    /**
033     * Checks that the following constructs do not start with a composing character:
034     * <ul>
035     * <li>Local names of elements
036     * <li>Local names of attributes
037     * <li>Attribute values
038     * <li>Declared namespace prefixes
039     * <li>Declared namespace URIs
040     * <li>PI targets
041     * <li>PI data
042     * <li>Concatenations of consecutive character data between element
043     *  boundaries and PIs ignoring comments and CDATA section boundaries.
044     * </ul>
045     * <p>Checks that the following constructs are in the Unicode Normalization 
046     * Form C. (It is assumed the normalization of the rest of the constructs 
047     * is enforced by other means, such as checking the document source for 
048     * normalization.)
049     * <ul>
050     * <li>Attribute values
051     * <li>PI data
052     * <li>Concatenations of consecutive character data between element
053     *  boundaries and PIs ignoring comments and CDATA section boundaries.
054     * </ul>
055     * <p>All <code>String</code>s must be valid UTF-16!
056     * <p>This class can also be used as a source code mode where the source 
057     * code of the document is fed to <code>characters()</code>. The mode 
058     * modifies the error messages appropriately.
059     * 
060     * @version $Id: NormalizationChecker.java,v 1.6 2006/12/01 12:34:31 hsivonen Exp $
061     * @author hsivonen
062     */
063    public final class NormalizationChecker extends Checker {
064    
065        /**
066         * A thread-safe set of composing characters as per Charmod Norm.
067         */
068        @SuppressWarnings("deprecation")
069        private static final UnicodeSet COMPOSING_CHARACTERS = (UnicodeSet) new UnicodeSet(
070                "[[:nfc_qc=maybe:][:^ccc=0:]]").freeze();
071        // see http://sourceforge.net/mailarchive/message.php?msg_id=37279908
072    
073        /**
074         * A buffer for holding sequences overlap the SAX buffer boundary.
075         */
076        private char[] buf = new char[128];
077    
078        /**
079         * A holder for the original buffer (for the memory leak prevention 
080         * mechanism).
081         */
082        private char[] bufHolder = null;    
083        
084        /**
085         * The current used length of the buffer, i.e. the index of the first slot 
086         * that does not hold current data.
087         */
088        private int pos;
089    
090        /**
091         * Indicates whether the checker the next call to <code>characters()</code> 
092         * is the first call in a run.
093         */
094        private boolean atStartOfRun;
095    
096        /**
097         * Indicates whether the current run has already caused an error.
098         */
099        private boolean alreadyComplainedAboutThisRun;
100    
101        /**
102         * Indicates whether error messages related to source code checking should 
103         * be used.
104         */
105        private final boolean sourceTextMode;
106    
107        /**
108         * Returns <code>true</code> if the argument is a composing BMP character 
109         * or a surrogate and <code>false</code> otherwise.
110         * 
111         * @param c a UTF-16 code unit
112         * @return <code>true</code> if the argument is a composing BMP character 
113         * or a surrogate and <code>false</code> otherwise
114         */
115        private static boolean isComposingCharOrSurrogate(char c) {
116            if (UCharacter.isHighSurrogate(c) || UCharacter.isLowSurrogate(c)) {
117                return true;
118            }
119            return isComposingChar(c);
120        }
121    
122        /**
123         * Returns <code>true</code> if the argument is a composing character 
124         * and <code>false</code> otherwise.
125         * 
126         * @param c a Unicode code point
127         * @return <code>true</code> if the argument is a composing character 
128         * <code>false</code> otherwise
129         */
130        private static boolean isComposingChar(int c) {
131            return COMPOSING_CHARACTERS.contains(c);
132        }
133    
134        /**
135         * Returns <code>true</code> if the argument starts with a composing 
136         * character and <code>false</code> otherwise.
137         * 
138         * @param str a string
139         * @return <code>true</code> if the argument starts with a composing 
140         * character and <code>false</code> otherwise.
141         * @throws SAXException on malformed UTF-16
142         */
143        public static boolean startsWithComposingChar(String str)
144                throws SAXException {
145            if (str.length() == 0) {
146                return false;
147            }
148            int first32;
149            char first = str.charAt(0);
150            if (UCharacter.isHighSurrogate(first)) {
151                try {
152                    char second = str.charAt(1);
153                    first32 = UCharacter.getCodePoint(first, second);
154                } catch (StringIndexOutOfBoundsException e) {
155                    throw new SAXException("Malformed UTF-16!");
156                }
157            } else {
158                first32 = first;
159            }
160            return isComposingChar(first32);
161        }
162    
163        /**
164         * Constructor for non-source mode.
165         */
166        public NormalizationChecker() {
167            this(false);
168        }
169    
170        /**
171         * Constructor with mode selection.
172         * 
173         * @param sourceTextMode whether the source text-related messages 
174         * should be enabled.
175         */
176        public NormalizationChecker(boolean sourceTextMode) {
177            super();
178            this.sourceTextMode = sourceTextMode;
179            reset();
180        }
181    
182        /**
183         * @see fi.iki.hsivonen.xml.checker.Checker#reset()
184         */
185        public void reset() {
186            atStartOfRun = true;
187            alreadyComplainedAboutThisRun = false;
188            pos = 0;
189            if (bufHolder != null) {
190                // restore the original small buffer to avoid leaking
191                // memory if this checker is recycled
192                buf = bufHolder;
193                bufHolder = null;
194            }
195        }
196    
197        /**
198         * In the normal mode, this method has the usual SAX semantics. In the 
199         * source text mode, this method is used for reporting the source text.
200         * 
201         * @see fi.iki.hsivonen.xml.checker.Checker#characters(char[], int, int)
202         */
203        public void characters(char[] ch, int start, int length)
204                throws SAXException {
205            if (alreadyComplainedAboutThisRun) {
206                return;
207            }
208            if (atStartOfRun) {
209                char c = ch[start];
210                if (pos == 1) {
211                    // there's a single high surrogate in buf
212                    if (isComposingChar(UCharacter.getCodePoint(buf[0], c))) {
213                        err("Text run starts with a composing character.");
214                    }
215                    atStartOfRun = false;
216                } else {
217                    if (length == 1 && UCharacter.isHighSurrogate(c)) {
218                        buf[0] = c;
219                        pos = 1;
220                        return;
221                    } else {
222                        if (UCharacter.isHighSurrogate(c)) {
223                            if (isComposingChar(UCharacter.getCodePoint(c,
224                                    ch[start + 1]))) {
225                                err("Text run starts with a composing character.");
226                            }
227                        } else {
228                            if (isComposingCharOrSurrogate(c)) {
229                                err("Text run starts with a composing character.");
230                            }
231                        }
232                        atStartOfRun = false;
233                    }
234                }
235            }
236            int i = start;
237            int stop = start + length;
238            if (pos > 0) {
239                // there's stuff in buf
240                while (i < stop && isComposingCharOrSurrogate(ch[i])) {
241                    i++;
242                }
243                appendToBuf(ch, start, i);
244                if (i == stop) {
245                    return;
246                } else {
247                    if (!Normalizer.isNormalized(buf, 0, pos, Normalizer.NFC, 0)) {
248                        errAboutTextRun();
249                    }
250                    pos = 0;
251                }
252            }
253            if (i < stop) {
254                start = i;
255                i = stop - 1;
256                while (i > start && isComposingCharOrSurrogate(ch[i])) {
257                    i--;
258                }
259                if (i > start) {
260                    if (!Normalizer.isNormalized(ch, start, i, Normalizer.NFC, 0)) {
261                        errAboutTextRun();
262                    }
263                }
264                appendToBuf(ch, i, stop);
265            }
266        }
267    
268        /**
269         * Emits an error stating that the current text run or the source 
270         * text is not in NFC.
271         * 
272         * @throws SAXException if the <code>ErrorHandler</code> throws
273         */
274        private void errAboutTextRun() throws SAXException {
275            if (sourceTextMode) {
276                err("Source text is not in Unicode Normalization Form C.");
277            } else {
278                err("Text run is not in Unicode Normalization Form C.");
279            }
280            alreadyComplainedAboutThisRun = true;
281        }
282    
283        /**
284         * Appends a slice of an UTF-16 code unit array to the internal 
285         * buffer.
286         * 
287         * @param ch the array from which to copy
288         * @param start the index of the first element that is copied
289         * @param end the index of the first element that is not copied
290         */
291        private void appendToBuf(char[] ch, int start, int end) {
292            if (start == end) {
293                return;
294            }
295            int neededBufLen = pos + (end - start);
296            if (neededBufLen > buf.length) {
297                char[] newBuf = new char[neededBufLen];
298                System.arraycopy(buf, 0, newBuf, 0, pos);
299                if (bufHolder == null) {
300                    bufHolder = buf; // keep the original around
301                }
302                buf = newBuf;
303            }
304            System.arraycopy(ch, start, buf, pos, end - start);
305            pos += (end - start);
306        }
307    
308        /**
309         * @see fi.iki.hsivonen.xml.checker.Checker#endElement(java.lang.String,
310         *      java.lang.String, java.lang.String)
311         */
312        public void endElement(String uri, String localName, String qName)
313                throws SAXException {
314            flush();
315        }
316    
317        /**
318         * @see fi.iki.hsivonen.xml.checker.Checker#processingInstruction(java.lang.String,
319         *      java.lang.String)
320         */
321        public void processingInstruction(String target, String data)
322                throws SAXException {
323            flush();
324            if (!"".equals(target)) {
325                if (startsWithComposingChar(target)) {
326                    err("Processing instruction target starts with a composing character.");
327                }
328            }
329            if (!"".equals(data)) {
330                if (startsWithComposingChar(data)) {
331                    err("Processing instruction data starts with a composing character.");
332                } else if (!Normalizer.isNormalized(data, Normalizer.NFC, 0)) {
333                    err("Processing instruction data in not in Unicode Normalization Form C.");
334                }
335            }
336        }
337    
338        /**
339         * @see fi.iki.hsivonen.xml.checker.Checker#startElement(java.lang.String,
340         *      java.lang.String, java.lang.String, org.xml.sax.Attributes)
341         */
342        public void startElement(String uri, String localName, String qName,
343                Attributes atts) throws SAXException {
344            flush();
345            if (startsWithComposingChar(localName)) {
346                err("Element name \u201C " + localName
347                        + "\u201D starts with a composing character.");
348            }
349    
350            int len = atts.getLength();
351            for (int i = 0; i < len; i++) {
352                String name = atts.getLocalName(i);
353                if (startsWithComposingChar(name)) {
354                    err("Attribute name \u201C " + localName
355                            + "\u201D starts with a composing character.");
356                }
357    
358                String value = atts.getValue(i);
359                if (!"".equals(value)) {
360                    if (startsWithComposingChar(value)) {
361                        err("The value of attribute \u201C"
362                                + atts.getLocalName(i)
363                                + "\u201D"
364                                + ("".equals(atts.getURI(i)) ? ""
365                                        : " in namespace \u201C" + atts.getURI(i)
366                                                + "\u201D") + " on element \u201C"
367                                + localName + "\u201D from namespace \u201C" + uri
368                                + "\u201D starts with a composing character.");
369                    } else if (!Normalizer.isNormalized(value, Normalizer.NFC, 0)) {
370                        err("The value of attribute \u201C"
371                                + atts.getLocalName(i)
372                                + "\u201D"
373                                + ("".equals(atts.getURI(i)) ? ""
374                                        : " in namespace \u201C" + atts.getURI(i)
375                                                + "\u201D") + " on element \u201C"
376                                + localName + "\u201D from namespace \u201C" + uri
377                                + "\u201D is not in Unicode Normalization Form C.");
378                    }
379                }
380            }
381        }
382    
383        /**
384         * @see fi.iki.hsivonen.xml.checker.Checker#startPrefixMapping(java.lang.String, java.lang.String)
385         */
386        public void startPrefixMapping(String prefix, String uri)
387                throws SAXException {
388            if (startsWithComposingChar(prefix)) {
389                err("Namespace prefix \u201C " + prefix
390                        + "\u201D starts with a composing character.");
391            }
392            if (startsWithComposingChar(uri)) {
393                err("Namespace URI \u201C " + uri
394                        + "\u201D starts with a composing character.");
395            }
396        }
397    
398        /**
399         * Called to indicate the end of a run of characters. When this class is 
400         * used for checking source text, this method should be called after all 
401         * the calls to <code>characters()</code>.
402         * 
403         * @throws SAXException if the <code>ErrorHandler</code> throws.
404         */
405        public void flush() throws SAXException {
406            if (!alreadyComplainedAboutThisRun
407                    && !Normalizer.isNormalized(buf, 0, pos, Normalizer.NFC, 0)) {
408                errAboutTextRun();
409            }
410            reset();
411        }
412    
413    }