001 /*
002 * Copyright (c) 2006 Henri Sivonen
003 *
004 * Permission is hereby granted, free of charge, to any person obtaining a
005 * copy of this software and associated documentation files (the "Software"),
006 * to deal in the Software without restriction, including without limitation
007 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
008 * and/or sell copies of the Software, and to permit persons to whom the
009 * Software is furnished to do so, subject to the following conditions:
010 *
011 * The above copyright notice and this permission notice shall be included in
012 * all copies or substantial portions of the Software.
013 *
014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
020 * DEALINGS IN THE SOFTWARE.
021 */
022
023 package fi.iki.hsivonen.xml.checker;
024
025 import org.xml.sax.Attributes;
026 import org.xml.sax.SAXException;
027
028 import com.ibm.icu.lang.UCharacter;
029 import com.ibm.icu.text.Normalizer;
030 import com.ibm.icu.text.UnicodeSet;
031
032 /**
033 * Checks that the following constructs do not start with a composing character:
034 * <ul>
035 * <li>Local names of elements
036 * <li>Local names of attributes
037 * <li>Attribute values
038 * <li>Declared namespace prefixes
039 * <li>Declared namespace URIs
040 * <li>PI targets
041 * <li>PI data
042 * <li>Concatenations of consecutive character data between element
043 * boundaries and PIs ignoring comments and CDATA section boundaries.
044 * </ul>
045 * <p>Checks that the following constructs are in the Unicode Normalization
046 * Form C. (It is assumed the normalization of the rest of the constructs
047 * is enforced by other means, such as checking the document source for
048 * normalization.)
049 * <ul>
050 * <li>Attribute values
051 * <li>PI data
052 * <li>Concatenations of consecutive character data between element
053 * boundaries and PIs ignoring comments and CDATA section boundaries.
054 * </ul>
055 * <p>All <code>String</code>s must be valid UTF-16!
056 * <p>This class can also be used in a source code mode where the source
057 * code of the document is fed to <code>characters()</code>. The mode
058 * modifies the error messages appropriately.
059 *
060 * @version $Id: NormalizationChecker.java,v 1.6 2006/12/01 12:34:31 hsivonen Exp $
061 * @author hsivonen
062 */
063 public final class NormalizationChecker extends Checker {
064
065 /**
066 * A thread-safe set of composing characters as per Charmod Norm.
067 */
068 @SuppressWarnings("deprecation")
069 private static final UnicodeSet COMPOSING_CHARACTERS = (UnicodeSet) new UnicodeSet(
070 "[[:nfc_qc=maybe:][:^ccc=0:]]").freeze();
071 // see http://sourceforge.net/mailarchive/message.php?msg_id=37279908
072
073 /**
074 * A buffer for holding sequences overlap the SAX buffer boundary.
075 */
076 private char[] buf = new char[128];
077
078 /**
079 * A holder for the original buffer (for the memory leak prevention
080 * mechanism).
081 */
082 private char[] bufHolder = null;
083
084 /**
085 * The current used length of the buffer, i.e. the index of the first slot
086 * that does not hold current data.
087 */
088 private int pos;
089
090 /**
091 * Indicates whether the checker the next call to <code>characters()</code>
092 * is the first call in a run.
093 */
094 private boolean atStartOfRun;
095
096 /**
097 * Indicates whether the current run has already caused an error.
098 */
099 private boolean alreadyComplainedAboutThisRun;
100
101 /**
102 * Indicates whether error messages related to source code checking should
103 * be used.
104 */
105 private final boolean sourceTextMode;
106
107 /**
108 * Returns <code>true</code> if the argument is a composing BMP character
109 * or a surrogate and <code>false</code> otherwise.
110 *
111 * @param c a UTF-16 code unit
112 * @return <code>true</code> if the argument is a composing BMP character
113 * or a surrogate and <code>false</code> otherwise
114 */
115 private static boolean isComposingCharOrSurrogate(char c) {
116 if (UCharacter.isHighSurrogate(c) || UCharacter.isLowSurrogate(c)) {
117 return true;
118 }
119 return isComposingChar(c);
120 }
121
122 /**
123 * Returns <code>true</code> if the argument is a composing character
124 * and <code>false</code> otherwise.
125 *
126 * @param c a Unicode code point
127 * @return <code>true</code> if the argument is a composing character
128 * <code>false</code> otherwise
129 */
130 private static boolean isComposingChar(int c) {
131 return COMPOSING_CHARACTERS.contains(c);
132 }
133
134 /**
135 * Returns <code>true</code> if the argument starts with a composing
136 * character and <code>false</code> otherwise.
137 *
138 * @param str a string
139 * @return <code>true</code> if the argument starts with a composing
140 * character and <code>false</code> otherwise.
141 * @throws SAXException on malformed UTF-16
142 */
143 public static boolean startsWithComposingChar(String str)
144 throws SAXException {
145 if (str.length() == 0) {
146 return false;
147 }
148 int first32;
149 char first = str.charAt(0);
150 if (UCharacter.isHighSurrogate(first)) {
151 try {
152 char second = str.charAt(1);
153 first32 = UCharacter.getCodePoint(first, second);
154 } catch (StringIndexOutOfBoundsException e) {
155 throw new SAXException("Malformed UTF-16!");
156 }
157 } else {
158 first32 = first;
159 }
160 return isComposingChar(first32);
161 }
162
163 /**
164 * Constructor for non-source mode.
165 */
166 public NormalizationChecker() {
167 this(false);
168 }
169
170 /**
171 * Constructor with mode selection.
172 *
173 * @param sourceTextMode whether the source text-related messages
174 * should be enabled.
175 */
176 public NormalizationChecker(boolean sourceTextMode) {
177 super();
178 this.sourceTextMode = sourceTextMode;
179 reset();
180 }
181
182 /**
183 * @see fi.iki.hsivonen.xml.checker.Checker#reset()
184 */
185 public void reset() {
186 atStartOfRun = true;
187 alreadyComplainedAboutThisRun = false;
188 pos = 0;
189 if (bufHolder != null) {
190 // restore the original small buffer to avoid leaking
191 // memory if this checker is recycled
192 buf = bufHolder;
193 bufHolder = null;
194 }
195 }
196
197 /**
198 * In the normal mode, this method has the usual SAX semantics. In the
199 * source text mode, this method is used for reporting the source text.
200 *
201 * @see fi.iki.hsivonen.xml.checker.Checker#characters(char[], int, int)
202 */
203 public void characters(char[] ch, int start, int length)
204 throws SAXException {
205 if (alreadyComplainedAboutThisRun) {
206 return;
207 }
208 if (atStartOfRun) {
209 char c = ch[start];
210 if (pos == 1) {
211 // there's a single high surrogate in buf
212 if (isComposingChar(UCharacter.getCodePoint(buf[0], c))) {
213 err("Text run starts with a composing character.");
214 }
215 atStartOfRun = false;
216 } else {
217 if (length == 1 && UCharacter.isHighSurrogate(c)) {
218 buf[0] = c;
219 pos = 1;
220 return;
221 } else {
222 if (UCharacter.isHighSurrogate(c)) {
223 if (isComposingChar(UCharacter.getCodePoint(c,
224 ch[start + 1]))) {
225 err("Text run starts with a composing character.");
226 }
227 } else {
228 if (isComposingCharOrSurrogate(c)) {
229 err("Text run starts with a composing character.");
230 }
231 }
232 atStartOfRun = false;
233 }
234 }
235 }
236 int i = start;
237 int stop = start + length;
238 if (pos > 0) {
239 // there's stuff in buf
240 while (i < stop && isComposingCharOrSurrogate(ch[i])) {
241 i++;
242 }
243 appendToBuf(ch, start, i);
244 if (i == stop) {
245 return;
246 } else {
247 if (!Normalizer.isNormalized(buf, 0, pos, Normalizer.NFC, 0)) {
248 errAboutTextRun();
249 }
250 pos = 0;
251 }
252 }
253 if (i < stop) {
254 start = i;
255 i = stop - 1;
256 while (i > start && isComposingCharOrSurrogate(ch[i])) {
257 i--;
258 }
259 if (i > start) {
260 if (!Normalizer.isNormalized(ch, start, i, Normalizer.NFC, 0)) {
261 errAboutTextRun();
262 }
263 }
264 appendToBuf(ch, i, stop);
265 }
266 }
267
268 /**
269 * Emits an error stating that the current text run or the source
270 * text is not in NFC.
271 *
272 * @throws SAXException if the <code>ErrorHandler</code> throws
273 */
274 private void errAboutTextRun() throws SAXException {
275 if (sourceTextMode) {
276 err("Source text is not in Unicode Normalization Form C.");
277 } else {
278 err("Text run is not in Unicode Normalization Form C.");
279 }
280 alreadyComplainedAboutThisRun = true;
281 }
282
283 /**
284 * Appends a slice of an UTF-16 code unit array to the internal
285 * buffer.
286 *
287 * @param ch the array from which to copy
288 * @param start the index of the first element that is copied
289 * @param end the index of the first element that is not copied
290 */
291 private void appendToBuf(char[] ch, int start, int end) {
292 if (start == end) {
293 return;
294 }
295 int neededBufLen = pos + (end - start);
296 if (neededBufLen > buf.length) {
297 char[] newBuf = new char[neededBufLen];
298 System.arraycopy(buf, 0, newBuf, 0, pos);
299 if (bufHolder == null) {
300 bufHolder = buf; // keep the original around
301 }
302 buf = newBuf;
303 }
304 System.arraycopy(ch, start, buf, pos, end - start);
305 pos += (end - start);
306 }
307
308 /**
309 * @see fi.iki.hsivonen.xml.checker.Checker#endElement(java.lang.String,
310 * java.lang.String, java.lang.String)
311 */
312 public void endElement(String uri, String localName, String qName)
313 throws SAXException {
314 flush();
315 }
316
317 /**
318 * @see fi.iki.hsivonen.xml.checker.Checker#processingInstruction(java.lang.String,
319 * java.lang.String)
320 */
321 public void processingInstruction(String target, String data)
322 throws SAXException {
323 flush();
324 if (!"".equals(target)) {
325 if (startsWithComposingChar(target)) {
326 err("Processing instruction target starts with a composing character.");
327 }
328 }
329 if (!"".equals(data)) {
330 if (startsWithComposingChar(data)) {
331 err("Processing instruction data starts with a composing character.");
332 } else if (!Normalizer.isNormalized(data, Normalizer.NFC, 0)) {
333 err("Processing instruction data in not in Unicode Normalization Form C.");
334 }
335 }
336 }
337
338 /**
339 * @see fi.iki.hsivonen.xml.checker.Checker#startElement(java.lang.String,
340 * java.lang.String, java.lang.String, org.xml.sax.Attributes)
341 */
342 public void startElement(String uri, String localName, String qName,
343 Attributes atts) throws SAXException {
344 flush();
345 if (startsWithComposingChar(localName)) {
346 err("Element name \u201C " + localName
347 + "\u201D starts with a composing character.");
348 }
349
350 int len = atts.getLength();
351 for (int i = 0; i < len; i++) {
352 String name = atts.getLocalName(i);
353 if (startsWithComposingChar(name)) {
354 err("Attribute name \u201C " + localName
355 + "\u201D starts with a composing character.");
356 }
357
358 String value = atts.getValue(i);
359 if (!"".equals(value)) {
360 if (startsWithComposingChar(value)) {
361 err("The value of attribute \u201C"
362 + atts.getLocalName(i)
363 + "\u201D"
364 + ("".equals(atts.getURI(i)) ? ""
365 : " in namespace \u201C" + atts.getURI(i)
366 + "\u201D") + " on element \u201C"
367 + localName + "\u201D from namespace \u201C" + uri
368 + "\u201D starts with a composing character.");
369 } else if (!Normalizer.isNormalized(value, Normalizer.NFC, 0)) {
370 err("The value of attribute \u201C"
371 + atts.getLocalName(i)
372 + "\u201D"
373 + ("".equals(atts.getURI(i)) ? ""
374 : " in namespace \u201C" + atts.getURI(i)
375 + "\u201D") + " on element \u201C"
376 + localName + "\u201D from namespace \u201C" + uri
377 + "\u201D is not in Unicode Normalization Form C.");
378 }
379 }
380 }
381 }
382
383 /**
384 * @see fi.iki.hsivonen.xml.checker.Checker#startPrefixMapping(java.lang.String, java.lang.String)
385 */
386 public void startPrefixMapping(String prefix, String uri)
387 throws SAXException {
388 if (startsWithComposingChar(prefix)) {
389 err("Namespace prefix \u201C " + prefix
390 + "\u201D starts with a composing character.");
391 }
392 if (startsWithComposingChar(uri)) {
393 err("Namespace URI \u201C " + uri
394 + "\u201D starts with a composing character.");
395 }
396 }
397
398 /**
399 * Called to indicate the end of a run of characters. When this class is
400 * used for checking source text, this method should be called after all
401 * the calls to <code>characters()</code>.
402 *
403 * @throws SAXException if the <code>ErrorHandler</code> throws.
404 */
405 public void flush() throws SAXException {
406 if (!alreadyComplainedAboutThisRun
407 && !Normalizer.isNormalized(buf, 0, pos, Normalizer.NFC, 0)) {
408 errAboutTextRun();
409 }
410 reset();
411 }
412
413 }