001 /* 002 * Copyright (c) 2006 Henri Sivonen 003 * 004 * Permission is hereby granted, free of charge, to any person obtaining a 005 * copy of this software and associated documentation files (the "Software"), 006 * to deal in the Software without restriction, including without limitation 007 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 008 * and/or sell copies of the Software, and to permit persons to whom the 009 * Software is furnished to do so, subject to the following conditions: 010 * 011 * The above copyright notice and this permission notice shall be included in 012 * all copies or substantial portions of the Software. 013 * 014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 020 * DEALINGS IN THE SOFTWARE. 021 */ 022 023 package fi.iki.hsivonen.xml.checker; 024 025 import org.xml.sax.Attributes; 026 import org.xml.sax.SAXException; 027 028 import com.ibm.icu.lang.UCharacter; 029 import com.ibm.icu.text.Normalizer; 030 import com.ibm.icu.text.UnicodeSet; 031 032 /** 033 * Checks that the following constructs do not start with a composing character: 034 * <ul> 035 * <li>Local names of elements 036 * <li>Local names of attributes 037 * <li>Attribute values 038 * <li>Declared namespace prefixes 039 * <li>Declared namespace URIs 040 * <li>PI targets 041 * <li>PI data 042 * <li>Concatenations of consecutive character data between element 043 * boundaries and PIs ignoring comments and CDATA section boundaries. 044 * </ul> 045 * <p>Checks that the following constructs are in the Unicode Normalization 046 * Form C. (It is assumed the normalization of the rest of the constructs 047 * is enforced by other means, such as checking the document source for 048 * normalization.) 049 * <ul> 050 * <li>Attribute values 051 * <li>PI data 052 * <li>Concatenations of consecutive character data between element 053 * boundaries and PIs ignoring comments and CDATA section boundaries. 054 * </ul> 055 * <p>All <code>String</code>s must be valid UTF-16! 056 * <p>This class can also be used as a source code mode where the source 057 * code of the document is fed to <code>characters()</code>. The mode 058 * modifies the error messages appropriately. 059 * 060 * @version $Id: NormalizationChecker.java,v 1.6 2006/12/01 12:34:31 hsivonen Exp $ 061 * @author hsivonen 062 */ 063 public final class NormalizationChecker extends Checker { 064 065 /** 066 * A thread-safe set of composing characters as per Charmod Norm. 067 */ 068 @SuppressWarnings("deprecation") 069 private static final UnicodeSet COMPOSING_CHARACTERS = (UnicodeSet) new UnicodeSet( 070 "[[:nfc_qc=maybe:][:^ccc=0:]]").freeze(); 071 // see http://sourceforge.net/mailarchive/message.php?msg_id=37279908 072 073 /** 074 * A buffer for holding sequences overlap the SAX buffer boundary. 075 */ 076 private char[] buf = new char[128]; 077 078 /** 079 * A holder for the original buffer (for the memory leak prevention 080 * mechanism). 081 */ 082 private char[] bufHolder = null; 083 084 /** 085 * The current used length of the buffer, i.e. the index of the first slot 086 * that does not hold current data. 087 */ 088 private int pos; 089 090 /** 091 * Indicates whether the checker the next call to <code>characters()</code> 092 * is the first call in a run. 093 */ 094 private boolean atStartOfRun; 095 096 /** 097 * Indicates whether the current run has already caused an error. 098 */ 099 private boolean alreadyComplainedAboutThisRun; 100 101 /** 102 * Indicates whether error messages related to source code checking should 103 * be used. 104 */ 105 private final boolean sourceTextMode; 106 107 /** 108 * Returns <code>true</code> if the argument is a composing BMP character 109 * or a surrogate and <code>false</code> otherwise. 110 * 111 * @param c a UTF-16 code unit 112 * @return <code>true</code> if the argument is a composing BMP character 113 * or a surrogate and <code>false</code> otherwise 114 */ 115 private static boolean isComposingCharOrSurrogate(char c) { 116 if (UCharacter.isHighSurrogate(c) || UCharacter.isLowSurrogate(c)) { 117 return true; 118 } 119 return isComposingChar(c); 120 } 121 122 /** 123 * Returns <code>true</code> if the argument is a composing character 124 * and <code>false</code> otherwise. 125 * 126 * @param c a Unicode code point 127 * @return <code>true</code> if the argument is a composing character 128 * <code>false</code> otherwise 129 */ 130 private static boolean isComposingChar(int c) { 131 return COMPOSING_CHARACTERS.contains(c); 132 } 133 134 /** 135 * Returns <code>true</code> if the argument starts with a composing 136 * character and <code>false</code> otherwise. 137 * 138 * @param str a string 139 * @return <code>true</code> if the argument starts with a composing 140 * character and <code>false</code> otherwise. 141 * @throws SAXException on malformed UTF-16 142 */ 143 public static boolean startsWithComposingChar(String str) 144 throws SAXException { 145 if (str.length() == 0) { 146 return false; 147 } 148 int first32; 149 char first = str.charAt(0); 150 if (UCharacter.isHighSurrogate(first)) { 151 try { 152 char second = str.charAt(1); 153 first32 = UCharacter.getCodePoint(first, second); 154 } catch (StringIndexOutOfBoundsException e) { 155 throw new SAXException("Malformed UTF-16!"); 156 } 157 } else { 158 first32 = first; 159 } 160 return isComposingChar(first32); 161 } 162 163 /** 164 * Constructor for non-source mode. 165 */ 166 public NormalizationChecker() { 167 this(false); 168 } 169 170 /** 171 * Constructor with mode selection. 172 * 173 * @param sourceTextMode whether the source text-related messages 174 * should be enabled. 175 */ 176 public NormalizationChecker(boolean sourceTextMode) { 177 super(); 178 this.sourceTextMode = sourceTextMode; 179 reset(); 180 } 181 182 /** 183 * @see fi.iki.hsivonen.xml.checker.Checker#reset() 184 */ 185 public void reset() { 186 atStartOfRun = true; 187 alreadyComplainedAboutThisRun = false; 188 pos = 0; 189 if (bufHolder != null) { 190 // restore the original small buffer to avoid leaking 191 // memory if this checker is recycled 192 buf = bufHolder; 193 bufHolder = null; 194 } 195 } 196 197 /** 198 * In the normal mode, this method has the usual SAX semantics. In the 199 * source text mode, this method is used for reporting the source text. 200 * 201 * @see fi.iki.hsivonen.xml.checker.Checker#characters(char[], int, int) 202 */ 203 public void characters(char[] ch, int start, int length) 204 throws SAXException { 205 if (alreadyComplainedAboutThisRun) { 206 return; 207 } 208 if (atStartOfRun) { 209 char c = ch[start]; 210 if (pos == 1) { 211 // there's a single high surrogate in buf 212 if (isComposingChar(UCharacter.getCodePoint(buf[0], c))) { 213 err("Text run starts with a composing character."); 214 } 215 atStartOfRun = false; 216 } else { 217 if (length == 1 && UCharacter.isHighSurrogate(c)) { 218 buf[0] = c; 219 pos = 1; 220 return; 221 } else { 222 if (UCharacter.isHighSurrogate(c)) { 223 if (isComposingChar(UCharacter.getCodePoint(c, 224 ch[start + 1]))) { 225 err("Text run starts with a composing character."); 226 } 227 } else { 228 if (isComposingCharOrSurrogate(c)) { 229 err("Text run starts with a composing character."); 230 } 231 } 232 atStartOfRun = false; 233 } 234 } 235 } 236 int i = start; 237 int stop = start + length; 238 if (pos > 0) { 239 // there's stuff in buf 240 while (i < stop && isComposingCharOrSurrogate(ch[i])) { 241 i++; 242 } 243 appendToBuf(ch, start, i); 244 if (i == stop) { 245 return; 246 } else { 247 if (!Normalizer.isNormalized(buf, 0, pos, Normalizer.NFC, 0)) { 248 errAboutTextRun(); 249 } 250 pos = 0; 251 } 252 } 253 if (i < stop) { 254 start = i; 255 i = stop - 1; 256 while (i > start && isComposingCharOrSurrogate(ch[i])) { 257 i--; 258 } 259 if (i > start) { 260 if (!Normalizer.isNormalized(ch, start, i, Normalizer.NFC, 0)) { 261 errAboutTextRun(); 262 } 263 } 264 appendToBuf(ch, i, stop); 265 } 266 } 267 268 /** 269 * Emits an error stating that the current text run or the source 270 * text is not in NFC. 271 * 272 * @throws SAXException if the <code>ErrorHandler</code> throws 273 */ 274 private void errAboutTextRun() throws SAXException { 275 if (sourceTextMode) { 276 err("Source text is not in Unicode Normalization Form C."); 277 } else { 278 err("Text run is not in Unicode Normalization Form C."); 279 } 280 alreadyComplainedAboutThisRun = true; 281 } 282 283 /** 284 * Appends a slice of an UTF-16 code unit array to the internal 285 * buffer. 286 * 287 * @param ch the array from which to copy 288 * @param start the index of the first element that is copied 289 * @param end the index of the first element that is not copied 290 */ 291 private void appendToBuf(char[] ch, int start, int end) { 292 if (start == end) { 293 return; 294 } 295 int neededBufLen = pos + (end - start); 296 if (neededBufLen > buf.length) { 297 char[] newBuf = new char[neededBufLen]; 298 System.arraycopy(buf, 0, newBuf, 0, pos); 299 if (bufHolder == null) { 300 bufHolder = buf; // keep the original around 301 } 302 buf = newBuf; 303 } 304 System.arraycopy(ch, start, buf, pos, end - start); 305 pos += (end - start); 306 } 307 308 /** 309 * @see fi.iki.hsivonen.xml.checker.Checker#endElement(java.lang.String, 310 * java.lang.String, java.lang.String) 311 */ 312 public void endElement(String uri, String localName, String qName) 313 throws SAXException { 314 flush(); 315 } 316 317 /** 318 * @see fi.iki.hsivonen.xml.checker.Checker#processingInstruction(java.lang.String, 319 * java.lang.String) 320 */ 321 public void processingInstruction(String target, String data) 322 throws SAXException { 323 flush(); 324 if (!"".equals(target)) { 325 if (startsWithComposingChar(target)) { 326 err("Processing instruction target starts with a composing character."); 327 } 328 } 329 if (!"".equals(data)) { 330 if (startsWithComposingChar(data)) { 331 err("Processing instruction data starts with a composing character."); 332 } else if (!Normalizer.isNormalized(data, Normalizer.NFC, 0)) { 333 err("Processing instruction data in not in Unicode Normalization Form C."); 334 } 335 } 336 } 337 338 /** 339 * @see fi.iki.hsivonen.xml.checker.Checker#startElement(java.lang.String, 340 * java.lang.String, java.lang.String, org.xml.sax.Attributes) 341 */ 342 public void startElement(String uri, String localName, String qName, 343 Attributes atts) throws SAXException { 344 flush(); 345 if (startsWithComposingChar(localName)) { 346 err("Element name \u201C " + localName 347 + "\u201D starts with a composing character."); 348 } 349 350 int len = atts.getLength(); 351 for (int i = 0; i < len; i++) { 352 String name = atts.getLocalName(i); 353 if (startsWithComposingChar(name)) { 354 err("Attribute name \u201C " + localName 355 + "\u201D starts with a composing character."); 356 } 357 358 String value = atts.getValue(i); 359 if (!"".equals(value)) { 360 if (startsWithComposingChar(value)) { 361 err("The value of attribute \u201C" 362 + atts.getLocalName(i) 363 + "\u201D" 364 + ("".equals(atts.getURI(i)) ? "" 365 : " in namespace \u201C" + atts.getURI(i) 366 + "\u201D") + " on element \u201C" 367 + localName + "\u201D from namespace \u201C" + uri 368 + "\u201D starts with a composing character."); 369 } else if (!Normalizer.isNormalized(value, Normalizer.NFC, 0)) { 370 err("The value of attribute \u201C" 371 + atts.getLocalName(i) 372 + "\u201D" 373 + ("".equals(atts.getURI(i)) ? "" 374 : " in namespace \u201C" + atts.getURI(i) 375 + "\u201D") + " on element \u201C" 376 + localName + "\u201D from namespace \u201C" + uri 377 + "\u201D is not in Unicode Normalization Form C."); 378 } 379 } 380 } 381 } 382 383 /** 384 * @see fi.iki.hsivonen.xml.checker.Checker#startPrefixMapping(java.lang.String, java.lang.String) 385 */ 386 public void startPrefixMapping(String prefix, String uri) 387 throws SAXException { 388 if (startsWithComposingChar(prefix)) { 389 err("Namespace prefix \u201C " + prefix 390 + "\u201D starts with a composing character."); 391 } 392 if (startsWithComposingChar(uri)) { 393 err("Namespace URI \u201C " + uri 394 + "\u201D starts with a composing character."); 395 } 396 } 397 398 /** 399 * Called to indicate the end of a run of characters. When this class is 400 * used for checking source text, this method should be called after all 401 * the calls to <code>characters()</code>. 402 * 403 * @throws SAXException if the <code>ErrorHandler</code> throws. 404 */ 405 public void flush() throws SAXException { 406 if (!alreadyComplainedAboutThisRun 407 && !Normalizer.isNormalized(buf, 0, pos, Normalizer.NFC, 0)) { 408 errAboutTextRun(); 409 } 410 reset(); 411 } 412 413 }