/*
 * Copyright (c) 2006 Henri Sivonen
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

package org.whattf.datatype;

import java.io.IOException;
import java.util.Arrays;
import java.util.regex.Pattern;

import org.relaxng.datatype.DatatypeException;
import org.whattf.datatype.data.LanguageData;

/**
 * Datatype that checks whether a literal is an acceptable language tag.
 *
 * <p>A tag is accepted when it is one of the {@link #GRANDFATHERED} tags, a
 * pure private-use tag (<code>x-...</code>), or a sequence of hyphen-separated
 * subtags of the form language, optional script, optional region, zero or
 * more variants and an optional trailing private-use sequence. Comparison is
 * done on the ASCII-lower-cased literal.
 *
 * <p>NOTE(review): {@link #isLanguage(String)}, {@link #isScript(String)},
 * {@link #isRegion(String)} and {@link #isVariant(String)} are still
 * placeholders that always return <code>false</code>, so at present every tag
 * that is neither grandfathered nor private-use-only is rejected.
 *
 * @version $Id: Language.java,v 1.5 2006/11/18 11:51:44 hsivonen Exp $
 * @author hsivonen
 */
public final class Language extends AbstractDatatype {

    /**
     * The singleton instance.
     */
    public static final Language THE_INSTANCE = new Language();

    /** Separator between the subtags of a language tag. */
    private static final Pattern HYPHEN = Pattern.compile("-");

    // Lookup tables loaded once from LanguageData in the static initializer
    // below. None of them is consulted yet because the is*() lookups are
    // still stubs.
    private static final String[] languages;

    private static final String[] scripts;

    private static final String[] regions;

    private static final String[] variants;

    private static final int[] suppressedScriptByLanguage;

    private static final String[][] prefixesByVariant;

    static {
        try {
            LanguageData data = new LanguageData();
            languages = data.getLanguages();
            scripts = data.getScripts();
            regions = data.getRegions();
            // NOTE(review): this assigns the *script* list to 'variants',
            // which looks like a copy-paste slip. LanguageData is not visible
            // from this file, so the call is left untouched; confirm whether
            // a dedicated variants accessor exists and switch to it.
            variants = data.getScripts();
            suppressedScriptByLanguage = data.getSuppressedScriptByLanguage();
            prefixesByVariant = data.getPrefixesByVariant();
        } catch (IOException e) {
            // Loading failure is unrecoverable for this class; keep the cause.
            throw new RuntimeException(e);
        }
    }

    /**
     * List extracted from http://www.iana.org/assignments/language-tags on
     * 2006-04-13. List dated 2005-09-09.
     *
     * <p>The entries are lower case and MUST stay sorted in natural
     * <code>String</code> order, because {@link #isGrandfathered(String)}
     * uses a binary search.
     */
    private static final String[] GRANDFATHERED = {
            "art-lojban", // deprecated
            "az-arab", "az-cyrl", "az-latn", "be-latn", "bs-cyrl", "bs-latn",
            "cel-gaulish", "de-1901", "de-1996", "de-at-1901", "de-at-1996",
            "de-ch-1901", "de-ch-1996", "de-de-1901", "de-de-1996", "en-boont",
            "en-gb-oed", "en-scouse", "es-419", "i-ami", "i-bnn",
            "i-default", // inappropriate for HTML5?
            "i-enochian",
            "i-hak", // deprecated
            "i-klingon", // deprecated
            "i-lux", // deprecated
            "i-mingo",
            "i-navajo", // deprecated
            "i-pwn", "i-tao", "i-tay", "i-tsu", "iu-cans", "iu-latn",
            "mn-cyrl", "mn-mong",
            "no-bok", // deprecated
            "no-nyn", // deprecated
            "sgn-be-fr", "sgn-be-nl", "sgn-br", "sgn-ch-de", "sgn-co",
            "sgn-de", "sgn-dk", "sgn-es", "sgn-fr", "sgn-gb", "sgn-gr",
            "sgn-ie", "sgn-it", "sgn-jp", "sgn-mx", "sgn-ni", "sgn-nl",
            "sgn-no", "sgn-pt", "sgn-se", "sgn-us", "sgn-za", "sl-nedis",
            "sl-rozaj", "sr-cyrl", "sr-latn", "tg-arab", "tg-cyrl", "uz-cyrl",
            "uz-latn", "yi-latn", "zh-cmn", "zh-cmn-hans", "zh-cmn-hant",
            "zh-gan",
            "zh-guoyu", // deprecated
            "zh-hakka", "zh-hans", "zh-hans-cn", "zh-hans-hk", "zh-hans-mo",
            "zh-hans-sg", "zh-hans-tw", "zh-hant", "zh-hant-cn", "zh-hant-hk",
            "zh-hant-mo", "zh-hant-sg", "zh-hant-tw", "zh-min", "zh-min-nan",
            "zh-wuu", "zh-xiang", "zh-yue" };

    /**
     * Private constructor; use {@link #THE_INSTANCE}.
     */
    private Language() {
        super();
    }

    /**
     * Checks that the literal is an acceptable language tag.
     *
     * <p>The literal is lower-cased with <code>toAsciiLowerCase</code> (not
     * defined in this file) before any comparison, so matching is
     * case-insensitive for ASCII letters.
     *
     * @param lit
     *            the literal to check
     * @throws DatatypeException
     *             if the literal is empty, starts or ends with a hyphen, or
     *             contains a subtag that is not acceptable in its position
     */
    public void checkValid(CharSequence lit) throws DatatypeException {
        String literal = lit.toString();
        if (literal.length() == 0) {
            throw new DatatypeException(
                    "The empty string is not a valid language tag.");
        }
        literal = toAsciiLowerCase(literal);
        if (isGrandfathered(literal)) {
            return;
        }
        if (literal.startsWith("-")) {
            throw new DatatypeException(
                    "Language tag must not start with HYPHEN-MINUS.");
        }
        if (literal.endsWith("-")) {
            throw new DatatypeException(
                    "Language tag must not end with HYPHEN-MINUS.");
        }
        // The literal is non-empty and neither starts nor ends with a hyphen,
        // so split() yields at least one subtag.
        String[] subtags = HYPHEN.split(literal);
        int i = 0;
        String subtag = subtags[i];
        int len = subtag.length();
        if ("x".equals(subtag)) {
            checkPrivateUse(i, subtags);
            return;
        }

        // --- primary language subtag (mandatory) ---
        if ((len == 2 || len == 3) && isLowerCaseAlpha(subtag)) {
            if (!isLanguage(subtag)) {
                throw new DatatypeException(
                        "Bad ISO language part in language tag");
            }
            i++;
            if (i == subtags.length) {
                // The tag consists of the language subtag only.
                return;
            }
            subtag = subtags[i];
            len = subtag.length();
            // An extended language subtag is three *letters*; three digits
            // in this position are a region and are handled further down.
            if (len == 3 && isLowerCaseAlpha(subtag)) {
                throw new DatatypeException(
                        "Found reserved language extension subtag.");
            }
        } else if (len == 4 && isLowerCaseAlpha(subtag)) {
            throw new DatatypeException("Found reserved language tag.");
        } else if (len >= 5 && len <= 8 && isLowerCaseAlpha(subtag)) {
            // Registered language subtags are five to eight letters long.
            if (!isLanguage(subtag)) {
                throw new DatatypeException(
                        "Bad IANA language part in language tag");
            }
            i++;
            if (i == subtags.length) {
                return;
            }
            subtag = subtags[i];
            len = subtag.length();
        } else {
            // Neither a language subtag nor "x": do not let the tag fall
            // through and be validated as if it began with a script, region
            // or variant.
            throw new DatatypeException(
                    "Language tag does not start with a valid language subtag.");
        }

        if ("x".equals(subtag)) {
            checkPrivateUse(i, subtags);
            return;
        }

        // --- optional script subtag: exactly four letters ---
        // (A four-character subtag that starts with a digit is a variant and
        // is left for the variant loop.)
        if (len == 4 && isLowerCaseAlpha(subtag)) {
            if (!isScript(subtag)) {
                throw new DatatypeException("Bad script subtag");
            }
            i++;
            if (i == subtags.length) {
                return;
            }
            subtag = subtags[i];
            len = subtag.length();
        }

        // --- optional region subtag: three digits or two letters ---
        if ((len == 3 && isDigit(subtag))
                || (len == 2 && isLowerCaseAlpha(subtag))) {
            if (!isRegion(subtag)) {
                throw new DatatypeException("Bad region subtag");
            }
            i++;
            // No explicit end-of-tag check needed: the loop below re-reads
            // the current subtag only while i is in range.
        }

        // --- variants, then an optional private-use sequence ---
        while (i < subtags.length) {
            subtag = subtags[i];
            if ("x".equals(subtag)) {
                checkPrivateUse(i, subtags);
                return;
            }
            // cutting corners here a bit: extensions (single-character
            // subtags other than "x") are not supported.
            if (subtag.length() == 1) {
                throw new DatatypeException("Unknown extension.");
            }
            if (!isVariant(subtag)) {
                throw new DatatypeException("Bad variant subtag");
            }
            i++;
        }
    }

    /**
     * Placeholder for the variant lookup.
     *
     * @param subtag
     *            the lower-cased candidate subtag
     * @return always <code>false</code> for now
     */
    private boolean isVariant(String subtag) {
        // TODO implement the lookup; currently rejects every variant.
        return false;
    }

    /**
     * Placeholder for the region lookup.
     *
     * @param subtag
     *            the lower-cased candidate subtag
     * @return always <code>false</code> for now
     */
    private boolean isRegion(String subtag) {
        // TODO implement the lookup; currently rejects every region.
        return false;
    }

    /**
     * Placeholder for the script lookup.
     *
     * @param subtag
     *            the lower-cased candidate subtag
     * @return always <code>false</code> for now
     */
    private boolean isScript(String subtag) {
        // TODO implement the lookup; currently rejects every script.
        return false;
    }

    /**
     * Placeholder for the language lookup.
     *
     * @param language
     *            the lower-cased candidate subtag
     * @return always <code>false</code> for now
     */
    private boolean isLanguage(String language) {
        // TODO implement the lookup; currently rejects every language.
        return false;
    }

    /**
     * Validates the private-use sequence that follows an <code>x</code>
     * subtag: at least one subtag must follow, and each must be 1 to 8
     * lower-case ASCII letters or digits.
     *
     * @param i
     *            index of the <code>x</code> subtag within
     *            <code>subtags</code>
     * @param subtags
     *            all subtags of the (lower-cased) language tag
     * @throws DatatypeException
     *             if the sequence is empty or contains a bad subtag
     */
    private void checkPrivateUse(int i, String[] subtags)
            throws DatatypeException {
        int len = subtags.length;
        i++; // step over the "x" itself
        if (i == len) {
            throw new DatatypeException("No subtags in private use sequence.");
        }
        while (i < len) {
            String subtag = subtags[i];
            if (subtag.length() < 1) {
                throw new DatatypeException("Zero-length private use subtag.");
            }
            if (subtag.length() > 8) {
                throw new DatatypeException("Private use subtag too long.");
            }
            if (!isLowerCaseAlphaNumeric(subtag)) {
                throw new DatatypeException(
                        "Bad character in private use subtag.");
            }
            i++;
        }
    }

    /**
     * @param c
     *            the character to test
     * @return <code>true</code> if <code>c</code> is in <code>a-z</code> or
     *         <code>0-9</code>
     */
    private static boolean isLowerCaseAlphaNumeric(char c) {
        return isLowerCaseAlpha(c) || isDigit(c);
    }

    /**
     * @param str
     *            the string to test
     * @return <code>true</code> if every character is in <code>a-z</code> or
     *         <code>0-9</code> (vacuously <code>true</code> for the empty
     *         string)
     */
    private static boolean isLowerCaseAlphaNumeric(String str) {
        for (int i = 0; i < str.length(); i++) {
            if (!isLowerCaseAlphaNumeric(str.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * @param c
     *            the character to test
     * @return <code>true</code> if <code>c</code> is an ASCII digit
     */
    private static boolean isDigit(char c) {
        return (c >= '0' && c <= '9');
    }

    /**
     * @param str
     *            the string to test
     * @return <code>true</code> if every character is an ASCII digit
     *         (vacuously <code>true</code> for the empty string)
     */
    private static boolean isDigit(String str) {
        for (int i = 0; i < str.length(); i++) {
            if (!isDigit(str.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * @param c
     *            the character to test
     * @return <code>true</code> if <code>c</code> is in <code>a-z</code>
     */
    private static boolean isLowerCaseAlpha(char c) {
        return (c >= 'a' && c <= 'z');
    }

    /**
     * @param str
     *            the string to test
     * @return <code>true</code> if every character is in <code>a-z</code>
     *         (vacuously <code>true</code> for the empty string)
     */
    private static boolean isLowerCaseAlpha(String str) {
        for (int i = 0; i < str.length(); i++) {
            if (!isLowerCaseAlpha(str.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /**
     * @param literal
     *            the lower-cased language tag
     * @return <code>true</code> if the tag is listed in
     *         {@link #GRANDFATHERED}
     */
    private boolean isGrandfathered(String literal) {
        // binarySearch returns a negative value when the key is absent.
        return Arrays.binarySearch(GRANDFATHERED, literal) >= 0;
    }

}