001 /*
002 * Copyright (c) 2006 Henri Sivonen
003 *
004 * Permission is hereby granted, free of charge, to any person obtaining a
005 * copy of this software and associated documentation files (the "Software"),
006 * to deal in the Software without restriction, including without limitation
007 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
008 * and/or sell copies of the Software, and to permit persons to whom the
009 * Software is furnished to do so, subject to the following conditions:
010 *
011 * The above copyright notice and this permission notice shall be included in
012 * all copies or substantial portions of the Software.
013 *
014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
020 * DEALINGS IN THE SOFTWARE.
021 */
022
023 package org.whattf.datatype;
024
025 import java.io.IOException;
026 import java.util.Arrays;
027 import java.util.regex.Pattern;
028
029 import org.relaxng.datatype.DatatypeException;
030 import org.whattf.datatype.data.LanguageData;
031
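/**
 * Datatype for language tags. {@link #checkValid(CharSequence)} accepts
 * grandfathered tags, private-use sequences, and tags composed of language,
 * script, region and variant subtags.
 *
 * @version $Id: Language.java,v 1.5 2006/11/18 11:51:44 hsivonen Exp $
 * @author hsivonen
 */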
037 public final class Language extends AbstractDatatype {
038
039 /**
040 * The singleton instance.
041 */
042 public static final Language THE_INSTANCE = new Language();
043
044 private static final Pattern HYPHEN = Pattern.compile("-");
045
046 private static String[] languages = null;
047
048 private static String[] scripts = null;
049
050 private static String[] regions = null;
051
052 private static String[] variants = null;
053
054 private static int[] suppressedScriptByLanguage = null;
055
056 private static String[][] prefixesByVariant = null;
057
058 static {
059 try {
060 LanguageData data = new LanguageData();
061 languages = data.getLanguages();
062 scripts = data.getScripts();
063 regions = data.getRegions();
064 variants = data.getScripts();
065 suppressedScriptByLanguage = data.getSuppressedScriptByLanguage();
066 prefixesByVariant = data.getPrefixesByVariant();
067 } catch (IOException e) {
068 throw new RuntimeException(e);
069 }
070 }
071
072 /**
073 * List extracted from http://www.iana.org/assignments/language-tags on
074 * 2006-04-13. List dated 2005-09-09.
075 */
076 private static final String[] GRANDFATHERED = { "art-lojban", // deprecated
077 "az-arab", "az-cyrl", "az-latn", "be-latn", "bs-cyrl", "bs-latn",
078 "cel-gaulish", "de-1901", "de-1996", "de-at-1901", "de-at-1996",
079 "de-ch-1901", "de-ch-1996", "de-de-1901", "de-de-1996", "en-boont",
080 "en-gb-oed", "en-scouse", "es-419", "i-ami", "i-bnn", "i-default", // inappropriate
081 // for
082 // HTML5?
083 "i-enochian", "i-hak", // deprecated
084 "i-klingon", // deprecated
085 "i-lux", // deprecated
086 "i-mingo", "i-navajo", // deprecated
087 "i-pwn", "i-tao", "i-tay", "i-tsu", "iu-cans", "iu-latn",
088 "mn-cyrl", "mn-mong", "no-bok", // deprecated
089 "no-nyn", // deprecated
090 "sgn-be-fr", "sgn-be-nl", "sgn-br", "sgn-ch-de", "sgn-co",
091 "sgn-de", "sgn-dk", "sgn-es", "sgn-fr", "sgn-gb", "sgn-gr",
092 "sgn-ie", "sgn-it", "sgn-jp", "sgn-mx", "sgn-ni", "sgn-nl",
093 "sgn-no", "sgn-pt", "sgn-se", "sgn-us", "sgn-za", "sl-nedis",
094 "sl-rozaj", "sr-cyrl", "sr-latn", "tg-arab", "tg-cyrl", "uz-cyrl",
095 "uz-latn", "yi-latn", "zh-cmn", "zh-cmn-hans", "zh-cmn-hant",
096 "zh-gan", "zh-guoyu", // deprecated
097 "zh-hakka", "zh-hans", "zh-hans-cn", "zh-hans-hk", "zh-hans-mo",
098 "zh-hans-sg", "zh-hans-tw", "zh-hant", "zh-hant-cn", "zh-hant-hk",
099 "zh-hant-mo", "zh-hant-sg", "zh-hant-tw", "zh-min", "zh-min-nan",
100 "zh-wuu", "zh-xiang", "zh-yue" };
101
102 /**
103 * Package-private constructor
104 */
105 private Language() {
106 super();
107 }
108
109 public void checkValid(CharSequence lit)
110 throws DatatypeException {
111 String literal = lit.toString();
112 if (literal.length() == 0) {
113 throw new DatatypeException(
114 "The empty string is not a valid language tag.");
115 }
116 literal = toAsciiLowerCase(literal);
117 if (isGrandfathered(literal)) {
118 return;
119 }
120 if (literal.startsWith("-")) {
121 throw new DatatypeException(
122 "Language tag must not start with HYPHEN-MINUS.");
123 }
124 if (literal.endsWith("-")) {
125 throw new DatatypeException(
126 "Language tag must not end with HYPHEN-MINUS.");
127 }
128 String[] subtags = HYPHEN.split(literal);
129 int i = 0;
130 String subtag = subtags[i];
131 int len = subtag.length();
132 if ("x".equals(subtag)) {
133 checkPrivateUse(i, subtags);
134 return;
135 }
136 if ((len == 2 || len == 3) && isLowerCaseAlpha(subtag)) {
137 if (!isLanguage(subtag)) {
138 throw new DatatypeException(
139 "Bad ISO language part in language tag");
140 }
141 i++;
142 subtag = subtags[i];
143 len = subtag.length();
144 if (len == 3) {
145 throw new DatatypeException(
146 "Found reserved language extension subtag.");
147 }
148 } else if (len == 4 && isLowerCaseAlpha(subtag)) {
149 throw new DatatypeException("Found reserved language tag.");
150 } else if (len == 5 && isLowerCaseAlpha(subtag)) {
151 if (!isLanguage(subtag)) {
152 throw new DatatypeException(
153 "Bad IANA language part in language tag");
154 }
155 i++;
156 subtag = subtags[i];
157 len = subtag.length();
158 }
159 if ("x".equals(subtag)) {
160 checkPrivateUse(i, subtags);
161 return;
162 }
163 if (subtag.length() == 4) {
164 if (!isScript(subtag)) {
165 throw new DatatypeException("Bad script subtag");
166 }
167 i++;
168 subtag = subtags[i];
169 len = subtag.length();
170 }
171 if ((len == 3 && isDigit(subtag))
172 || (len == 2 && isLowerCaseAlpha(subtag))) {
173 if (!isRegion(subtag)) {
174 throw new DatatypeException("Bad region subtag");
175 }
176 i++;
177 subtag = subtags[i];
178 len = subtag.length();
179 }
180 while (i < subtags.length) {
181 if ("x".equals(subtag)) {
182 checkPrivateUse(i, subtags);
183 return;
184 }
185 // cutting corners here a bit
186 if (len == 1) {
187 throw new DatatypeException("Unknown extension.");
188 } else {
189 if (!isVariant(subtag)) {
190 throw new DatatypeException("Bad variant subtag");
191 }
192 }
193 i++;
194 subtag = subtags[i];
195 len = subtag.length();
196 }
197 }
198
199 private boolean isVariant(String subtag) {
200 // TODO Auto-generated method stub
201 return false;
202 }
203
204 private boolean isRegion(String subtag) {
205 // TODO Auto-generated method stub
206 return false;
207 }
208
209 private boolean isScript(String subtag) {
210 // TODO Auto-generated method stub
211 return false;
212 }
213
214 private boolean isLanguage(String language) {
215 // TODO Auto-generated method stub
216 return false;
217 }
218
219 private void checkPrivateUse(int i, String[] subtags)
220 throws DatatypeException {
221 int len = subtags.length;
222 i++;
223 if (i == len) {
224 throw new DatatypeException("No subtags in private use sequence.");
225 }
226 while (i < len) {
227 String subtag = subtags[i];
228 if (subtag.length() < 1) {
229 throw new DatatypeException("Zero-length private use subtag.");
230 }
231 if (subtag.length() > 8) {
232 throw new DatatypeException("Private use subtag too long.");
233 }
234 if (!isLowerCaseAlphaNumeric(subtag)) {
235 throw new DatatypeException(
236 "Bad character in private use subtag.");
237 }
238 i++;
239 }
240 }
241
242 private final boolean isLowerCaseAlphaNumeric(char c) {
243 return isLowerCaseAlpha(c) || isDigit(c);
244 }
245
246 private final boolean isLowerCaseAlphaNumeric(String str) {
247 for (int i = 0; i < str.length(); i++) {
248 if (!isLowerCaseAlphaNumeric(str.charAt(i))) {
249 return false;
250 }
251 }
252 return true;
253 }
254
255 /**
256 * @param c
257 * @return
258 */
259 private final boolean isDigit(char c) {
260 return (c >= '0' && c <= '9');
261 }
262
263 private final boolean isDigit(String str) {
264 for (int i = 0; i < str.length(); i++) {
265 if (!isDigit(str.charAt(i))) {
266 return false;
267 }
268 }
269 return true;
270 }
271
272 /**
273 * @param c
274 * @return
275 */
276 private final boolean isLowerCaseAlpha(char c) {
277 return (c >= 'a' && c <= 'z');
278 }
279
280 private final boolean isLowerCaseAlpha(String str) {
281 for (int i = 0; i < str.length(); i++) {
282 if (!isLowerCaseAlpha(str.charAt(i))) {
283 return false;
284 }
285 }
286 return true;
287 }
288
289 private boolean isGrandfathered(String literal) {
290 return Arrays.binarySearch(GRANDFATHERED, literal) > -1;
291 }
292
293 }