001    /*
002     * Copyright (c) 2006 Henri Sivonen
003     *
004     * Permission is hereby granted, free of charge, to any person obtaining a 
005     * copy of this software and associated documentation files (the "Software"), 
006     * to deal in the Software without restriction, including without limitation 
007     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
008     * and/or sell copies of the Software, and to permit persons to whom the 
009     * Software is furnished to do so, subject to the following conditions:
010     *
011     * The above copyright notice and this permission notice shall be included in 
012     * all copies or substantial portions of the Software.
013     *
014     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
015     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
016     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
017     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
018     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
019     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
020     * DEALINGS IN THE SOFTWARE.
021     */
022    
023    package org.whattf.datatype;
024    
025    import java.io.IOException;
026    import java.util.Arrays;
027    import java.util.regex.Pattern;
028    
029    import org.relaxng.datatype.DatatypeException;
030    import org.whattf.datatype.data.LanguageData;
031    
032    /**
033     * 
034     * @version $Id: Language.java,v 1.5 2006/11/18 11:51:44 hsivonen Exp $
035     * @author hsivonen
036     */
037    public final class Language extends AbstractDatatype {
038    
039        /**
040         * The singleton instance.
041         */
042        public static final Language THE_INSTANCE = new Language();
043        
044        private static final Pattern HYPHEN = Pattern.compile("-");
045        
046        private static String[] languages = null;
047        
048        private static String[] scripts = null;
049        
050        private static String[] regions = null;
051        
052        private static String[] variants = null;
053        
054        private static int[] suppressedScriptByLanguage = null;
055        
056        private static String[][] prefixesByVariant = null;
057        
058        static {
059            try {
060                LanguageData data = new LanguageData();
061                languages = data.getLanguages();
062                scripts = data.getScripts();
063                regions = data.getRegions();
064                variants = data.getScripts();
065                suppressedScriptByLanguage = data.getSuppressedScriptByLanguage();
066                prefixesByVariant = data.getPrefixesByVariant();
067            } catch (IOException e) {
068                throw new RuntimeException(e);
069            }
070        }
071        
072        /**
073         * List extracted from http://www.iana.org/assignments/language-tags on
074         * 2006-04-13. List dated 2005-09-09.
075         */
076        private static final String[] GRANDFATHERED = { "art-lojban", // deprecated
077                "az-arab", "az-cyrl", "az-latn", "be-latn", "bs-cyrl", "bs-latn",
078                "cel-gaulish", "de-1901", "de-1996", "de-at-1901", "de-at-1996",
079                "de-ch-1901", "de-ch-1996", "de-de-1901", "de-de-1996", "en-boont",
080                "en-gb-oed", "en-scouse", "es-419", "i-ami", "i-bnn", "i-default", // inappropriate
081                // for
082                // HTML5?
083                "i-enochian", "i-hak", // deprecated
084                "i-klingon", // deprecated
085                "i-lux", // deprecated
086                "i-mingo", "i-navajo", // deprecated
087                "i-pwn", "i-tao", "i-tay", "i-tsu", "iu-cans", "iu-latn",
088                "mn-cyrl", "mn-mong", "no-bok", // deprecated
089                "no-nyn", // deprecated
090                "sgn-be-fr", "sgn-be-nl", "sgn-br", "sgn-ch-de", "sgn-co",
091                "sgn-de", "sgn-dk", "sgn-es", "sgn-fr", "sgn-gb", "sgn-gr",
092                "sgn-ie", "sgn-it", "sgn-jp", "sgn-mx", "sgn-ni", "sgn-nl",
093                "sgn-no", "sgn-pt", "sgn-se", "sgn-us", "sgn-za", "sl-nedis",
094                "sl-rozaj", "sr-cyrl", "sr-latn", "tg-arab", "tg-cyrl", "uz-cyrl",
095                "uz-latn", "yi-latn", "zh-cmn", "zh-cmn-hans", "zh-cmn-hant",
096                "zh-gan", "zh-guoyu", // deprecated
097                "zh-hakka", "zh-hans", "zh-hans-cn", "zh-hans-hk", "zh-hans-mo",
098                "zh-hans-sg", "zh-hans-tw", "zh-hant", "zh-hant-cn", "zh-hant-hk",
099                "zh-hant-mo", "zh-hant-sg", "zh-hant-tw", "zh-min", "zh-min-nan",
100                "zh-wuu", "zh-xiang", "zh-yue" };
101    
102        /**
103         * Package-private constructor
104         */
105        private Language() {
106            super();
107        }
108    
109        public void checkValid(CharSequence lit)
110                throws DatatypeException {
111            String literal = lit.toString();
112            if (literal.length() == 0) {
113                throw new DatatypeException(
114                        "The empty string is not a valid language tag.");
115            }
116            literal = toAsciiLowerCase(literal);
117            if (isGrandfathered(literal)) {
118                return;
119            }
120            if (literal.startsWith("-")) {
121                throw new DatatypeException(
122                        "Language tag must not start with HYPHEN-MINUS.");
123            }
124            if (literal.endsWith("-")) {
125                throw new DatatypeException(
126                        "Language tag must not end with HYPHEN-MINUS.");
127            }
128            String[] subtags = HYPHEN.split(literal);
129            int i = 0;
130            String subtag = subtags[i];
131            int len = subtag.length();
132            if ("x".equals(subtag)) {
133                checkPrivateUse(i, subtags);
134                return;
135            }
136            if ((len == 2 || len == 3) && isLowerCaseAlpha(subtag)) {
137                if (!isLanguage(subtag)) {
138                    throw new DatatypeException(
139                            "Bad ISO language part in language tag");
140                }
141                i++;
142                subtag = subtags[i];
143                len = subtag.length();
144                if (len == 3) {
145                    throw new DatatypeException(
146                            "Found reserved language extension subtag.");
147                }
148            } else if (len == 4 && isLowerCaseAlpha(subtag)) {
149                throw new DatatypeException("Found reserved language tag.");
150            } else if (len == 5 && isLowerCaseAlpha(subtag)) {
151                if (!isLanguage(subtag)) {
152                    throw new DatatypeException(
153                            "Bad IANA language part in language tag");
154                }
155                i++;
156                subtag = subtags[i];
157                len = subtag.length();
158            }
159            if ("x".equals(subtag)) {
160                checkPrivateUse(i, subtags);
161                return;
162            }
163            if (subtag.length() == 4) {
164                if (!isScript(subtag)) {
165                    throw new DatatypeException("Bad script subtag");
166                }
167                i++;
168                subtag = subtags[i];
169                len = subtag.length();
170            }
171            if ((len == 3 && isDigit(subtag))
172                    || (len == 2 && isLowerCaseAlpha(subtag))) {
173                if (!isRegion(subtag)) {
174                    throw new DatatypeException("Bad region subtag");
175                }
176                i++;
177                subtag = subtags[i];
178                len = subtag.length();
179            }
180            while (i < subtags.length) {
181                if ("x".equals(subtag)) {
182                    checkPrivateUse(i, subtags);
183                    return;
184                }
185                // cutting corners here a bit
186                if (len == 1) {
187                    throw new DatatypeException("Unknown extension.");
188                } else {
189                    if (!isVariant(subtag)) {
190                        throw new DatatypeException("Bad variant subtag");
191                    }
192                }
193                i++;
194                subtag = subtags[i];
195                len = subtag.length();
196            }
197        }
198    
199        private boolean isVariant(String subtag) {
200            // TODO Auto-generated method stub
201            return false;
202        }
203    
204        private boolean isRegion(String subtag) {
205            // TODO Auto-generated method stub
206            return false;
207        }
208    
209        private boolean isScript(String subtag) {
210            // TODO Auto-generated method stub
211            return false;
212        }
213    
214        private boolean isLanguage(String language) {
215            // TODO Auto-generated method stub
216            return false;
217        }
218    
219        private void checkPrivateUse(int i, String[] subtags)
220                throws DatatypeException {
221            int len = subtags.length;
222            i++;
223            if (i == len) {
224                throw new DatatypeException("No subtags in private use sequence.");
225            }
226            while (i < len) {
227                String subtag = subtags[i];
228                if (subtag.length() < 1) {
229                    throw new DatatypeException("Zero-length private use subtag.");
230                }
231                if (subtag.length() > 8) {
232                    throw new DatatypeException("Private use subtag too long.");
233                }
234                if (!isLowerCaseAlphaNumeric(subtag)) {
235                    throw new DatatypeException(
236                            "Bad character in private use subtag.");
237                }
238                i++;
239            }
240        }
241    
242        private final boolean isLowerCaseAlphaNumeric(char c) {
243            return isLowerCaseAlpha(c) || isDigit(c);
244        }
245    
246        private final boolean isLowerCaseAlphaNumeric(String str) {
247            for (int i = 0; i < str.length(); i++) {
248                if (!isLowerCaseAlphaNumeric(str.charAt(i))) {
249                    return false;
250                }
251            }
252            return true;
253        }
254    
255        /**
256         * @param c
257         * @return
258         */
259        private final boolean isDigit(char c) {
260            return (c >= '0' && c <= '9');
261        }
262    
263        private final boolean isDigit(String str) {
264            for (int i = 0; i < str.length(); i++) {
265                if (!isDigit(str.charAt(i))) {
266                    return false;
267                }
268            }
269            return true;
270        }
271    
272        /**
273         * @param c
274         * @return
275         */
276        private final boolean isLowerCaseAlpha(char c) {
277            return (c >= 'a' && c <= 'z');
278        }
279    
280        private final boolean isLowerCaseAlpha(String str) {
281            for (int i = 0; i < str.length(); i++) {
282                if (!isLowerCaseAlpha(str.charAt(i))) {
283                    return false;
284                }
285            }
286            return true;
287        }
288    
289        private boolean isGrandfathered(String literal) {
290            return Arrays.binarySearch(GRANDFATHERED, literal) > -1;
291        }
292    
293    }