001    /*
002     * Copyright (c) 2005 Henri Sivonen
003     *
004     * Permission is hereby granted, free of charge, to any person obtaining a 
005     * copy of this software and associated documentation files (the "Software"), 
006     * to deal in the Software without restriction, including without limitation 
007     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
008     * and/or sell copies of the Software, and to permit persons to whom the 
009     * Software is furnished to do so, subject to the following conditions:
010     *
011     * The above copyright notice and this permission notice shall be included in 
012     * all copies or substantial portions of the Software.
013     *
014     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
015     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
016     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
017     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
018     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
019     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
020     * DEALINGS IN THE SOFTWARE.
021     */
022    
023    package fi.iki.hsivonen.htmlparser;
024    
025    import java.util.Arrays;
026    
/**
 * Static lookup tables that map named character references (the entity name
 * without the leading ampersand and the trailing semicolon) to the characters
 * they expand to.
 * 
 * <p>Two tables are kept: the one consulted by {@link #resolve(String)} and a
 * superset consulted by {@link #resolve5(String)}, which additionally
 * recognizes the all-caps names <code>AMP</code>, <code>COPY</code>,
 * <code>GT</code>, <code>LT</code>, <code>QUOT</code> and <code>REG</code>.
 * Lookups are case-sensitive.
 * 
 * <p>The tables are never modified after class initialization and callers
 * only ever receive copies of the stored expansions.
 * 
 * @version $Id: Entities.java,v 1.3 2006/10/30 20:03:10 hsivonen Exp $
 * @author hsivonen
 */
public final class Entities {

    /**
     * Entity names accepted by {@link #resolve(String)}. The array MUST stay
     * sorted in <code>String</code> natural order (upper case sorts before
     * lower case) because it is searched with
     * <code>Arrays.binarySearch</code>, and it must stay index-aligned with
     * {@link #VALUES}.
     */
    private static final String[] NAMES = { "AElig", "Aacute", "Acirc",
            "Agrave", "Alpha", "Aring", "Atilde", "Auml", "Beta", "Ccedil",
            "Chi", "Dagger", "Delta", "ETH", "Eacute", "Ecirc", "Egrave",
            "Epsilon", "Eta", "Euml", "Gamma", "Iacute", "Icirc", "Igrave",
            "Iota", "Iuml", "Kappa", "Lambda", "Mu", "Ntilde", "Nu", "OElig",
            "Oacute", "Ocirc", "Ograve", "Omega", "Omicron", "Oslash",
            "Otilde", "Ouml", "Phi", "Pi", "Prime", "Psi", "Rho", "Scaron",
            "Sigma", "THORN", "Tau", "Theta", "Uacute", "Ucirc", "Ugrave",
            "Upsilon", "Uuml", "Xi", "Yacute", "Yuml", "Zeta", "aacute",
            "acirc", "acute", "aelig", "agrave", "alefsym", "alpha", "amp",
            "and", "ang", "aring", "asymp", "atilde", "auml", "bdquo", "beta",
            "brvbar", "bull", "cap", "ccedil", "cedil", "cent", "chi", "circ",
            "clubs", "cong", "copy", "crarr", "cup", "curren", "dArr",
            "dagger", "darr", "deg", "delta", "diams", "divide", "eacute",
            "ecirc", "egrave", "empty", "emsp", "ensp", "epsilon", "equiv",
            "eta", "eth", "euml", "euro", "exist", "fnof", "forall", "frac12",
            "frac14", "frac34", "frasl", "gamma", "ge", "gt", "hArr", "harr",
            "hearts", "hellip", "iacute", "icirc", "iexcl", "igrave", "image",
            "infin", "int", "iota", "iquest", "isin", "iuml", "kappa", "lArr",
            "lambda", "lang", "laquo", "larr", "lceil", "ldquo", "le",
            "lfloor", "lowast", "loz", "lrm", "lsaquo", "lsquo", "lt", "macr",
            "mdash", "micro", "middot", "minus", "mu", "nabla", "nbsp",
            "ndash", "ne", "ni", "not", "notin", "nsub", "ntilde", "nu",
            "oacute", "ocirc", "oelig", "ograve", "oline", "omega", "omicron",
            "oplus", "or", "ordf", "ordm", "oslash", "otilde", "otimes",
            "ouml", "para", "part", "permil", "perp", "phi", "pi", "piv",
            "plusmn", "pound", "prime", "prod", "prop", "psi", "quot", "rArr",
            "radic", "rang", "raquo", "rarr", "rceil", "rdquo", "real", "reg",
            "rfloor", "rho", "rlm", "rsaquo", "rsquo", "sbquo", "scaron",
            "sdot", "sect", "shy", "sigma", "sigmaf", "sim", "spades", "sub",
            "sube", "sum", "sup", "sup1", "sup2", "sup3", "supe", "szlig",
            "tau", "there4", "theta", "thetasym", "thinsp", "thorn", "tilde",
            "times", "trade", "uArr", "uacute", "uarr", "ucirc", "ugrave",
            "uml", "upsih", "upsilon", "uuml", "weierp", "xi", "yacute", "yen",
            "yuml", "zeta", "zwj", "zwnj" };

    /**
     * Expansions for {@link #NAMES}; the element at index <code>i</code> is
     * the expansion of <code>NAMES[i]</code>. Every entry currently holds a
     * single UTF-16 code unit.
     */
    private static final char[][] VALUES = { { '\u00c6' }, { '\u00c1' },
            { '\u00c2' }, { '\u00c0' }, { '\u0391' }, { '\u00c5' },
            { '\u00c3' }, { '\u00c4' }, { '\u0392' }, { '\u00c7' },
            { '\u03a7' }, { '\u2021' }, { '\u0394' }, { '\u00d0' },
            { '\u00c9' }, { '\u00ca' }, { '\u00c8' }, { '\u0395' },
            { '\u0397' }, { '\u00cb' }, { '\u0393' }, { '\u00cd' },
            { '\u00ce' }, { '\u00cc' }, { '\u0399' }, { '\u00cf' },
            { '\u039a' }, { '\u039b' }, { '\u039c' }, { '\u00d1' },
            { '\u039d' }, { '\u0152' }, { '\u00d3' }, { '\u00d4' },
            { '\u00d2' }, { '\u03a9' }, { '\u039f' }, { '\u00d8' },
            { '\u00d5' }, { '\u00d6' }, { '\u03a6' }, { '\u03a0' },
            { '\u2033' }, { '\u03a8' }, { '\u03a1' }, { '\u0160' },
            { '\u03a3' }, { '\u00de' }, { '\u03a4' }, { '\u0398' },
            { '\u00da' }, { '\u00db' }, { '\u00d9' }, { '\u03a5' },
            { '\u00dc' }, { '\u039e' }, { '\u00dd' }, { '\u0178' },
            { '\u0396' }, { '\u00e1' }, { '\u00e2' }, { '\u00b4' },
            { '\u00e6' }, { '\u00e0' }, { '\u2135' }, { '\u03b1' },
            { '\u0026' }, { '\u2227' }, { '\u2220' }, { '\u00e5' },
            { '\u2248' }, { '\u00e3' }, { '\u00e4' }, { '\u201e' },
            { '\u03b2' }, { '\u00a6' }, { '\u2022' }, { '\u2229' },
            { '\u00e7' }, { '\u00b8' }, { '\u00a2' }, { '\u03c7' },
            { '\u02c6' }, { '\u2663' }, { '\u2245' }, { '\u00a9' },
            { '\u21b5' }, { '\u222a' }, { '\u00a4' }, { '\u21d3' },
            { '\u2020' }, { '\u2193' }, { '\u00b0' }, { '\u03b4' },
            { '\u2666' }, { '\u00f7' }, { '\u00e9' }, { '\u00ea' },
            { '\u00e8' }, { '\u2205' }, { '\u2003' }, { '\u2002' },
            { '\u03b5' }, { '\u2261' }, { '\u03b7' }, { '\u00f0' },
            { '\u00eb' }, { '\u20ac' }, { '\u2203' }, { '\u0192' },
            { '\u2200' }, { '\u00bd' }, { '\u00bc' }, { '\u00be' },
            { '\u2044' }, { '\u03b3' }, { '\u2265' }, { '\u003e' },
            { '\u21d4' }, { '\u2194' }, { '\u2665' }, { '\u2026' },
            { '\u00ed' }, { '\u00ee' }, { '\u00a1' }, { '\u00ec' },
            { '\u2111' }, { '\u221e' }, { '\u222b' }, { '\u03b9' },
            { '\u00bf' }, { '\u2208' }, { '\u00ef' }, { '\u03ba' },
            { '\u21d0' }, { '\u03bb' }, { '\u2329' }, { '\u00ab' },
            { '\u2190' }, { '\u2308' }, { '\u201c' }, { '\u2264' },
            { '\u230a' }, { '\u2217' }, { '\u25ca' }, { '\u200e' },
            { '\u2039' }, { '\u2018' }, { '\u003c' }, { '\u00af' },
            { '\u2014' }, { '\u00b5' }, { '\u00b7' }, { '\u2212' },
            { '\u03bc' }, { '\u2207' }, { '\u00a0' }, { '\u2013' },
            { '\u2260' }, { '\u220b' }, { '\u00ac' }, { '\u2209' },
            { '\u2284' }, { '\u00f1' }, { '\u03bd' }, { '\u00f3' },
            { '\u00f4' }, { '\u0153' }, { '\u00f2' }, { '\u203e' },
            { '\u03c9' }, { '\u03bf' }, { '\u2295' }, { '\u2228' },
            { '\u00aa' }, { '\u00ba' }, { '\u00f8' }, { '\u00f5' },
            { '\u2297' }, { '\u00f6' }, { '\u00b6' }, { '\u2202' },
            { '\u2030' }, { '\u22a5' }, { '\u03c6' }, { '\u03c0' },
            { '\u03d6' }, { '\u00b1' }, { '\u00a3' }, { '\u2032' },
            { '\u220f' }, { '\u221d' }, { '\u03c8' }, { '\u0022' },
            { '\u21d2' }, { '\u221a' }, { '\u232a' }, { '\u00bb' },
            { '\u2192' }, { '\u2309' }, { '\u201d' }, { '\u211c' },
            { '\u00ae' }, { '\u230b' }, { '\u03c1' }, { '\u200f' },
            { '\u203a' }, { '\u2019' }, { '\u201a' }, { '\u0161' },
            { '\u22c5' }, { '\u00a7' }, { '\u00ad' }, { '\u03c3' },
            { '\u03c2' }, { '\u223c' }, { '\u2660' }, { '\u2282' },
            { '\u2286' }, { '\u2211' }, { '\u2283' }, { '\u00b9' },
            { '\u00b2' }, { '\u00b3' }, { '\u2287' }, { '\u00df' },
            { '\u03c4' }, { '\u2234' }, { '\u03b8' }, { '\u03d1' },
            { '\u2009' }, { '\u00fe' }, { '\u02dc' }, { '\u00d7' },
            { '\u2122' }, { '\u21d1' }, { '\u00fa' }, { '\u2191' },
            { '\u00fb' }, { '\u00f9' }, { '\u00a8' }, { '\u03d2' },
            { '\u03c5' }, { '\u00fc' }, { '\u2118' }, { '\u03be' },
            { '\u00fd' }, { '\u00a5' }, { '\u00ff' }, { '\u03b6' },
            { '\u200d' }, { '\u200c' } };

    /**
     * Entity names accepted by {@link #resolve5(String)}: every name in
     * {@link #NAMES} plus the all-caps variants <code>AMP</code>,
     * <code>COPY</code>, <code>GT</code>, <code>LT</code>, <code>QUOT</code>
     * and <code>REG</code>. Same ordering and alignment rules as
     * {@link #NAMES}: sorted in <code>String</code> natural order and
     * index-aligned with {@link #VALUES_5}.
     */
    private static final String[] NAMES_5 = { "AElig", "AMP", "Aacute",
            "Acirc", "Agrave", "Alpha", "Aring", "Atilde", "Auml", "Beta",
            "COPY", "Ccedil", "Chi", "Dagger", "Delta", "ETH", "Eacute",
            "Ecirc", "Egrave", "Epsilon", "Eta", "Euml", "GT", "Gamma",
            "Iacute", "Icirc", "Igrave", "Iota", "Iuml", "Kappa", "LT",
            "Lambda", "Mu", "Ntilde", "Nu", "OElig", "Oacute", "Ocirc",
            "Ograve", "Omega", "Omicron", "Oslash", "Otilde", "Ouml", "Phi",
            "Pi", "Prime", "Psi", "QUOT", "REG", "Rho", "Scaron", "Sigma",
            "THORN", "Tau", "Theta", "Uacute", "Ucirc", "Ugrave", "Upsilon",
            "Uuml", "Xi", "Yacute", "Yuml", "Zeta", "aacute", "acirc", "acute",
            "aelig", "agrave", "alefsym", "alpha", "amp", "and", "ang",
            "aring", "asymp", "atilde", "auml", "bdquo", "beta", "brvbar",
            "bull", "cap", "ccedil", "cedil", "cent", "chi", "circ", "clubs",
            "cong", "copy", "crarr", "cup", "curren", "dArr", "dagger", "darr",
            "deg", "delta", "diams", "divide", "eacute", "ecirc", "egrave",
            "empty", "emsp", "ensp", "epsilon", "equiv", "eta", "eth", "euml",
            "euro", "exist", "fnof", "forall", "frac12", "frac14", "frac34",
            "frasl", "gamma", "ge", "gt", "hArr", "harr", "hearts", "hellip",
            "iacute", "icirc", "iexcl", "igrave", "image", "infin", "int",
            "iota", "iquest", "isin", "iuml", "kappa", "lArr", "lambda",
            "lang", "laquo", "larr", "lceil", "ldquo", "le", "lfloor",
            "lowast", "loz", "lrm", "lsaquo", "lsquo", "lt", "macr", "mdash",
            "micro", "middot", "minus", "mu", "nabla", "nbsp", "ndash", "ne",
            "ni", "not", "notin", "nsub", "ntilde", "nu", "oacute", "ocirc",
            "oelig", "ograve", "oline", "omega", "omicron", "oplus", "or",
            "ordf", "ordm", "oslash", "otilde", "otimes", "ouml", "para",
            "part", "permil", "perp", "phi", "pi", "piv", "plusmn", "pound",
            "prime", "prod", "prop", "psi", "quot", "rArr", "radic", "rang",
            "raquo", "rarr", "rceil", "rdquo", "real", "reg", "rfloor", "rho",
            "rlm", "rsaquo", "rsquo", "sbquo", "scaron", "sdot", "sect", "shy",
            "sigma", "sigmaf", "sim", "spades", "sub", "sube", "sum", "sup",
            "sup1", "sup2", "sup3", "supe", "szlig", "tau", "there4", "theta",
            "thetasym", "thinsp", "thorn", "tilde", "times", "trade", "uArr",
            "uacute", "uarr", "ucirc", "ugrave", "uml", "upsih", "upsilon",
            "uuml", "weierp", "xi", "yacute", "yen", "yuml", "zeta", "zwj",
            "zwnj" };

    /**
     * Expansions for {@link #NAMES_5}; the element at index <code>i</code> is
     * the expansion of <code>NAMES_5[i]</code>.
     */
    private static final char[][] VALUES_5 = { { '\u00C6' }, { '\u0026' },
            { '\u00C1' }, { '\u00C2' }, { '\u00C0' }, { '\u0391' },
            { '\u00C5' }, { '\u00C3' }, { '\u00C4' }, { '\u0392' },
            { '\u00A9' }, { '\u00C7' }, { '\u03A7' }, { '\u2021' },
            { '\u0394' }, { '\u00D0' }, { '\u00C9' }, { '\u00CA' },
            { '\u00C8' }, { '\u0395' }, { '\u0397' }, { '\u00CB' },
            { '\u003E' }, { '\u0393' }, { '\u00CD' }, { '\u00CE' },
            { '\u00CC' }, { '\u0399' }, { '\u00CF' }, { '\u039A' },
            { '\u003C' }, { '\u039B' }, { '\u039C' }, { '\u00D1' },
            { '\u039D' }, { '\u0152' }, { '\u00D3' }, { '\u00D4' },
            { '\u00D2' }, { '\u03A9' }, { '\u039F' }, { '\u00D8' },
            { '\u00D5' }, { '\u00D6' }, { '\u03A6' }, { '\u03A0' },
            { '\u2033' }, { '\u03A8' }, { '\u0022' }, { '\u00AE' },
            { '\u03A1' }, { '\u0160' }, { '\u03A3' }, { '\u00DE' },
            { '\u03A4' }, { '\u0398' }, { '\u00DA' }, { '\u00DB' },
            { '\u00D9' }, { '\u03A5' }, { '\u00DC' }, { '\u039E' },
            { '\u00DD' }, { '\u0178' }, { '\u0396' }, { '\u00E1' },
            { '\u00E2' }, { '\u00B4' }, { '\u00E6' }, { '\u00E0' },
            { '\u2135' }, { '\u03B1' }, { '\u0026' }, { '\u2227' },
            { '\u2220' }, { '\u00E5' }, { '\u2248' }, { '\u00E3' },
            { '\u00E4' }, { '\u201E' }, { '\u03B2' }, { '\u00A6' },
            { '\u2022' }, { '\u2229' }, { '\u00E7' }, { '\u00B8' },
            { '\u00A2' }, { '\u03C7' }, { '\u02C6' }, { '\u2663' },
            { '\u2245' }, { '\u00A9' }, { '\u21B5' }, { '\u222A' },
            { '\u00A4' }, { '\u21D3' }, { '\u2020' }, { '\u2193' },
            { '\u00B0' }, { '\u03B4' }, { '\u2666' }, { '\u00F7' },
            { '\u00E9' }, { '\u00EA' }, { '\u00E8' }, { '\u2205' },
            { '\u2003' }, { '\u2002' }, { '\u03B5' }, { '\u2261' },
            { '\u03B7' }, { '\u00F0' }, { '\u00EB' }, { '\u20AC' },
            { '\u2203' }, { '\u0192' }, { '\u2200' }, { '\u00BD' },
            { '\u00BC' }, { '\u00BE' }, { '\u2044' }, { '\u03B3' },
            { '\u2265' }, { '\u003E' }, { '\u21D4' }, { '\u2194' },
            { '\u2665' }, { '\u2026' }, { '\u00ED' }, { '\u00EE' },
            { '\u00A1' }, { '\u00EC' }, { '\u2111' }, { '\u221E' },
            { '\u222B' }, { '\u03B9' }, { '\u00BF' }, { '\u2208' },
            { '\u00EF' }, { '\u03BA' }, { '\u21D0' }, { '\u03BB' },
            { '\u2329' }, { '\u00AB' }, { '\u2190' }, { '\u2308' },
            { '\u201C' }, { '\u2264' }, { '\u230A' }, { '\u2217' },
            { '\u25CA' }, { '\u200E' }, { '\u2039' }, { '\u2018' },
            { '\u003C' }, { '\u00AF' }, { '\u2014' }, { '\u00B5' },
            { '\u00B7' }, { '\u2212' }, { '\u03BC' }, { '\u2207' },
            { '\u00A0' }, { '\u2013' }, { '\u2260' }, { '\u220B' },
            { '\u00AC' }, { '\u2209' }, { '\u2284' }, { '\u00F1' },
            { '\u03BD' }, { '\u00F3' }, { '\u00F4' }, { '\u0153' },
            { '\u00F2' }, { '\u203E' }, { '\u03C9' }, { '\u03BF' },
            { '\u2295' }, { '\u2228' }, { '\u00AA' }, { '\u00BA' },
            { '\u00F8' }, { '\u00F5' }, { '\u2297' }, { '\u00F6' },
            { '\u00B6' }, { '\u2202' }, { '\u2030' }, { '\u22A5' },
            { '\u03C6' }, { '\u03C0' }, { '\u03D6' }, { '\u00B1' },
            { '\u00A3' }, { '\u2032' }, { '\u220F' }, { '\u221D' },
            { '\u03C8' }, { '\u0022' }, { '\u21D2' }, { '\u221A' },
            { '\u232A' }, { '\u00BB' }, { '\u2192' }, { '\u2309' },
            { '\u201D' }, { '\u211C' }, { '\u00AE' }, { '\u230B' },
            { '\u03C1' }, { '\u200F' }, { '\u203A' }, { '\u2019' },
            { '\u201A' }, { '\u0161' }, { '\u22C5' }, { '\u00A7' },
            { '\u00AD' }, { '\u03C3' }, { '\u03C2' }, { '\u223C' },
            { '\u2660' }, { '\u2282' }, { '\u2286' }, { '\u2211' },
            { '\u2283' }, { '\u00B9' }, { '\u00B2' }, { '\u00B3' },
            { '\u2287' }, { '\u00DF' }, { '\u03C4' }, { '\u2234' },
            { '\u03B8' }, { '\u03D1' }, { '\u2009' }, { '\u00FE' },
            { '\u02DC' }, { '\u00D7' }, { '\u2122' }, { '\u21D1' },
            { '\u00FA' }, { '\u2191' }, { '\u00FB' }, { '\u00F9' },
            { '\u00A8' }, { '\u03D2' }, { '\u03C5' }, { '\u00FC' },
            { '\u2118' }, { '\u03BE' }, { '\u00FD' }, { '\u00A5' },
            { '\u00FF' }, { '\u03B6' }, { '\u200D' }, { '\u200C' } };

    /** Not instantiable: this class only hosts static lookup methods. */
    private Entities() {
    }

    /**
     * Resolves an entity name against the base table.
     * 
     * @param entity the case-sensitive entity name without the leading
     *        ampersand and trailing semicolon, e.g. <code>"amp"</code>
     * @return a freshly allocated array holding the expansion, or
     *         <code>null</code> if the name is unknown or <code>null</code>
     */
    public static final char[] resolve(String entity) {
        return lookup(NAMES, VALUES, entity);
    }

    /**
     * Resolves an entity name against the extended table, which accepts
     * everything {@link #resolve(String)} accepts plus the all-caps names
     * <code>AMP</code>, <code>COPY</code>, <code>GT</code>, <code>LT</code>,
     * <code>QUOT</code> and <code>REG</code>.
     * 
     * @param entity the case-sensitive entity name without the leading
     *        ampersand and trailing semicolon, e.g. <code>"AMP"</code>
     * @return a freshly allocated array holding the expansion, or
     *         <code>null</code> if the name is unknown or <code>null</code>
     */
    public static final char[] resolve5(String entity) {
        return lookup(NAMES_5, VALUES_5, entity);
    }

    /**
     * Binary-searches <code>names</code> for <code>entity</code> and returns
     * a copy of the aligned entry of <code>values</code>.
     * 
     * @param names sorted entity names
     * @param values expansions, index-aligned with <code>names</code>
     * @param entity the name to look up; may be <code>null</code>
     * @return a copy of the expansion, or <code>null</code> if there is no
     *         match
     */
    private static char[] lookup(String[] names, char[][] values,
            String entity) {
        // binarySearch would throw NullPointerException when comparing
        // against a null key; treat null like any other unknown name.
        if (entity == null) {
            return null;
        }
        int index = Arrays.binarySearch(names, entity);
        if (index < 0) {
            return null;
        }
        // Hand out a copy so a caller writing into the result cannot corrupt
        // the shared table for every later lookup.
        return (char[]) values[index].clone();
    }
}