001 /*
002 * Copyright (c) 2005 Henri Sivonen
003 *
004 * Permission is hereby granted, free of charge, to any person obtaining a
005 * copy of this software and associated documentation files (the "Software"),
006 * to deal in the Software without restriction, including without limitation
007 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
008 * and/or sell copies of the Software, and to permit persons to whom the
009 * Software is furnished to do so, subject to the following conditions:
010 *
011 * The above copyright notice and this permission notice shall be included in
012 * all copies or substantial portions of the Software.
013 *
014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
020 * DEALINGS IN THE SOFTWARE.
021 */
022
023 package fi.iki.hsivonen.htmlparser;
024
025 import java.util.Arrays;
026
/**
 * Static lookup tables that map named character references (for example
 * {@code "amp"} or {@code "nbsp"}) to the characters they stand for.
 *
 * <p>Two tables are provided. {@link #resolve(String)} consults the base
 * table; {@link #resolve5(String)} consults a table holding the same entries
 * plus the all-uppercase names {@code AMP}, {@code COPY}, {@code GT},
 * {@code LT}, {@code QUOT} and {@code REG} (presumably the HTML5 additions;
 * confirm against the caller).
 *
 * <p>Lookups are case-sensitive binary searches, so each name array MUST stay
 * sorted in {@link String#compareTo(String)} order and each value array MUST
 * stay index-aligned with its name array.
 *
 * @version $Id: Entities.java,v 1.3 2006/10/30 20:03:10 hsivonen Exp $
 * @author hsivonen
 */
public final class Entities {

    /**
     * Entity names of the base table, sorted in natural {@code String} order
     * (uppercase letters sort before lowercase ones). Searched with
     * {@code Arrays.binarySearch}, so the ordering must be preserved when
     * entries are added.
     */
    private final static String[] NAMES = { "AElig", "Aacute", "Acirc",
            "Agrave", "Alpha", "Aring", "Atilde", "Auml", "Beta", "Ccedil",
            "Chi", "Dagger", "Delta", "ETH", "Eacute", "Ecirc", "Egrave",
            "Epsilon", "Eta", "Euml", "Gamma", "Iacute", "Icirc", "Igrave",
            "Iota", "Iuml", "Kappa", "Lambda", "Mu", "Ntilde", "Nu", "OElig",
            "Oacute", "Ocirc", "Ograve", "Omega", "Omicron", "Oslash",
            "Otilde", "Ouml", "Phi", "Pi", "Prime", "Psi", "Rho", "Scaron",
            "Sigma", "THORN", "Tau", "Theta", "Uacute", "Ucirc", "Ugrave",
            "Upsilon", "Uuml", "Xi", "Yacute", "Yuml", "Zeta", "aacute",
            "acirc", "acute", "aelig", "agrave", "alefsym", "alpha", "amp",
            "and", "ang", "aring", "asymp", "atilde", "auml", "bdquo", "beta",
            "brvbar", "bull", "cap", "ccedil", "cedil", "cent", "chi", "circ",
            "clubs", "cong", "copy", "crarr", "cup", "curren", "dArr",
            "dagger", "darr", "deg", "delta", "diams", "divide", "eacute",
            "ecirc", "egrave", "empty", "emsp", "ensp", "epsilon", "equiv",
            "eta", "eth", "euml", "euro", "exist", "fnof", "forall", "frac12",
            "frac14", "frac34", "frasl", "gamma", "ge", "gt", "hArr", "harr",
            "hearts", "hellip", "iacute", "icirc", "iexcl", "igrave", "image",
            "infin", "int", "iota", "iquest", "isin", "iuml", "kappa", "lArr",
            "lambda", "lang", "laquo", "larr", "lceil", "ldquo", "le",
            "lfloor", "lowast", "loz", "lrm", "lsaquo", "lsquo", "lt", "macr",
            "mdash", "micro", "middot", "minus", "mu", "nabla", "nbsp",
            "ndash", "ne", "ni", "not", "notin", "nsub", "ntilde", "nu",
            "oacute", "ocirc", "oelig", "ograve", "oline", "omega", "omicron",
            "oplus", "or", "ordf", "ordm", "oslash", "otilde", "otimes",
            "ouml", "para", "part", "permil", "perp", "phi", "pi", "piv",
            "plusmn", "pound", "prime", "prod", "prop", "psi", "quot", "rArr",
            "radic", "rang", "raquo", "rarr", "rceil", "rdquo", "real", "reg",
            "rfloor", "rho", "rlm", "rsaquo", "rsquo", "sbquo", "scaron",
            "sdot", "sect", "shy", "sigma", "sigmaf", "sim", "spades", "sub",
            "sube", "sum", "sup", "sup1", "sup2", "sup3", "supe", "szlig",
            "tau", "there4", "theta", "thetasym", "thinsp", "thorn", "tilde",
            "times", "trade", "uArr", "uacute", "uarr", "ucirc", "ugrave",
            "uml", "upsih", "upsilon", "uuml", "weierp", "xi", "yacute", "yen",
            "yuml", "zeta", "zwj", "zwnj" };

    /**
     * Replacement characters for {@link #NAMES}; {@code VALUES[i]} is the
     * expansion of {@code NAMES[i]}.
     */
    private final static char[][] VALUES = { { '\u00c6' }, { '\u00c1' },
            { '\u00c2' }, { '\u00c0' }, { '\u0391' }, { '\u00c5' },
            { '\u00c3' }, { '\u00c4' }, { '\u0392' }, { '\u00c7' },
            { '\u03a7' }, { '\u2021' }, { '\u0394' }, { '\u00d0' },
            { '\u00c9' }, { '\u00ca' }, { '\u00c8' }, { '\u0395' },
            { '\u0397' }, { '\u00cb' }, { '\u0393' }, { '\u00cd' },
            { '\u00ce' }, { '\u00cc' }, { '\u0399' }, { '\u00cf' },
            { '\u039a' }, { '\u039b' }, { '\u039c' }, { '\u00d1' },
            { '\u039d' }, { '\u0152' }, { '\u00d3' }, { '\u00d4' },
            { '\u00d2' }, { '\u03a9' }, { '\u039f' }, { '\u00d8' },
            { '\u00d5' }, { '\u00d6' }, { '\u03a6' }, { '\u03a0' },
            { '\u2033' }, { '\u03a8' }, { '\u03a1' }, { '\u0160' },
            { '\u03a3' }, { '\u00de' }, { '\u03a4' }, { '\u0398' },
            { '\u00da' }, { '\u00db' }, { '\u00d9' }, { '\u03a5' },
            { '\u00dc' }, { '\u039e' }, { '\u00dd' }, { '\u0178' },
            { '\u0396' }, { '\u00e1' }, { '\u00e2' }, { '\u00b4' },
            { '\u00e6' }, { '\u00e0' }, { '\u2135' }, { '\u03b1' },
            { '\u0026' }, { '\u2227' }, { '\u2220' }, { '\u00e5' },
            { '\u2248' }, { '\u00e3' }, { '\u00e4' }, { '\u201e' },
            { '\u03b2' }, { '\u00a6' }, { '\u2022' }, { '\u2229' },
            { '\u00e7' }, { '\u00b8' }, { '\u00a2' }, { '\u03c7' },
            { '\u02c6' }, { '\u2663' }, { '\u2245' }, { '\u00a9' },
            { '\u21b5' }, { '\u222a' }, { '\u00a4' }, { '\u21d3' },
            { '\u2020' }, { '\u2193' }, { '\u00b0' }, { '\u03b4' },
            { '\u2666' }, { '\u00f7' }, { '\u00e9' }, { '\u00ea' },
            { '\u00e8' }, { '\u2205' }, { '\u2003' }, { '\u2002' },
            { '\u03b5' }, { '\u2261' }, { '\u03b7' }, { '\u00f0' },
            { '\u00eb' }, { '\u20ac' }, { '\u2203' }, { '\u0192' },
            { '\u2200' }, { '\u00bd' }, { '\u00bc' }, { '\u00be' },
            { '\u2044' }, { '\u03b3' }, { '\u2265' }, { '\u003e' },
            { '\u21d4' }, { '\u2194' }, { '\u2665' }, { '\u2026' },
            { '\u00ed' }, { '\u00ee' }, { '\u00a1' }, { '\u00ec' },
            { '\u2111' }, { '\u221e' }, { '\u222b' }, { '\u03b9' },
            { '\u00bf' }, { '\u2208' }, { '\u00ef' }, { '\u03ba' },
            { '\u21d0' }, { '\u03bb' }, { '\u2329' }, { '\u00ab' },
            { '\u2190' }, { '\u2308' }, { '\u201c' }, { '\u2264' },
            { '\u230a' }, { '\u2217' }, { '\u25ca' }, { '\u200e' },
            { '\u2039' }, { '\u2018' }, { '\u003c' }, { '\u00af' },
            { '\u2014' }, { '\u00b5' }, { '\u00b7' }, { '\u2212' },
            { '\u03bc' }, { '\u2207' }, { '\u00a0' }, { '\u2013' },
            { '\u2260' }, { '\u220b' }, { '\u00ac' }, { '\u2209' },
            { '\u2284' }, { '\u00f1' }, { '\u03bd' }, { '\u00f3' },
            { '\u00f4' }, { '\u0153' }, { '\u00f2' }, { '\u203e' },
            { '\u03c9' }, { '\u03bf' }, { '\u2295' }, { '\u2228' },
            { '\u00aa' }, { '\u00ba' }, { '\u00f8' }, { '\u00f5' },
            { '\u2297' }, { '\u00f6' }, { '\u00b6' }, { '\u2202' },
            { '\u2030' }, { '\u22a5' }, { '\u03c6' }, { '\u03c0' },
            { '\u03d6' }, { '\u00b1' }, { '\u00a3' }, { '\u2032' },
            { '\u220f' }, { '\u221d' }, { '\u03c8' }, { '\u0022' },
            { '\u21d2' }, { '\u221a' }, { '\u232a' }, { '\u00bb' },
            { '\u2192' }, { '\u2309' }, { '\u201d' }, { '\u211c' },
            { '\u00ae' }, { '\u230b' }, { '\u03c1' }, { '\u200f' },
            { '\u203a' }, { '\u2019' }, { '\u201a' }, { '\u0161' },
            { '\u22c5' }, { '\u00a7' }, { '\u00ad' }, { '\u03c3' },
            { '\u03c2' }, { '\u223c' }, { '\u2660' }, { '\u2282' },
            { '\u2286' }, { '\u2211' }, { '\u2283' }, { '\u00b9' },
            { '\u00b2' }, { '\u00b3' }, { '\u2287' }, { '\u00df' },
            { '\u03c4' }, { '\u2234' }, { '\u03b8' }, { '\u03d1' },
            { '\u2009' }, { '\u00fe' }, { '\u02dc' }, { '\u00d7' },
            { '\u2122' }, { '\u21d1' }, { '\u00fa' }, { '\u2191' },
            { '\u00fb' }, { '\u00f9' }, { '\u00a8' }, { '\u03d2' },
            { '\u03c5' }, { '\u00fc' }, { '\u2118' }, { '\u03be' },
            { '\u00fd' }, { '\u00a5' }, { '\u00ff' }, { '\u03b6' },
            { '\u200d' }, { '\u200c' } };

    /**
     * Resolves an entity name against the base table.
     *
     * @param entity the case-sensitive entity name exactly as it appears in
     *        the table (for example {@code "amp"}); may be {@code null}
     * @return a freshly allocated array holding the replacement character(s),
     *         or {@code null} if {@code entity} is {@code null} or not in
     *         the table
     */
    public static final char[] resolve(String entity) {
        return lookup(NAMES, VALUES, entity);
    }

    /**
     * Entity names of the extended table: every entry of {@link #NAMES} plus
     * the uppercase names {@code AMP}, {@code COPY}, {@code GT}, {@code LT},
     * {@code QUOT} and {@code REG}. Sorted in natural {@code String} order
     * for binary search.
     */
    private static final String[] NAMES_5 = { "AElig", "AMP", "Aacute",
            "Acirc", "Agrave", "Alpha", "Aring", "Atilde", "Auml", "Beta",
            "COPY", "Ccedil", "Chi", "Dagger", "Delta", "ETH", "Eacute",
            "Ecirc", "Egrave", "Epsilon", "Eta", "Euml", "GT", "Gamma",
            "Iacute", "Icirc", "Igrave", "Iota", "Iuml", "Kappa", "LT",
            "Lambda", "Mu", "Ntilde", "Nu", "OElig", "Oacute", "Ocirc",
            "Ograve", "Omega", "Omicron", "Oslash", "Otilde", "Ouml", "Phi",
            "Pi", "Prime", "Psi", "QUOT", "REG", "Rho", "Scaron", "Sigma",
            "THORN", "Tau", "Theta", "Uacute", "Ucirc", "Ugrave", "Upsilon",
            "Uuml", "Xi", "Yacute", "Yuml", "Zeta", "aacute", "acirc", "acute",
            "aelig", "agrave", "alefsym", "alpha", "amp", "and", "ang",
            "aring", "asymp", "atilde", "auml", "bdquo", "beta", "brvbar",
            "bull", "cap", "ccedil", "cedil", "cent", "chi", "circ", "clubs",
            "cong", "copy", "crarr", "cup", "curren", "dArr", "dagger", "darr",
            "deg", "delta", "diams", "divide", "eacute", "ecirc", "egrave",
            "empty", "emsp", "ensp", "epsilon", "equiv", "eta", "eth", "euml",
            "euro", "exist", "fnof", "forall", "frac12", "frac14", "frac34",
            "frasl", "gamma", "ge", "gt", "hArr", "harr", "hearts", "hellip",
            "iacute", "icirc", "iexcl", "igrave", "image", "infin", "int",
            "iota", "iquest", "isin", "iuml", "kappa", "lArr", "lambda",
            "lang", "laquo", "larr", "lceil", "ldquo", "le", "lfloor",
            "lowast", "loz", "lrm", "lsaquo", "lsquo", "lt", "macr", "mdash",
            "micro", "middot", "minus", "mu", "nabla", "nbsp", "ndash", "ne",
            "ni", "not", "notin", "nsub", "ntilde", "nu", "oacute", "ocirc",
            "oelig", "ograve", "oline", "omega", "omicron", "oplus", "or",
            "ordf", "ordm", "oslash", "otilde", "otimes", "ouml", "para",
            "part", "permil", "perp", "phi", "pi", "piv", "plusmn", "pound",
            "prime", "prod", "prop", "psi", "quot", "rArr", "radic", "rang",
            "raquo", "rarr", "rceil", "rdquo", "real", "reg", "rfloor", "rho",
            "rlm", "rsaquo", "rsquo", "sbquo", "scaron", "sdot", "sect", "shy",
            "sigma", "sigmaf", "sim", "spades", "sub", "sube", "sum", "sup",
            "sup1", "sup2", "sup3", "supe", "szlig", "tau", "there4", "theta",
            "thetasym", "thinsp", "thorn", "tilde", "times", "trade", "uArr",
            "uacute", "uarr", "ucirc", "ugrave", "uml", "upsih", "upsilon",
            "uuml", "weierp", "xi", "yacute", "yen", "yuml", "zeta", "zwj",
            "zwnj" };

    /**
     * Replacement characters for {@link #NAMES_5}; {@code VALUES_5[i]} is the
     * expansion of {@code NAMES_5[i]}.
     */
    private final static char[][] VALUES_5 = { { '\u00C6' }, { '\u0026' },
            { '\u00C1' }, { '\u00C2' }, { '\u00C0' }, { '\u0391' },
            { '\u00C5' }, { '\u00C3' }, { '\u00C4' }, { '\u0392' },
            { '\u00A9' }, { '\u00C7' }, { '\u03A7' }, { '\u2021' },
            { '\u0394' }, { '\u00D0' }, { '\u00C9' }, { '\u00CA' },
            { '\u00C8' }, { '\u0395' }, { '\u0397' }, { '\u00CB' },
            { '\u003E' }, { '\u0393' }, { '\u00CD' }, { '\u00CE' },
            { '\u00CC' }, { '\u0399' }, { '\u00CF' }, { '\u039A' },
            { '\u003C' }, { '\u039B' }, { '\u039C' }, { '\u00D1' },
            { '\u039D' }, { '\u0152' }, { '\u00D3' }, { '\u00D4' },
            { '\u00D2' }, { '\u03A9' }, { '\u039F' }, { '\u00D8' },
            { '\u00D5' }, { '\u00D6' }, { '\u03A6' }, { '\u03A0' },
            { '\u2033' }, { '\u03A8' }, { '\u0022' }, { '\u00AE' },
            { '\u03A1' }, { '\u0160' }, { '\u03A3' }, { '\u00DE' },
            { '\u03A4' }, { '\u0398' }, { '\u00DA' }, { '\u00DB' },
            { '\u00D9' }, { '\u03A5' }, { '\u00DC' }, { '\u039E' },
            { '\u00DD' }, { '\u0178' }, { '\u0396' }, { '\u00E1' },
            { '\u00E2' }, { '\u00B4' }, { '\u00E6' }, { '\u00E0' },
            { '\u2135' }, { '\u03B1' }, { '\u0026' }, { '\u2227' },
            { '\u2220' }, { '\u00E5' }, { '\u2248' }, { '\u00E3' },
            { '\u00E4' }, { '\u201E' }, { '\u03B2' }, { '\u00A6' },
            { '\u2022' }, { '\u2229' }, { '\u00E7' }, { '\u00B8' },
            { '\u00A2' }, { '\u03C7' }, { '\u02C6' }, { '\u2663' },
            { '\u2245' }, { '\u00A9' }, { '\u21B5' }, { '\u222A' },
            { '\u00A4' }, { '\u21D3' }, { '\u2020' }, { '\u2193' },
            { '\u00B0' }, { '\u03B4' }, { '\u2666' }, { '\u00F7' },
            { '\u00E9' }, { '\u00EA' }, { '\u00E8' }, { '\u2205' },
            { '\u2003' }, { '\u2002' }, { '\u03B5' }, { '\u2261' },
            { '\u03B7' }, { '\u00F0' }, { '\u00EB' }, { '\u20AC' },
            { '\u2203' }, { '\u0192' }, { '\u2200' }, { '\u00BD' },
            { '\u00BC' }, { '\u00BE' }, { '\u2044' }, { '\u03B3' },
            { '\u2265' }, { '\u003E' }, { '\u21D4' }, { '\u2194' },
            { '\u2665' }, { '\u2026' }, { '\u00ED' }, { '\u00EE' },
            { '\u00A1' }, { '\u00EC' }, { '\u2111' }, { '\u221E' },
            { '\u222B' }, { '\u03B9' }, { '\u00BF' }, { '\u2208' },
            { '\u00EF' }, { '\u03BA' }, { '\u21D0' }, { '\u03BB' },
            { '\u2329' }, { '\u00AB' }, { '\u2190' }, { '\u2308' },
            { '\u201C' }, { '\u2264' }, { '\u230A' }, { '\u2217' },
            { '\u25CA' }, { '\u200E' }, { '\u2039' }, { '\u2018' },
            { '\u003C' }, { '\u00AF' }, { '\u2014' }, { '\u00B5' },
            { '\u00B7' }, { '\u2212' }, { '\u03BC' }, { '\u2207' },
            { '\u00A0' }, { '\u2013' }, { '\u2260' }, { '\u220B' },
            { '\u00AC' }, { '\u2209' }, { '\u2284' }, { '\u00F1' },
            { '\u03BD' }, { '\u00F3' }, { '\u00F4' }, { '\u0153' },
            { '\u00F2' }, { '\u203E' }, { '\u03C9' }, { '\u03BF' },
            { '\u2295' }, { '\u2228' }, { '\u00AA' }, { '\u00BA' },
            { '\u00F8' }, { '\u00F5' }, { '\u2297' }, { '\u00F6' },
            { '\u00B6' }, { '\u2202' }, { '\u2030' }, { '\u22A5' },
            { '\u03C6' }, { '\u03C0' }, { '\u03D6' }, { '\u00B1' },
            { '\u00A3' }, { '\u2032' }, { '\u220F' }, { '\u221D' },
            { '\u03C8' }, { '\u0022' }, { '\u21D2' }, { '\u221A' },
            { '\u232A' }, { '\u00BB' }, { '\u2192' }, { '\u2309' },
            { '\u201D' }, { '\u211C' }, { '\u00AE' }, { '\u230B' },
            { '\u03C1' }, { '\u200F' }, { '\u203A' }, { '\u2019' },
            { '\u201A' }, { '\u0161' }, { '\u22C5' }, { '\u00A7' },
            { '\u00AD' }, { '\u03C3' }, { '\u03C2' }, { '\u223C' },
            { '\u2660' }, { '\u2282' }, { '\u2286' }, { '\u2211' },
            { '\u2283' }, { '\u00B9' }, { '\u00B2' }, { '\u00B3' },
            { '\u2287' }, { '\u00DF' }, { '\u03C4' }, { '\u2234' },
            { '\u03B8' }, { '\u03D1' }, { '\u2009' }, { '\u00FE' },
            { '\u02DC' }, { '\u00D7' }, { '\u2122' }, { '\u21D1' },
            { '\u00FA' }, { '\u2191' }, { '\u00FB' }, { '\u00F9' },
            { '\u00A8' }, { '\u03D2' }, { '\u03C5' }, { '\u00FC' },
            { '\u2118' }, { '\u03BE' }, { '\u00FD' }, { '\u00A5' },
            { '\u00FF' }, { '\u03B6' }, { '\u200D' }, { '\u200C' } };

    /**
     * Resolves an entity name against the extended table, which additionally
     * recognises {@code AMP}, {@code COPY}, {@code GT}, {@code LT},
     * {@code QUOT} and {@code REG}.
     *
     * @param entity the case-sensitive entity name exactly as it appears in
     *        the table (for example {@code "AMP"}); may be {@code null}
     * @return a freshly allocated array holding the replacement character(s),
     *         or {@code null} if {@code entity} is {@code null} or not in
     *         the table
     */
    public static final char[] resolve5(String entity) {
        return lookup(NAMES_5, VALUES_5, entity);
    }

    /**
     * Shared binary-search lookup used by both public resolvers.
     *
     * @param names sorted entity names
     * @param values replacement characters, index-aligned with {@code names}
     * @param entity the name to look up; may be {@code null}
     * @return a copy of the matching value, or {@code null} if there is none
     */
    private static char[] lookup(String[] names, char[][] values,
            String entity) {
        // Arrays.binarySearch would throw NullPointerException on a null key;
        // treat null like any other unknown name instead.
        if (entity == null) {
            return null;
        }
        int index = Arrays.binarySearch(names, entity);
        if (index < 0) {
            return null;
        }
        // Hand out a copy so a caller that writes into the result cannot
        // corrupt the shared static table for every later lookup.
        return values[index].clone();
    }
}