001 /* 002 * Copyright (c) 2005 Henri Sivonen 003 * 004 * Permission is hereby granted, free of charge, to any person obtaining a 005 * copy of this software and associated documentation files (the "Software"), 006 * to deal in the Software without restriction, including without limitation 007 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 008 * and/or sell copies of the Software, and to permit persons to whom the 009 * Software is furnished to do so, subject to the following conditions: 010 * 011 * The above copyright notice and this permission notice shall be included in 012 * all copies or substantial portions of the Software. 013 * 014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 020 * DEALINGS IN THE SOFTWARE. 021 */ 022 023 package fi.iki.hsivonen.htmlparser; 024 025 import java.util.Arrays; 026 027 /** 028 * @version $Id: Entities.java,v 1.3 2006/10/30 20:03:10 hsivonen Exp $ 029 * @author hsivonen 030 */ 031 public final class Entities { 032 private final static String[] NAMES = { "AElig", "Aacute", "Acirc", 033 "Agrave", "Alpha", "Aring", "Atilde", "Auml", "Beta", "Ccedil", 034 "Chi", "Dagger", "Delta", "ETH", "Eacute", "Ecirc", "Egrave", 035 "Epsilon", "Eta", "Euml", "Gamma", "Iacute", "Icirc", "Igrave", 036 "Iota", "Iuml", "Kappa", "Lambda", "Mu", "Ntilde", "Nu", "OElig", 037 "Oacute", "Ocirc", "Ograve", "Omega", "Omicron", "Oslash", 038 "Otilde", "Ouml", "Phi", "Pi", "Prime", "Psi", "Rho", "Scaron", 039 "Sigma", "THORN", "Tau", "Theta", "Uacute", "Ucirc", "Ugrave", 040 "Upsilon", "Uuml", "Xi", "Yacute", "Yuml", "Zeta", "aacute", 041 "acirc", "acute", "aelig", "agrave", "alefsym", "alpha", "amp", 042 "and", "ang", "aring", "asymp", "atilde", "auml", "bdquo", "beta", 043 "brvbar", "bull", "cap", "ccedil", "cedil", "cent", "chi", "circ", 044 "clubs", "cong", "copy", "crarr", "cup", "curren", "dArr", 045 "dagger", "darr", "deg", "delta", "diams", "divide", "eacute", 046 "ecirc", "egrave", "empty", "emsp", "ensp", "epsilon", "equiv", 047 "eta", "eth", "euml", "euro", "exist", "fnof", "forall", "frac12", 048 "frac14", "frac34", "frasl", "gamma", "ge", "gt", "hArr", "harr", 049 "hearts", "hellip", "iacute", "icirc", "iexcl", "igrave", "image", 050 "infin", "int", "iota", "iquest", "isin", "iuml", "kappa", "lArr", 051 "lambda", "lang", "laquo", "larr", "lceil", "ldquo", "le", 052 "lfloor", "lowast", "loz", "lrm", "lsaquo", "lsquo", "lt", "macr", 053 "mdash", "micro", "middot", "minus", "mu", "nabla", "nbsp", 054 "ndash", "ne", "ni", "not", "notin", "nsub", "ntilde", "nu", 055 "oacute", "ocirc", "oelig", "ograve", "oline", "omega", "omicron", 056 "oplus", "or", "ordf", "ordm", "oslash", "otilde", "otimes", 057 "ouml", "para", "part", "permil", "perp", "phi", "pi", "piv", 058 "plusmn", "pound", "prime", "prod", "prop", "psi", "quot", "rArr", 059 "radic", "rang", "raquo", "rarr", "rceil", "rdquo", "real", "reg", 060 "rfloor", "rho", "rlm", "rsaquo", "rsquo", "sbquo", "scaron", 061 "sdot", "sect", "shy", "sigma", "sigmaf", "sim", "spades", "sub", 062 "sube", "sum", "sup", "sup1", "sup2", "sup3", "supe", "szlig", 063 "tau", "there4", "theta", "thetasym", "thinsp", "thorn", "tilde", 064 "times", "trade", "uArr", "uacute", "uarr", "ucirc", "ugrave", 065 "uml", "upsih", "upsilon", "uuml", "weierp", "xi", "yacute", "yen", 066 "yuml", "zeta", "zwj", "zwnj" }; 067 068 private final static char[][] VALUES = { { '\u00c6' }, { '\u00c1' }, 069 { '\u00c2' }, { '\u00c0' }, { '\u0391' }, { '\u00c5' }, 070 { '\u00c3' }, { '\u00c4' }, { '\u0392' }, { '\u00c7' }, 071 { '\u03a7' }, { '\u2021' }, { '\u0394' }, { '\u00d0' }, 072 { '\u00c9' }, { '\u00ca' }, { '\u00c8' }, { '\u0395' }, 073 { '\u0397' }, { '\u00cb' }, { '\u0393' }, { '\u00cd' }, 074 { '\u00ce' }, { '\u00cc' }, { '\u0399' }, { '\u00cf' }, 075 { '\u039a' }, { '\u039b' }, { '\u039c' }, { '\u00d1' }, 076 { '\u039d' }, { '\u0152' }, { '\u00d3' }, { '\u00d4' }, 077 { '\u00d2' }, { '\u03a9' }, { '\u039f' }, { '\u00d8' }, 078 { '\u00d5' }, { '\u00d6' }, { '\u03a6' }, { '\u03a0' }, 079 { '\u2033' }, { '\u03a8' }, { '\u03a1' }, { '\u0160' }, 080 { '\u03a3' }, { '\u00de' }, { '\u03a4' }, { '\u0398' }, 081 { '\u00da' }, { '\u00db' }, { '\u00d9' }, { '\u03a5' }, 082 { '\u00dc' }, { '\u039e' }, { '\u00dd' }, { '\u0178' }, 083 { '\u0396' }, { '\u00e1' }, { '\u00e2' }, { '\u00b4' }, 084 { '\u00e6' }, { '\u00e0' }, { '\u2135' }, { '\u03b1' }, 085 { '\u0026' }, { '\u2227' }, { '\u2220' }, { '\u00e5' }, 086 { '\u2248' }, { '\u00e3' }, { '\u00e4' }, { '\u201e' }, 087 { '\u03b2' }, { '\u00a6' }, { '\u2022' }, { '\u2229' }, 088 { '\u00e7' }, { '\u00b8' }, { '\u00a2' }, { '\u03c7' }, 089 { '\u02c6' }, { '\u2663' }, { '\u2245' }, { '\u00a9' }, 090 { '\u21b5' }, { '\u222a' }, { '\u00a4' }, { '\u21d3' }, 091 { '\u2020' }, { '\u2193' }, { '\u00b0' }, { '\u03b4' }, 092 { '\u2666' }, { '\u00f7' }, { '\u00e9' }, { '\u00ea' }, 093 { '\u00e8' }, { '\u2205' }, { '\u2003' }, { '\u2002' }, 094 { '\u03b5' }, { '\u2261' }, { '\u03b7' }, { '\u00f0' }, 095 { '\u00eb' }, { '\u20ac' }, { '\u2203' }, { '\u0192' }, 096 { '\u2200' }, { '\u00bd' }, { '\u00bc' }, { '\u00be' }, 097 { '\u2044' }, { '\u03b3' }, { '\u2265' }, { '\u003e' }, 098 { '\u21d4' }, { '\u2194' }, { '\u2665' }, { '\u2026' }, 099 { '\u00ed' }, { '\u00ee' }, { '\u00a1' }, { '\u00ec' }, 100 { '\u2111' }, { '\u221e' }, { '\u222b' }, { '\u03b9' }, 101 { '\u00bf' }, { '\u2208' }, { '\u00ef' }, { '\u03ba' }, 102 { '\u21d0' }, { '\u03bb' }, { '\u2329' }, { '\u00ab' }, 103 { '\u2190' }, { '\u2308' }, { '\u201c' }, { '\u2264' }, 104 { '\u230a' }, { '\u2217' }, { '\u25ca' }, { '\u200e' }, 105 { '\u2039' }, { '\u2018' }, { '\u003c' }, { '\u00af' }, 106 { '\u2014' }, { '\u00b5' }, { '\u00b7' }, { '\u2212' }, 107 { '\u03bc' }, { '\u2207' }, { '\u00a0' }, { '\u2013' }, 108 { '\u2260' }, { '\u220b' }, { '\u00ac' }, { '\u2209' }, 109 { '\u2284' }, { '\u00f1' }, { '\u03bd' }, { '\u00f3' }, 110 { '\u00f4' }, { '\u0153' }, { '\u00f2' }, { '\u203e' }, 111 { '\u03c9' }, { '\u03bf' }, { '\u2295' }, { '\u2228' }, 112 { '\u00aa' }, { '\u00ba' }, { '\u00f8' }, { '\u00f5' }, 113 { '\u2297' }, { '\u00f6' }, { '\u00b6' }, { '\u2202' }, 114 { '\u2030' }, { '\u22a5' }, { '\u03c6' }, { '\u03c0' }, 115 { '\u03d6' }, { '\u00b1' }, { '\u00a3' }, { '\u2032' }, 116 { '\u220f' }, { '\u221d' }, { '\u03c8' }, { '\u0022' }, 117 { '\u21d2' }, { '\u221a' }, { '\u232a' }, { '\u00bb' }, 118 { '\u2192' }, { '\u2309' }, { '\u201d' }, { '\u211c' }, 119 { '\u00ae' }, { '\u230b' }, { '\u03c1' }, { '\u200f' }, 120 { '\u203a' }, { '\u2019' }, { '\u201a' }, { '\u0161' }, 121 { '\u22c5' }, { '\u00a7' }, { '\u00ad' }, { '\u03c3' }, 122 { '\u03c2' }, { '\u223c' }, { '\u2660' }, { '\u2282' }, 123 { '\u2286' }, { '\u2211' }, { '\u2283' }, { '\u00b9' }, 124 { '\u00b2' }, { '\u00b3' }, { '\u2287' }, { '\u00df' }, 125 { '\u03c4' }, { '\u2234' }, { '\u03b8' }, { '\u03d1' }, 126 { '\u2009' }, { '\u00fe' }, { '\u02dc' }, { '\u00d7' }, 127 { '\u2122' }, { '\u21d1' }, { '\u00fa' }, { '\u2191' }, 128 { '\u00fb' }, { '\u00f9' }, { '\u00a8' }, { '\u03d2' }, 129 { '\u03c5' }, { '\u00fc' }, { '\u2118' }, { '\u03be' }, 130 { '\u00fd' }, { '\u00a5' }, { '\u00ff' }, { '\u03b6' }, 131 { '\u200d' }, { '\u200c' } }; 132 133 public static final char[] resolve(String entity) { 134 int i = Arrays.binarySearch(NAMES, entity); 135 if (i < 0) { 136 return null; 137 } else { 138 return VALUES[i]; 139 } 140 } 141 142 private static final String[] NAMES_5 = { "AElig", "AMP", "Aacute", 143 "Acirc", "Agrave", "Alpha", "Aring", "Atilde", "Auml", "Beta", 144 "COPY", "Ccedil", "Chi", "Dagger", "Delta", "ETH", "Eacute", 145 "Ecirc", "Egrave", "Epsilon", "Eta", "Euml", "GT", "Gamma", 146 "Iacute", "Icirc", "Igrave", "Iota", "Iuml", "Kappa", "LT", 147 "Lambda", "Mu", "Ntilde", "Nu", "OElig", "Oacute", "Ocirc", 148 "Ograve", "Omega", "Omicron", "Oslash", "Otilde", "Ouml", "Phi", 149 "Pi", "Prime", "Psi", "QUOT", "REG", "Rho", "Scaron", "Sigma", 150 "THORN", "Tau", "Theta", "Uacute", "Ucirc", "Ugrave", "Upsilon", 151 "Uuml", "Xi", "Yacute", "Yuml", "Zeta", "aacute", "acirc", "acute", 152 "aelig", "agrave", "alefsym", "alpha", "amp", "and", "ang", 153 "aring", "asymp", "atilde", "auml", "bdquo", "beta", "brvbar", 154 "bull", "cap", "ccedil", "cedil", "cent", "chi", "circ", "clubs", 155 "cong", "copy", "crarr", "cup", "curren", "dArr", "dagger", "darr", 156 "deg", "delta", "diams", "divide", "eacute", "ecirc", "egrave", 157 "empty", "emsp", "ensp", "epsilon", "equiv", "eta", "eth", "euml", 158 "euro", "exist", "fnof", "forall", "frac12", "frac14", "frac34", 159 "frasl", "gamma", "ge", "gt", "hArr", "harr", "hearts", "hellip", 160 "iacute", "icirc", "iexcl", "igrave", "image", "infin", "int", 161 "iota", "iquest", "isin", "iuml", "kappa", "lArr", "lambda", 162 "lang", "laquo", "larr", "lceil", "ldquo", "le", "lfloor", 163 "lowast", "loz", "lrm", "lsaquo", "lsquo", "lt", "macr", "mdash", 164 "micro", "middot", "minus", "mu", "nabla", "nbsp", "ndash", "ne", 165 "ni", "not", "notin", "nsub", "ntilde", "nu", "oacute", "ocirc", 166 "oelig", "ograve", "oline", "omega", "omicron", "oplus", "or", 167 "ordf", "ordm", "oslash", "otilde", "otimes", "ouml", "para", 168 "part", "permil", "perp", "phi", "pi", "piv", "plusmn", "pound", 169 "prime", "prod", "prop", "psi", "quot", "rArr", "radic", "rang", 170 "raquo", "rarr", "rceil", "rdquo", "real", "reg", "rfloor", "rho", 171 "rlm", "rsaquo", "rsquo", "sbquo", "scaron", "sdot", "sect", "shy", 172 "sigma", "sigmaf", "sim", "spades", "sub", "sube", "sum", "sup", 173 "sup1", "sup2", "sup3", "supe", "szlig", "tau", "there4", "theta", 174 "thetasym", "thinsp", "thorn", "tilde", "times", "trade", "uArr", 175 "uacute", "uarr", "ucirc", "ugrave", "uml", "upsih", "upsilon", 176 "uuml", "weierp", "xi", "yacute", "yen", "yuml", "zeta", "zwj", 177 "zwnj" }; 178 179 private final static char[][] VALUES_5 = { { '\u00C6' }, { '\u0026' }, 180 { '\u00C1' }, { '\u00C2' }, { '\u00C0' }, { '\u0391' }, 181 { '\u00C5' }, { '\u00C3' }, { '\u00C4' }, { '\u0392' }, 182 { '\u00A9' }, { '\u00C7' }, { '\u03A7' }, { '\u2021' }, 183 { '\u0394' }, { '\u00D0' }, { '\u00C9' }, { '\u00CA' }, 184 { '\u00C8' }, { '\u0395' }, { '\u0397' }, { '\u00CB' }, 185 { '\u003E' }, { '\u0393' }, { '\u00CD' }, { '\u00CE' }, 186 { '\u00CC' }, { '\u0399' }, { '\u00CF' }, { '\u039A' }, 187 { '\u003C' }, { '\u039B' }, { '\u039C' }, { '\u00D1' }, 188 { '\u039D' }, { '\u0152' }, { '\u00D3' }, { '\u00D4' }, 189 { '\u00D2' }, { '\u03A9' }, { '\u039F' }, { '\u00D8' }, 190 { '\u00D5' }, { '\u00D6' }, { '\u03A6' }, { '\u03A0' }, 191 { '\u2033' }, { '\u03A8' }, { '\u0022' }, { '\u00AE' }, 192 { '\u03A1' }, { '\u0160' }, { '\u03A3' }, { '\u00DE' }, 193 { '\u03A4' }, { '\u0398' }, { '\u00DA' }, { '\u00DB' }, 194 { '\u00D9' }, { '\u03A5' }, { '\u00DC' }, { '\u039E' }, 195 { '\u00DD' }, { '\u0178' }, { '\u0396' }, { '\u00E1' }, 196 { '\u00E2' }, { '\u00B4' }, { '\u00E6' }, { '\u00E0' }, 197 { '\u2135' }, { '\u03B1' }, { '\u0026' }, { '\u2227' }, 198 { '\u2220' }, { '\u00E5' }, { '\u2248' }, { '\u00E3' }, 199 { '\u00E4' }, { '\u201E' }, { '\u03B2' }, { '\u00A6' }, 200 { '\u2022' }, { '\u2229' }, { '\u00E7' }, { '\u00B8' }, 201 { '\u00A2' }, { '\u03C7' }, { '\u02C6' }, { '\u2663' }, 202 { '\u2245' }, { '\u00A9' }, { '\u21B5' }, { '\u222A' }, 203 { '\u00A4' }, { '\u21D3' }, { '\u2020' }, { '\u2193' }, 204 { '\u00B0' }, { '\u03B4' }, { '\u2666' }, { '\u00F7' }, 205 { '\u00E9' }, { '\u00EA' }, { '\u00E8' }, { '\u2205' }, 206 { '\u2003' }, { '\u2002' }, { '\u03B5' }, { '\u2261' }, 207 { '\u03B7' }, { '\u00F0' }, { '\u00EB' }, { '\u20AC' }, 208 { '\u2203' }, { '\u0192' }, { '\u2200' }, { '\u00BD' }, 209 { '\u00BC' }, { '\u00BE' }, { '\u2044' }, { '\u03B3' }, 210 { '\u2265' }, { '\u003E' }, { '\u21D4' }, { '\u2194' }, 211 { '\u2665' }, { '\u2026' }, { '\u00ED' }, { '\u00EE' }, 212 { '\u00A1' }, { '\u00EC' }, { '\u2111' }, { '\u221E' }, 213 { '\u222B' }, { '\u03B9' }, { '\u00BF' }, { '\u2208' }, 214 { '\u00EF' }, { '\u03BA' }, { '\u21D0' }, { '\u03BB' }, 215 { '\u2329' }, { '\u00AB' }, { '\u2190' }, { '\u2308' }, 216 { '\u201C' }, { '\u2264' }, { '\u230A' }, { '\u2217' }, 217 { '\u25CA' }, { '\u200E' }, { '\u2039' }, { '\u2018' }, 218 { '\u003C' }, { '\u00AF' }, { '\u2014' }, { '\u00B5' }, 219 { '\u00B7' }, { '\u2212' }, { '\u03BC' }, { '\u2207' }, 220 { '\u00A0' }, { '\u2013' }, { '\u2260' }, { '\u220B' }, 221 { '\u00AC' }, { '\u2209' }, { '\u2284' }, { '\u00F1' }, 222 { '\u03BD' }, { '\u00F3' }, { '\u00F4' }, { '\u0153' }, 223 { '\u00F2' }, { '\u203E' }, { '\u03C9' }, { '\u03BF' }, 224 { '\u2295' }, { '\u2228' }, { '\u00AA' }, { '\u00BA' }, 225 { '\u00F8' }, { '\u00F5' }, { '\u2297' }, { '\u00F6' }, 226 { '\u00B6' }, { '\u2202' }, { '\u2030' }, { '\u22A5' }, 227 { '\u03C6' }, { '\u03C0' }, { '\u03D6' }, { '\u00B1' }, 228 { '\u00A3' }, { '\u2032' }, { '\u220F' }, { '\u221D' }, 229 { '\u03C8' }, { '\u0022' }, { '\u21D2' }, { '\u221A' }, 230 { '\u232A' }, { '\u00BB' }, { '\u2192' }, { '\u2309' }, 231 { '\u201D' }, { '\u211C' }, { '\u00AE' }, { '\u230B' }, 232 { '\u03C1' }, { '\u200F' }, { '\u203A' }, { '\u2019' }, 233 { '\u201A' }, { '\u0161' }, { '\u22C5' }, { '\u00A7' }, 234 { '\u00AD' }, { '\u03C3' }, { '\u03C2' }, { '\u223C' }, 235 { '\u2660' }, { '\u2282' }, { '\u2286' }, { '\u2211' }, 236 { '\u2283' }, { '\u00B9' }, { '\u00B2' }, { '\u00B3' }, 237 { '\u2287' }, { '\u00DF' }, { '\u03C4' }, { '\u2234' }, 238 { '\u03B8' }, { '\u03D1' }, { '\u2009' }, { '\u00FE' }, 239 { '\u02DC' }, { '\u00D7' }, { '\u2122' }, { '\u21D1' }, 240 { '\u00FA' }, { '\u2191' }, { '\u00FB' }, { '\u00F9' }, 241 { '\u00A8' }, { '\u03D2' }, { '\u03C5' }, { '\u00FC' }, 242 { '\u2118' }, { '\u03BE' }, { '\u00FD' }, { '\u00A5' }, 243 { '\u00FF' }, { '\u03B6' }, { '\u200D' }, { '\u200C' } }; 244 245 public static final char[] resolve5(String entity) { 246 int i = Arrays.binarySearch(NAMES_5, entity); 247 if (i < 0) { 248 return null; 249 } else { 250 return VALUES_5[i]; 251 } 252 } 253 }