001 package fi.iki.hsivonen.io; 002 003 import java.io.ByteArrayInputStream; 004 import java.io.IOException; 005 import java.io.InputStreamReader; 006 import java.io.Reader; 007 import java.nio.charset.Charset; 008 import java.nio.charset.CharsetDecoder; 009 import java.nio.charset.CodingErrorAction; 010 import java.util.Arrays; 011 import java.util.Iterator; 012 import java.util.Map; 013 import java.util.SortedMap; 014 import java.util.SortedSet; 015 import java.util.TreeSet; 016 017 public class EncodingInfo { 018 019 private static String[] NOT_OBSCURE = {"Big5", 020 "Big5-HKSCS", 021 "EUC-JP", 022 "EUC-KR", 023 "GB18030", 024 "GBK", 025 "ISO-2022-JP", 026 "ISO-2022-KR", 027 "ISO-8859-1", 028 "ISO-8859-13", 029 "ISO-8859-15", 030 "ISO-8859-2", 031 "ISO-8859-3", 032 "ISO-8859-4", 033 "ISO-8859-5", 034 "ISO-8859-6", 035 "ISO-8859-7", 036 "ISO-8859-8", 037 "ISO-8859-9", 038 "KOI8-R", 039 "Shift_JIS", 040 "TIS-620", 041 "US-ASCII", 042 "UTF-16", 043 "UTF-16BE", 044 "UTF-16LE", 045 "UTF-8", 046 "windows-1250", 047 "windows-1251", 048 "windows-1252", 049 "windows-1253", 050 "windows-1254", 051 "windows-1255", 052 "windows-1256", 053 "windows-1257", 054 "windows-1258"}; 055 056 private static String[] asciiSuperset; 057 058 private static String[] notAsciiSuperset; 059 060 static { 061 byte[] testBuf = new byte[0x63]; 062 for (int i = 0; i < 0x60; i++) { 063 testBuf[i] = (byte) (i + 0x20); 064 } 065 testBuf[0x60] = (byte) '\n'; 066 testBuf[0x61] = (byte) '\r'; 067 testBuf[0x62] = (byte) '\t'; 068 069 SortedSet<String> asciiSupersetSet = new TreeSet<String>(); 070 SortedSet<String> notAsciiSupersetSet = new TreeSet<String>(); 071 072 SortedMap charsets = Charset.availableCharsets(); 073 for (Iterator iter = charsets.entrySet().iterator(); iter.hasNext();) { 074 Map.Entry entry = (Map.Entry) iter.next(); 075 Charset cs = (Charset) entry.getValue(); 076 if (asciiMapsToBasicLatin(testBuf, cs)) { 077 asciiSupersetSet.add(cs.name().intern()); 078 } else { 079 notAsciiSupersetSet.add(cs.name().intern()); 080 } 081 } 082 083 asciiSuperset = (String[]) asciiSupersetSet.toArray(new String[0]); 084 notAsciiSuperset = (String[]) notAsciiSupersetSet.toArray(new String[0]); 085 } 086 087 public static boolean isAsciiSuperset(String preferredIanaName) { 088 return (Arrays.binarySearch(asciiSuperset, preferredIanaName) > -1); 089 } 090 091 public static boolean isNotAsciiSuperset(String preferredIanaName) { 092 return (Arrays.binarySearch(notAsciiSuperset, preferredIanaName) > -1); 093 } 094 095 public static boolean isObscure(String preferredIanaName) { 096 return !(Arrays.binarySearch(NOT_OBSCURE, preferredIanaName) > -1); 097 } 098 099 /** 100 * @param testBuf 101 * @param cs 102 */ 103 private static boolean asciiMapsToBasicLatin(byte[] testBuf, Charset cs) { 104 CharsetDecoder dec = cs.newDecoder(); 105 dec.onMalformedInput(CodingErrorAction.REPORT); 106 dec.onUnmappableCharacter(CodingErrorAction.REPORT); 107 Reader r = new InputStreamReader(new ByteArrayInputStream(testBuf), dec); 108 try { 109 for (int i = 0; i < 0x60; i++) { 110 if ((i + 0x20) != r.read()) { 111 return false; 112 } 113 } 114 if ('\n' != r.read()) { 115 return false; 116 } 117 if ('\r' != r.read()) { 118 return false; 119 } 120 if ('\t' != r.read()) { 121 return false; 122 } 123 } catch (IOException e) { 124 return false; 125 } catch (Exception e) { 126 return false; 127 } 128 129 return true; 130 } 131 132 public static void main(String[] args) { 133 System.out.println("ASCII maps to Basic Latin:"); 134 for (int i = 0; i < asciiSuperset.length; i++) { 135 System.out.println(asciiSuperset[i]); 136 } 137 System.out.println(); 138 System.out.println("ASCII does not map to Basic Latin:"); 139 for (int i = 0; i < notAsciiSuperset.length; i++) { 140 System.out.println(notAsciiSuperset[i]); 141 } 142 } 143 }