/*
 * Copyright (c) 2003, 2004 Henri Sivonen
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 */

package fi.iki.hsivonen.xml;

import gnu.xml.pipeline.EventConsumer;
import gnu.xml.pipeline.EventFilter;

import java.util.Arrays;

import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;

/**
 * SAX pipeline filter that forwards only whitelisted XHTML elements and
 * attributes to the next stage.
 *
 * <p>Start and end events are passed on only for elements in the XHTML
 * namespace whose local name appears in one of the element tables below.
 * On forwarded elements, every attribute that is not whitelisted is removed.
 * Ignorable whitespace is discarded and a skipped entity is reported as an
 * error. This class does not override any other callback.
 *
 * @author hsivonen
 */
public class XHTMLCruftDropper extends EventFilter {

    /** Namespace URI of XHTML elements (and, optionally, of their attributes). */
    private static final String XHTML_NS = "http://www.w3.org/1999/xhtml";

    /** Namespace URI of the reserved <code>xml:</code> attributes. */
    private static final String XML_NS = "http://www.w3.org/XML/1998/namespace";

    /*
     * All four tables are searched with Arrays.binarySearch and therefore
     * MUST stay sorted in natural String order. Insert new entries at the
     * right position; do not append them.
     */

    /**
     * Attributes accepted on every forwarded element.
     *
     * NOTE(review): "onmouseover" is absent although the other mouse event
     * attributes are listed -- confirm whether that is intentional.
     */
    private static final String[] NORMAL_ATTRS = {
        "accesskey", "cite", "class", "datetime", "dir", "id", "lang",
        "onblur", "onclick", "ondblclick", "onfocus", "onkeydown",
        "onkeypress", "onkeyup", "onmousedown", "onmousemove", "onmouseout",
        "onmouseup", "style", "tabindex", "title"
    };

    /**
     * Elements on which only {@link #NORMAL_ATTRS} are accepted.
     *
     * NOTE(review): "input" is listed here, so attributes such as "type",
     * "name" and "value" are stripped from it -- confirm that this is the
     * intended classification.
     */
    private static final String[] NORMAL_ELTS = {
        "abbr", "acronym", "address", "b", "big", "blockquote", "br",
        "caption", "cite", "code", "dd", "del", "dfn", "div", "dl", "dt",
        "em", "fieldset", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "i",
        "input", "ins", "kbd", "li", "noscript", "ol", "p", "pre", "q",
        "samp", "small", "span", "strong", "sub", "sup", "tt", "ul", "var"
    };

    /**
     * Elements on which {@link #SPECIAL_ATTRS} are accepted in addition to
     * {@link #NORMAL_ATTRS}.
     */
    private static final String[] SPECIAL_ELTS = {
        "a", "bdo", "body", "button", "col", "colgroup", "form", "head",
        "html", "img", "label", "link", "map", "meta", "object", "script",
        "select", "style", "table", "tbody", "td", "textarea", "tfoot", "th",
        "thead", "title", "tr"
    };

    /**
     * Additional attributes accepted on {@link #SPECIAL_ELTS}. The table is
     * shared by all special elements; it is not checked per element.
     *
     * NOTE(review): "cellpadding" is listed but "cellspacing" is not --
     * confirm whether that is intentional.
     */
    private static final String[] SPECIAL_ATTRS = {
        "abbr", "accept", "accept-charset", "accesskey", "action", "align",
        "alt", "archive", "axis", "border", "cellpadding", "char", "charoff",
        "charset", "checked", "classid", "codebase", "cols", "colspan",
        "content", "coords", "data", "declare", "defer", "dir", "disabled",
        "enctype", "for", "frame", "headers", "height", "href", "hreflang",
        "http-equiv", "id", "ismap", "label", "longdesc", "maxlength",
        "media", "method", "multiple", "name", "nohref", "onblur", "onchange",
        "onfocus", "onload", "onreset", "onselect", "onsubmit", "onunload",
        "readonly", "rel", "rev", "rows", "rowspan", "rules", "scheme",
        "scope", "selected", "shape", "size", "span", "src", "standby",
        "summary", "tabindex", "type", "usemap", "valign", "value",
        "valuetype", "width"
    };

    /**
     * Creates a new instance of XHTMLCruftDropper.
     *
     * @param next the consumer handed to the {@link EventFilter} constructor
     */
    public XHTMLCruftDropper(EventConsumer next) {
        super(next);
        // Install this object as the filter's content handler.
        setContentHandler(this);
    }

    /**
     * Forwards the end tag of a whitelisted XHTML element and drops every
     * other end tag. The test is the same one {@link #startElement} applies,
     * so start and end events stay balanced.
     *
     * @param uri namespace URI of the element
     * @param local local name of the element
     * @param qname qualified name of the element
     * @throws SAXException if the superclass implementation throws it
     */
    public void endElement(String uri, String local, String qname)
        throws SAXException {
        if (!XHTML_NS.equals(uri)) {
            return;
        }
        if (contains(NORMAL_ELTS, local) || contains(SPECIAL_ELTS, local)) {
            super.endElement(uri, local, qname);
        }
    }

    /**
     * Discards ignorable whitespace; nothing is forwarded.
     *
     * @param values character buffer (unused)
     * @param param start offset into the buffer (unused)
     * @param param2 number of characters (unused)
     */
    public void ignorableWhitespace(char[] values, int param, int param2) {
        // Intentionally empty: ignorable whitespace is dropped.
    }

    /**
     * Rejects documents that contain an entity the parser skipped.
     *
     * @param str name of the skipped entity
     * @throws SAXException always
     */
    public void skippedEntity(String str) throws SAXException {
        throw new SAXException("unresolved entity: " + str);
    }

    /**
     * Forwards the start tag of a whitelisted XHTML element with its
     * attributes reduced to the permitted ones. Elements outside the XHTML
     * namespace, or not present in either element table, produce no event.
     *
     * @param uri namespace URI of the element
     * @param local local name of the element
     * @param qname qualified name of the element
     * @param attributes attributes reported by the parser; not modified
     * @throws SAXException if the superclass implementation throws it
     */
    public void startElement(
        String uri,
        String local,
        String qname,
        Attributes attributes)
        throws SAXException {
        if (!XHTML_NS.equals(uri)) {
            return;
        }
        if (contains(NORMAL_ELTS, local)) {
            super.startElement(
                uri, local, qname, filterAttributes(attributes, false));
        } else if (contains(SPECIAL_ELTS, local)) {
            super.startElement(
                uri, local, qname, filterAttributes(attributes, true));
        }
    }

    /** Returns a copy of <code>attributes</code> holding only the permitted ones, in their original order. */
    private static AttributesImpl filterAttributes(
        Attributes attributes,
        boolean allowSpecial) {
        AttributesImpl kept = new AttributesImpl();
        int length = attributes.getLength();
        for (int i = 0; i < length; i++) {
            String ns = attributes.getURI(i);
            String name = attributes.getLocalName(i);
            if (isAllowedAttribute(ns, name, allowSpecial)) {
                kept.addAttribute(
                    ns,
                    name,
                    attributes.getQName(i),
                    attributes.getType(i),
                    attributes.getValue(i));
            }
        }
        return kept;
    }

    /** Tells whether the attribute with the given namespace and local name may be forwarded. */
    private static boolean isAllowedAttribute(
        String ns,
        String name,
        boolean allowSpecial) {
        if (XML_NS.equals(ns)) {
            // Only xml:lang and xml:space are accepted from the XML namespace.
            return "lang".equals(name) || "space".equals(name);
        }
        if (!"".equals(ns) && !XHTML_NS.equals(ns)) {
            // Attributes from any other namespace are cruft.
            return false;
        }
        return contains(NORMAL_ATTRS, name)
            || (allowSpecial && contains(SPECIAL_ATTRS, name));
    }

    /** Binary-searches a sorted table; a null key is reported as absent instead of raising an exception. */
    private static boolean contains(String[] sortedTable, String key) {
        return key != null && Arrays.binarySearch(sortedTable, key) >= 0;
    }
}