001 /*
002 * Copyright (c) 2003, 2004 Henri Sivonen
003 *
004 * Permission is hereby granted, free of charge, to any person obtaining a
005 * copy of this software and associated documentation files (the "Software"),
006 * to deal in the Software without restriction, including without limitation
007 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
008 * and/or sell copies of the Software, and to permit persons to whom the
009 * Software is furnished to do so, subject to the following conditions:
010 *
011 * The above copyright notice and this permission notice shall be included in
012 * all copies or substantial portions of the Software.
013 *
014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
020 * DEALINGS IN THE SOFTWARE.
021 */
022
023 package fi.iki.hsivonen.xml;
024 import gnu.xml.pipeline.EventConsumer;
025 import gnu.xml.pipeline.EventFilter;
026
027 import java.util.Arrays;
028
029 import org.xml.sax.Attributes;
030 import org.xml.sax.SAXException;
031 import org.xml.sax.helpers.AttributesImpl;
032
033 /**
034 *
035 * @author hsivonen
036 */
037 public class XHTMLCruftDropper extends EventFilter {
038
039 private static final String[] normalAttrs =
040 {
041 "accesskey",
042 "cite",
043 "class",
044 "datetime",
045 "dir",
046 "id",
047 "lang",
048 "onblur",
049 "onclick",
050 "ondblclick",
051 "onfocus",
052 "onkeydown",
053 "onkeypress",
054 "onkeyup",
055 "onmousedown",
056 "onmousemove",
057 "onmouseout",
058 "onmouseup",
059 "style",
060 "tabindex",
061 "title" };
062
063 private static final String[] normalElts =
064 {
065 "abbr",
066 "acronym",
067 "address",
068 "b",
069 "big",
070 "blockquote",
071 "br",
072 "caption",
073 "cite",
074 "code",
075 "dd",
076 "del",
077 "dfn",
078 "div",
079 "dl",
080 "dt",
081 "em",
082 "fieldset",
083 "h1",
084 "h2",
085 "h3",
086 "h4",
087 "h5",
088 "h6",
089 "hr",
090 "i",
091 "input",
092 "ins",
093 "kbd",
094 "li",
095 "noscript",
096 "ol",
097 "p",
098 "pre",
099 "q",
100 "samp",
101 "small",
102 "span",
103 "strong",
104 "sub",
105 "sup",
106 "tt",
107 "ul",
108 "var" };
109
110 private static final String[] specialElts =
111 {
112 "a",
113 "bdo",
114 "body",
115 "button",
116 "col",
117 "colgroup",
118 "form",
119 "head",
120 "html",
121 "img",
122 "label",
123 "link",
124 "map",
125 "meta",
126 "object",
127 "script",
128 "script",
129 "select",
130 "style",
131 "table",
132 "tbody",
133 "td",
134 "textarea",
135 "tfoot",
136 "th",
137 "thead",
138 "title",
139 "tr" };
140
141 private static final String[] specialAttrs =
142 {
143 "abbr",
144 "accept",
145 "accept-charset",
146 "accesskey",
147 "action",
148 "align",
149 "alt",
150 "archive",
151 "axis",
152 "border",
153 "cellpadding",
154 "char",
155 "charoff",
156 "charset",
157 "checked",
158 "classid",
159 "codebase",
160 "cols",
161 "colspan",
162 "content",
163 "coords",
164 "data",
165 "declare",
166 "defer",
167 "dir",
168 "disabled",
169 "enctype",
170 "for",
171 "frame",
172 "headers",
173 "height",
174 "href",
175 "hreflang",
176 "http-equiv",
177 "id",
178 "ismap",
179 "label",
180 "longdesc",
181 "maxlength",
182 "media",
183 "method",
184 "multiple",
185 "name",
186 "nohref",
187 "onblur",
188 "onchange",
189 "onchange",
190 "onfocus",
191 "onload",
192 "onreset",
193 "onselect",
194 "onselect",
195 "onsubmit",
196 "onunload",
197 "readonly",
198 "rel",
199 "rev",
200 "rows",
201 "rowspan",
202 "rules",
203 "scheme",
204 "scope",
205 "selected",
206 "shape",
207 "size",
208 "span",
209 "src",
210 "standby",
211 "summary",
212 "tabindex",
213 "type",
214 "usemap",
215 "valign",
216 "value",
217 "valuetype",
218 "width" };
219
220 /** Creates a new instance of XHTMLStricter */
221 public XHTMLCruftDropper(EventConsumer next) {
222 super(next);
223 setContentHandler(this);
224 }
225
226 public void endElement(String uri, String local, String qname)
227 throws SAXException {
228 if ("http://www.w3.org/1999/xhtml".equals(uri)
229 && (Arrays.binarySearch(normalElts, local) >= 0
230 || Arrays.binarySearch(specialElts, local) >= 0)) {
231 super.endElement(uri, local, qname);
232 }
233 }
234
235 public void ignorableWhitespace(char[] values, int param, int param2) {
236 }
237
238 public void skippedEntity(String str) throws SAXException {
239 throw new SAXException("enresolved entity");
240 }
241
242 public void startElement(
243 String uri,
244 String local,
245 String qname,
246 Attributes attributes)
247 throws SAXException {
248 if ("http://www.w3.org/1999/xhtml".equals(uri)) {
249 if (Arrays.binarySearch(normalElts, local) >= 0) {
250 AttributesImpl attrs = new AttributesImpl(attributes);
251 int i = 0;
252 String attrLocal;
253 String attrNS;
254 while (i < attrs.getLength()) {
255 attrLocal = attrs.getLocalName(i);
256 attrNS = attrs.getURI(i);
257 if (("lang".equals(attrLocal)
258 && "http://www.w3.org/XML/1998/namespace".equals(attrNS))
259 || ("space".equals(attrLocal)
260 && "http://www.w3.org/XML/1998/namespace".equals(
261 attrNS))
262 || (Arrays.binarySearch(normalAttrs, attrLocal) >= 0
263 && ("".equals(attrNS)
264 || "http://www.w3.org/1999/xhtml".equals(
265 attrNS)))) {
266 i++;
267 } else {
268 attrs.removeAttribute(i);
269 }
270 }
271 super.startElement(uri, local, qname, attrs);
272 } else if (Arrays.binarySearch(specialElts, local) >= 0) {
273 AttributesImpl attrs = new AttributesImpl(attributes);
274 int i = 0;
275 String attrLocal;
276 String attrNS;
277 while (i < attrs.getLength()) {
278 attrLocal = attrs.getLocalName(i);
279 attrNS = attrs.getURI(i);
280 if (("lang".equals(attrLocal)
281 && "http://www.w3.org/XML/1998/namespace".equals(attrNS))
282 || ("space".equals(attrLocal)
283 && "http://www.w3.org/XML/1998/namespace".equals(
284 attrNS))
285 || ((Arrays.binarySearch(normalAttrs, attrLocal) >= 0
286 || Arrays.binarySearch(specialAttrs, attrLocal) >= 0)
287 && ("".equals(attrNS)
288 || "http://www.w3.org/1999/xhtml".equals(
289 attrNS)))) {
290 i++;
291 } else {
292 attrs.removeAttribute(i);
293 }
294 }
295 super.startElement(uri, local, qname, attrs);
296 }
297 }
298 }
299 }