001    /*
002     * Copyright (c) 2003, 2004 Henri Sivonen
003     *
004     * Permission is hereby granted, free of charge, to any person obtaining a 
005     * copy of this software and associated documentation files (the "Software"), 
006     * to deal in the Software without restriction, including without limitation 
007     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
008     * and/or sell copies of the Software, and to permit persons to whom the 
009     * Software is furnished to do so, subject to the following conditions:
010     *
011     * The above copyright notice and this permission notice shall be included in 
012     * all copies or substantial portions of the Software.
013     *
014     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
015     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
016     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
017     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
018     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
019     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
020     * DEALINGS IN THE SOFTWARE.
021     */
022    
023    package fi.iki.hsivonen.xml;
024    import gnu.xml.pipeline.EventConsumer;
025    import gnu.xml.pipeline.EventFilter;
026    
027    import java.util.Arrays;
028    
029    import org.xml.sax.Attributes;
030    import org.xml.sax.SAXException;
031    import org.xml.sax.helpers.AttributesImpl;
032    
033    /**
034     *
035     * @author  hsivonen
036     */
037    public class XHTMLCruftDropper extends EventFilter {
038    
039        private static final String[] normalAttrs =
040            {
041                "accesskey",
042                "cite",
043                "class",
044                "datetime",
045                "dir",
046                "id",
047                "lang",
048                "onblur",
049                "onclick",
050                "ondblclick",
051                "onfocus",
052                "onkeydown",
053                "onkeypress",
054                "onkeyup",
055                "onmousedown",
056                "onmousemove",
057                "onmouseout",
058                "onmouseup",
059                "style",
060                "tabindex",
061                "title" };
062    
063        private static final String[] normalElts =
064            {
065                "abbr",
066                "acronym",
067                "address",
068                "b",
069                "big",
070                "blockquote",
071                "br",
072                "caption",
073                "cite",
074                "code",
075                "dd",
076                "del",
077                "dfn",
078                "div",
079                "dl",
080                "dt",
081                "em",
082                "fieldset",
083                "h1",
084                "h2",
085                "h3",
086                "h4",
087                "h5",
088                "h6",
089                "hr",
090                "i",
091                "input",
092                "ins",
093                "kbd",
094                "li",
095                "noscript",
096                "ol",
097                "p",
098                "pre",
099                "q",
100                "samp",
101                "small",
102                "span",
103                "strong",
104                "sub",
105                "sup",
106                "tt",
107                "ul",
108                "var" };
109    
110        private static final String[] specialElts =
111            {
112                "a",
113                "bdo",
114                "body",
115                "button",
116                "col",
117                "colgroup",
118                "form",
119                "head",
120                "html",
121                "img",
122                "label",
123                "link",
124                "map",
125                "meta",
126                "object",
127                "script",
128                "script",
129                "select",
130                "style",
131                "table",
132                "tbody",
133                "td",
134                "textarea",
135                "tfoot",
136                "th",
137                "thead",
138                "title",
139                "tr" };
140    
141        private static final String[] specialAttrs =
142            {
143                "abbr",
144                "accept",
145                "accept-charset",
146                "accesskey",
147                "action",
148                "align",
149                "alt",
150                "archive",
151                "axis",
152                "border",
153                "cellpadding",
154                "char",
155                "charoff",
156                "charset",
157                "checked",
158                "classid",
159                "codebase",
160                "cols",
161                "colspan",
162                "content",
163                "coords",
164                "data",
165                "declare",
166                "defer",
167                "dir",
168                "disabled",
169                "enctype",
170                "for",
171                "frame",
172                "headers",
173                "height",
174                "href",
175                "hreflang",
176                "http-equiv",
177                "id",
178                "ismap",
179                "label",
180                "longdesc",
181                "maxlength",
182                "media",
183                "method",
184                "multiple",
185                "name",
186                "nohref",
187                "onblur",
188                "onchange",
189                "onchange",
190                "onfocus",
191                "onload",
192                "onreset",
193                "onselect",
194                "onselect",
195                "onsubmit",
196                "onunload",
197                "readonly",
198                "rel",
199                "rev",
200                "rows",
201                "rowspan",
202                "rules",
203                "scheme",
204                "scope",
205                "selected",
206                "shape",
207                "size",
208                "span",
209                "src",
210                "standby",
211                "summary",
212                "tabindex",
213                "type",
214                "usemap",
215                "valign",
216                "value",
217                "valuetype",
218                "width" };
219    
220        /** Creates a new instance of XHTMLStricter */
221        public XHTMLCruftDropper(EventConsumer next) {
222            super(next);
223            setContentHandler(this);
224        }
225    
226        public void endElement(String uri, String local, String qname)
227            throws SAXException {
228            if ("http://www.w3.org/1999/xhtml".equals(uri)
229                && (Arrays.binarySearch(normalElts, local) >= 0
230                    || Arrays.binarySearch(specialElts, local) >= 0)) {
231                super.endElement(uri, local, qname);
232            }
233        }
234    
235        public void ignorableWhitespace(char[] values, int param, int param2) {
236        }
237    
238        public void skippedEntity(String str) throws SAXException {
239            throw new SAXException("enresolved entity");
240        }
241    
242        public void startElement(
243            String uri,
244            String local,
245            String qname,
246            Attributes attributes)
247            throws SAXException {
248            if ("http://www.w3.org/1999/xhtml".equals(uri)) {
249                if (Arrays.binarySearch(normalElts, local) >= 0) {
250                    AttributesImpl attrs = new AttributesImpl(attributes);
251                    int i = 0;
252                    String attrLocal;
253                    String attrNS;
254                    while (i < attrs.getLength()) {
255                        attrLocal = attrs.getLocalName(i);
256                        attrNS = attrs.getURI(i);
257                        if (("lang".equals(attrLocal)
258                            && "http://www.w3.org/XML/1998/namespace".equals(attrNS))
259                            || ("space".equals(attrLocal)
260                                && "http://www.w3.org/XML/1998/namespace".equals(
261                                    attrNS))
262                            || (Arrays.binarySearch(normalAttrs, attrLocal) >= 0
263                                && ("".equals(attrNS)
264                                    || "http://www.w3.org/1999/xhtml".equals(
265                                        attrNS)))) {
266                            i++;
267                        } else {
268                            attrs.removeAttribute(i);
269                        }
270                    }
271                    super.startElement(uri, local, qname, attrs);
272                } else if (Arrays.binarySearch(specialElts, local) >= 0) {
273                    AttributesImpl attrs = new AttributesImpl(attributes);
274                    int i = 0;
275                    String attrLocal;
276                    String attrNS;
277                    while (i < attrs.getLength()) {
278                        attrLocal = attrs.getLocalName(i);
279                        attrNS = attrs.getURI(i);
280                        if (("lang".equals(attrLocal)
281                            && "http://www.w3.org/XML/1998/namespace".equals(attrNS))
282                            || ("space".equals(attrLocal)
283                                && "http://www.w3.org/XML/1998/namespace".equals(
284                                    attrNS))
285                            || ((Arrays.binarySearch(normalAttrs, attrLocal) >= 0
286                                || Arrays.binarySearch(specialAttrs, attrLocal) >= 0)
287                                && ("".equals(attrNS)
288                                    || "http://www.w3.org/1999/xhtml".equals(
289                                        attrNS)))) {
290                            i++;
291                        } else {
292                            attrs.removeAttribute(i);
293                        }
294                    }
295                    super.startElement(uri, local, qname, attrs);
296                }
297            }
298        }
299    }