001    /*
002     * Copyright (c) 2005 Henri Sivonen
003     *
004     * Permission is hereby granted, free of charge, to any person obtaining a 
005     * copy of this software and associated documentation files (the "Software"), 
006     * to deal in the Software without restriction, including without limitation 
007     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
008     * and/or sell copies of the Software, and to permit persons to whom the 
009     * Software is furnished to do so, subject to the following conditions:
010     *
011     * The above copyright notice and this permission notice shall be included in 
012     * all copies or substantial portions of the Software.
013     *
014     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
015     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
016     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
017     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
018     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
019     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
020     * DEALINGS IN THE SOFTWARE.
021     */
022    
023    package fi.iki.hsivonen.htmlparser;
024    
025    import java.util.Arrays;
026    
027    import org.xml.sax.Attributes;
028    import org.xml.sax.SAXException;
029    
030    import fi.iki.hsivonen.xml.ContentHandlerFilter;
031    
032    /**
033     * @version $Id: EmptyElementFilter.java,v 1.7 2006/12/01 12:34:31 hsivonen Exp $
034     * @author hsivonen
035     */
036    public final class EmptyElementFilter extends ContentHandlerFilter {
037        private static final String XHTML_NS = "http://www.w3.org/1999/xhtml";
038    
039        /**
040         * HTML 4.01 Strict elements which don't have an end tag
041         */
042        private static final String[] EMPTY_ELEMENTS = { "area", "base",
043                "basefont", "br", "col", "command", "frame", "hr", "img", "input",
044                "isindex", "link", "meta", "param" };
045    
046        // should we include things like <spacer> and <image>?
047        
048    //    01:22 < Hixie> well, my list right now is     base, link, meta, hr, br, img, 
049    //    embed, param, area, col, input  
050    //01:22 < Hixie> plus command and event-source
051    //    
052        private static final boolean isEmpty(String name) {
053            return (Arrays.binarySearch(EMPTY_ELEMENTS, name) >= 0);
054        }
055    
056        /**
057         * @see org.xml.sax.ContentHandler#endElement(java.lang.String,
058         *      java.lang.String, java.lang.String)
059         */
060        public void endElement(String uri, String local, String qName)
061                throws SAXException {
062            if (XHTML_NS.equals(uri)) {
063                if (isEmpty(local)) {
064                    fatal("End tag \u201C"
065                            + local
066                            + "\u201D seen even though the element is an empty element.");
067                }
068            }
069            super.endElement(uri, local, qName);
070        }
071    
072        /**
073         * @see org.xml.sax.ContentHandler#startElement(java.lang.String,
074         *      java.lang.String, java.lang.String, org.xml.sax.Attributes)
075         */
076        public void startElement(String uri, String local, String qName,
077                Attributes attrs) throws SAXException {
078            // FIXME just dropping base for now
079            boolean drop = "base".equals(local);
080            if (!drop) {
081                super.startElement(uri, local, qName, attrs);
082            }
083            if (XHTML_NS.equals(uri)) {
084                if (!drop && isEmpty(local)) {
085                    super.endElement(uri, local, qName);
086                }
087            }
088        }
089    }