001    /*
002     * Copyright (c) 2005 Henri Sivonen
003     *
004     * Permission is hereby granted, free of charge, to any person obtaining a 
005     * copy of this software and associated documentation files (the "Software"), 
006     * to deal in the Software without restriction, including without limitation 
007     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
008     * and/or sell copies of the Software, and to permit persons to whom the 
009     * Software is furnished to do so, subject to the following conditions:
010     *
011     * The above copyright notice and this permission notice shall be included in 
012     * all copies or substantial portions of the Software.
013     *
014     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
015     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
016     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
017     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
018     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
019     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
020     * DEALINGS IN THE SOFTWARE.
021     */
022    
023    package fi.iki.hsivonen.htmlparser;
024    
025    import java.util.regex.Matcher;
026    import java.util.regex.Pattern;
027    
028    import org.xml.sax.Attributes;
029    import org.xml.sax.SAXException;
030    
031    import fi.iki.hsivonen.xml.ContentHandlerFilter;
032    
033    /**
034     * @version $Id: CharacterEncodingDeclarationFilter.java,v 1.5 2006/11/14 22:32:44 hsivonen Exp $
035     * @author hsivonen
036     */
037    public final class CharacterEncodingDeclarationFilter extends ContentHandlerFilter {
038        private static final String XHTML_NS = "http://www.w3.org/1999/xhtml";
039    
040        private static final int DOC_STARTED = 0;
041        private static final int HTML_OPEN = 1;
042        private static final int HEAD_OPEN = 2;
043        private static final int SITUATION_OVER = 3;
044        
045        // XXX should white space and case-insensitivity be allowed
046        private static final Pattern CONTENT_TYPE_PATTERN = Pattern.compile("^[ \t\r\n]*Content-Type[ \t\r\n]*$", Pattern.CASE_INSENSITIVE);
047    
048        // XXX should white space and case-insensitivity be allowed
049        // charset name pattern based on RFC 2978
050        private static final Pattern CONTENT_PATTERN = Pattern.compile("^[ \t\r\n]*text/html[ \t\r\n]*;[ \t\r\n]*charset[ \t\r\n]*=[ \t\r\n]*([a-zA-Z0-9!#$%&\'+^_`{}~-]+)[ \t\r\n]*$", Pattern.CASE_INSENSITIVE);
051        
052        private int state = DOC_STARTED;
053    
054        private HtmlParser parser;
055    
056        private boolean swallowEnd;
057        
058        /**
059         * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
060         */
061        public void endElement(String uri, String local, String qName)
062                throws SAXException {
063            if(swallowEnd) {
064                swallowEnd = false;
065                return;
066            }
067            if (state != SITUATION_OVER) {
068                if(XHTML_NS.equals(uri)) {
069                    if("head".equals(local) || "html".equals(local)) {
070                        state = SITUATION_OVER;
071                        parser.setEncoding(null);
072                    }
073                }
074            }
075            super.endElement(uri, local, qName);
076        }
077        /**
078         * @see org.xml.sax.ContentHandler#startDocument()
079         */
080        public void startDocument() throws SAXException {
081            state = DOC_STARTED;
082            swallowEnd = false;
083            super.startDocument();
084        }
085        /**
086         * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
087         */
088        public void startElement(String uri, String local, String qName,
089                Attributes attrs) throws SAXException {
090            if(XHTML_NS.equals(uri)) {
091                if(state == DOC_STARTED) {
092                    if("html".equals(local)) {
093                        state = HTML_OPEN;
094                    } else {
095                        state = SITUATION_OVER;
096                        parser.setEncoding(null);
097                    }
098                } else if (state == HTML_OPEN){
099                    if("head".equals(local)) {
100                        state = HEAD_OPEN;
101                    } else {
102                        state = SITUATION_OVER;
103                        parser.setEncoding(null);
104                    }
105                } else if (state == HEAD_OPEN) {
106                    if("meta".equals(local)) {
107                        String httpEquiv = attrs.getValue("http-equiv");
108                        if(httpEquiv != null) {
109                            Matcher m = CONTENT_TYPE_PATTERN.matcher(httpEquiv);
110                            if(m.matches()) {
111                                if (attrs.getLength() == 2) {
112                                    String content = attrs.getValue("content");
113                                    if (content != null) {
114                                        m = CONTENT_PATTERN.matcher(content);
115                                        if (m.matches()) {
116                                            parser.setEncoding(m.group(1));
117                                            swallowEnd = true;
118                                            return;
119                                        } else {
120                                            // from WA1
121                                            err("The \u201Ccontent\u201D attribute of the \u201Cmeta\u201D element did not contain the string \u201Ctext/html; charset=\u201D followed by an IANA character encoding name.");
122                                        }
123                                    } else {
124                                        err("There was no \u201Ccontent\u201D attribute on the \u201Cmeta\u201D element.");
125                                    }
126                                } else {
127                                    // from WA1
128                                    err("When the element \u201Cmeta\u201D is used for declaring the character encoding, it must have exactly two attributes: \u201Chttp-equiv\u201D and \u201Ccontent\u201D.");
129                                }
130                            } else {
131                                // from WA1
132                                err("The element \u201Cmeta\u201D with the attribute \u201Chttp-equiv\u201D is only allowed when it is used for declaring the character encoding.");
133                            }
134                        } else {
135                            state = SITUATION_OVER;                                            
136                            parser.setEncoding(null);
137                        }
138                    } else {
139                        state = SITUATION_OVER;                    
140                        parser.setEncoding(null);
141                    }
142                } else {
143                    if("meta".equals(local)) {
144                        if(attrs.getIndex("http-equiv") >= 0) {
145                            // from WA1
146                            err("The element \u201Cmeta\u201D with the attribute \u201Chttp-equiv\u201D is not allowed at this point. The attribute \u201Chttp-equiv\u201D is only appropriate for declaring the character encoding and in that case the element must be the first element child of the \u201Chead\u201D element.");
147                        }
148                    }
149                }
150            }
151            super.startElement(uri, local, qName, attrs);
152        }
153        /**
154         * @param parser
155         */
156        public CharacterEncodingDeclarationFilter(HtmlParser parser) {
157            this.parser = parser;
158        }
159    }