001 /*
002 * Copyright (c) 2005 Henri Sivonen
003 *
004 * Permission is hereby granted, free of charge, to any person obtaining a
005 * copy of this software and associated documentation files (the "Software"),
006 * to deal in the Software without restriction, including without limitation
007 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
008 * and/or sell copies of the Software, and to permit persons to whom the
009 * Software is furnished to do so, subject to the following conditions:
010 *
011 * The above copyright notice and this permission notice shall be included in
012 * all copies or substantial portions of the Software.
013 *
014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
020 * DEALINGS IN THE SOFTWARE.
021 */
022
023 package fi.iki.hsivonen.htmlparser;
024
025 import java.util.regex.Matcher;
026 import java.util.regex.Pattern;
027
028 import org.xml.sax.Attributes;
029 import org.xml.sax.SAXException;
030
031 import fi.iki.hsivonen.xml.ContentHandlerFilter;
032
033 /**
034 * @version $Id: CharacterEncodingDeclarationFilter.java,v 1.5 2006/11/14 22:32:44 hsivonen Exp $
035 * @author hsivonen
036 */
037 public final class CharacterEncodingDeclarationFilter extends ContentHandlerFilter {
038 private static final String XHTML_NS = "http://www.w3.org/1999/xhtml";
039
040 private static final int DOC_STARTED = 0;
041 private static final int HTML_OPEN = 1;
042 private static final int HEAD_OPEN = 2;
043 private static final int SITUATION_OVER = 3;
044
045 // XXX should white space and case-insensitivity be allowed
046 private static final Pattern CONTENT_TYPE_PATTERN = Pattern.compile("^[ \t\r\n]*Content-Type[ \t\r\n]*$", Pattern.CASE_INSENSITIVE);
047
048 // XXX should white space and case-insensitivity be allowed
049 // charset name pattern based on RFC 2978
050 private static final Pattern CONTENT_PATTERN = Pattern.compile("^[ \t\r\n]*text/html[ \t\r\n]*;[ \t\r\n]*charset[ \t\r\n]*=[ \t\r\n]*([a-zA-Z0-9!#$%&\'+^_`{}~-]+)[ \t\r\n]*$", Pattern.CASE_INSENSITIVE);
051
052 private int state = DOC_STARTED;
053
054 private HtmlParser parser;
055
056 private boolean swallowEnd;
057
058 /**
059 * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String, java.lang.String)
060 */
061 public void endElement(String uri, String local, String qName)
062 throws SAXException {
063 if(swallowEnd) {
064 swallowEnd = false;
065 return;
066 }
067 if (state != SITUATION_OVER) {
068 if(XHTML_NS.equals(uri)) {
069 if("head".equals(local) || "html".equals(local)) {
070 state = SITUATION_OVER;
071 parser.setEncoding(null);
072 }
073 }
074 }
075 super.endElement(uri, local, qName);
076 }
077 /**
078 * @see org.xml.sax.ContentHandler#startDocument()
079 */
080 public void startDocument() throws SAXException {
081 state = DOC_STARTED;
082 swallowEnd = false;
083 super.startDocument();
084 }
085 /**
086 * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
087 */
088 public void startElement(String uri, String local, String qName,
089 Attributes attrs) throws SAXException {
090 if(XHTML_NS.equals(uri)) {
091 if(state == DOC_STARTED) {
092 if("html".equals(local)) {
093 state = HTML_OPEN;
094 } else {
095 state = SITUATION_OVER;
096 parser.setEncoding(null);
097 }
098 } else if (state == HTML_OPEN){
099 if("head".equals(local)) {
100 state = HEAD_OPEN;
101 } else {
102 state = SITUATION_OVER;
103 parser.setEncoding(null);
104 }
105 } else if (state == HEAD_OPEN) {
106 if("meta".equals(local)) {
107 String httpEquiv = attrs.getValue("http-equiv");
108 if(httpEquiv != null) {
109 Matcher m = CONTENT_TYPE_PATTERN.matcher(httpEquiv);
110 if(m.matches()) {
111 if (attrs.getLength() == 2) {
112 String content = attrs.getValue("content");
113 if (content != null) {
114 m = CONTENT_PATTERN.matcher(content);
115 if (m.matches()) {
116 parser.setEncoding(m.group(1));
117 swallowEnd = true;
118 return;
119 } else {
120 // from WA1
121 err("The \u201Ccontent\u201D attribute of the \u201Cmeta\u201D element did not contain the string \u201Ctext/html; charset=\u201D followed by an IANA character encoding name.");
122 }
123 } else {
124 err("There was no \u201Ccontent\u201D attribute on the \u201Cmeta\u201D element.");
125 }
126 } else {
127 // from WA1
128 err("When the element \u201Cmeta\u201D is used for declaring the character encoding, it must have exactly two attributes: \u201Chttp-equiv\u201D and \u201Ccontent\u201D.");
129 }
130 } else {
131 // from WA1
132 err("The element \u201Cmeta\u201D with the attribute \u201Chttp-equiv\u201D is only allowed when it is used for declaring the character encoding.");
133 }
134 } else {
135 state = SITUATION_OVER;
136 parser.setEncoding(null);
137 }
138 } else {
139 state = SITUATION_OVER;
140 parser.setEncoding(null);
141 }
142 } else {
143 if("meta".equals(local)) {
144 if(attrs.getIndex("http-equiv") >= 0) {
145 // from WA1
146 err("The element \u201Cmeta\u201D with the attribute \u201Chttp-equiv\u201D is not allowed at this point. The attribute \u201Chttp-equiv\u201D is only appropriate for declaring the character encoding and in that case the element must be the first element child of the \u201Chead\u201D element.");
147 }
148 }
149 }
150 }
151 super.startElement(uri, local, qName, attrs);
152 }
153 /**
154 * @param parser
155 */
156 public CharacterEncodingDeclarationFilter(HtmlParser parser) {
157 this.parser = parser;
158 }
159 }