001 /* 002 * Copyright (c) 2005 Henri Sivonen 003 * 004 * Permission is hereby granted, free of charge, to any person obtaining a 005 * copy of this software and associated documentation files (the "Software"), 006 * to deal in the Software without restriction, including without limitation 007 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 008 * and/or sell copies of the Software, and to permit persons to whom the 009 * Software is furnished to do so, subject to the following conditions: 010 * 011 * The above copyright notice and this permission notice shall be included in 012 * all copies or substantial portions of the Software. 013 * 014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 020 * DEALINGS IN THE SOFTWARE. 021 */ 022 023 package fi.iki.hsivonen.htmlparser; 024 025 import java.util.regex.Matcher; 026 import java.util.regex.Pattern; 027 028 import org.xml.sax.Attributes; 029 import org.xml.sax.SAXException; 030 031 import fi.iki.hsivonen.xml.ContentHandlerFilter; 032 033 /** 034 * @version $Id: CharacterEncodingDeclarationFilter.java,v 1.5 2006/11/14 22:32:44 hsivonen Exp $ 035 * @author hsivonen 036 */ 037 public final class CharacterEncodingDeclarationFilter extends ContentHandlerFilter { 038 private static final String XHTML_NS = "http://www.w3.org/1999/xhtml"; 039 040 private static final int DOC_STARTED = 0; 041 private static final int HTML_OPEN = 1; 042 private static final int HEAD_OPEN = 2; 043 private static final int SITUATION_OVER = 3; 044 045 // XXX should white space and case-insensitivity be allowed 046 private static final Pattern CONTENT_TYPE_PATTERN = Pattern.compile("^[ \t\r\n]*Content-Type[ \t\r\n]*$", Pattern.CASE_INSENSITIVE); 047 048 // XXX should white space and case-insensitivity be allowed 049 // charset name pattern based on RFC 2978 050 private static final Pattern CONTENT_PATTERN = Pattern.compile("^[ \t\r\n]*text/html[ \t\r\n]*;[ \t\r\n]*charset[ \t\r\n]*=[ \t\r\n]*([a-zA-Z0-9!#$%&\'+^_`{}~-]+)[ \t\r\n]*$", Pattern.CASE_INSENSITIVE); 051 052 private int state = DOC_STARTED; 053 054 private HtmlParser parser; 055 056 private boolean swallowEnd; 057 058 /** 059 * @see org.xml.sax.ContentHandler#endElement(java.lang.String, java.lang.String, java.lang.String) 060 */ 061 public void endElement(String uri, String local, String qName) 062 throws SAXException { 063 if(swallowEnd) { 064 swallowEnd = false; 065 return; 066 } 067 if (state != SITUATION_OVER) { 068 if(XHTML_NS.equals(uri)) { 069 if("head".equals(local) || "html".equals(local)) { 070 state = SITUATION_OVER; 071 parser.setEncoding(null); 072 } 073 } 074 } 075 super.endElement(uri, local, qName); 076 } 077 /** 078 * @see org.xml.sax.ContentHandler#startDocument() 079 */ 080 public void startDocument() throws SAXException { 081 state = DOC_STARTED; 082 swallowEnd = false; 083 super.startDocument(); 084 } 085 /** 086 * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes) 087 */ 088 public void startElement(String uri, String local, String qName, 089 Attributes attrs) throws SAXException { 090 if(XHTML_NS.equals(uri)) { 091 if(state == DOC_STARTED) { 092 if("html".equals(local)) { 093 state = HTML_OPEN; 094 } else { 095 state = SITUATION_OVER; 096 parser.setEncoding(null); 097 } 098 } else if (state == HTML_OPEN){ 099 if("head".equals(local)) { 100 state = HEAD_OPEN; 101 } else { 102 state = SITUATION_OVER; 103 parser.setEncoding(null); 104 } 105 } else if (state == HEAD_OPEN) { 106 if("meta".equals(local)) { 107 String httpEquiv = attrs.getValue("http-equiv"); 108 if(httpEquiv != null) { 109 Matcher m = CONTENT_TYPE_PATTERN.matcher(httpEquiv); 110 if(m.matches()) { 111 if (attrs.getLength() == 2) { 112 String content = attrs.getValue("content"); 113 if (content != null) { 114 m = CONTENT_PATTERN.matcher(content); 115 if (m.matches()) { 116 parser.setEncoding(m.group(1)); 117 swallowEnd = true; 118 return; 119 } else { 120 // from WA1 121 err("The \u201Ccontent\u201D attribute of the \u201Cmeta\u201D element did not contain the string \u201Ctext/html; charset=\u201D followed by an IANA character encoding name."); 122 } 123 } else { 124 err("There was no \u201Ccontent\u201D attribute on the \u201Cmeta\u201D element."); 125 } 126 } else { 127 // from WA1 128 err("When the element \u201Cmeta\u201D is used for declaring the character encoding, it must have exactly two attributes: \u201Chttp-equiv\u201D and \u201Ccontent\u201D."); 129 } 130 } else { 131 // from WA1 132 err("The element \u201Cmeta\u201D with the attribute \u201Chttp-equiv\u201D is only allowed when it is used for declaring the character encoding."); 133 } 134 } else { 135 state = SITUATION_OVER; 136 parser.setEncoding(null); 137 } 138 } else { 139 state = SITUATION_OVER; 140 parser.setEncoding(null); 141 } 142 } else { 143 if("meta".equals(local)) { 144 if(attrs.getIndex("http-equiv") >= 0) { 145 // from WA1 146 err("The element \u201Cmeta\u201D with the attribute \u201Chttp-equiv\u201D is not allowed at this point. The attribute \u201Chttp-equiv\u201D is only appropriate for declaring the character encoding and in that case the element must be the first element child of the \u201Chead\u201D element."); 147 } 148 } 149 } 150 } 151 super.startElement(uri, local, qName, attrs); 152 } 153 /** 154 * @param parser 155 */ 156 public CharacterEncodingDeclarationFilter(HtmlParser parser) { 157 this.parser = parser; 158 } 159 }