001 /* 002 * Copyright (c) 2005, 2006 Henri Sivonen 003 * 004 * Permission is hereby granted, free of charge, to any person obtaining a 005 * copy of this software and associated documentation files (the "Software"), 006 * to deal in the Software without restriction, including without limitation 007 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 008 * and/or sell copies of the Software, and to permit persons to whom the 009 * Software is furnished to do so, subject to the following conditions: 010 * 011 * The above copyright notice and this permission notice shall be included in 012 * all copies or substantial portions of the Software. 013 * 014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 020 * DEALINGS IN THE SOFTWARE. 021 */ 022 023 package fi.iki.hsivonen.htmlparser; 024 025 import java.util.Arrays; 026 027 import org.xml.sax.Attributes; 028 import org.xml.sax.SAXException; 029 030 import fi.iki.hsivonen.xml.ContentHandlerFilter; 031 import fi.iki.hsivonen.xml.EmptyAttributes; 032 033 /** 034 * @version $Id: TagInferenceFilter.java,v 1.7 2006/04/18 11:50:40 hsivonen Exp $ 035 * @author hsivonen 036 */ 037 public final class TagInferenceFilter extends ContentHandlerFilter { 038 private static final String XHTML_NS = "http://www.w3.org/1999/xhtml"; 039 040 private static final String[][] END_CAUSING_STARTS = { 041 /* body */{}, 042 /* colgroup */{ "colgroup", "tbody", "tfoot", "thead", "tr" }, 043 /* dd */{ "dd", "dt" }, 044 /* dt */{ "dd", "dt" }, 045 /* head */{}, // handled separately 046 /* html */{}, 047 /* li */{ "li" }, 048 /* option */{"optgroup", "option"}, 049 /* p */{ "address", "blockquote", "center", "dd", "div", "dl", 050 "dt", "fieldset", "form", "h1", "h2", "h3", "h4", "h5", 051 "h6", "hr", "isindex", "li", "noframes", "noscript", "ol", 052 "p", "pre", "table", "tbody", "td", "tfoot", "th", "tr", 053 "ul" }, 054 /* tbody */{ "tbody" }, 055 /* td */{ "tbody", "td", "tfoot", "th", "tr" }, 056 /* tfoot */{ "tbody" }, 057 /* th */{ "tbody", "td", "tfoot", "th", "tr" }, 058 /* thead */{ "tbody", "tfoot" }, 059 /* tr */{ "tbody", "tfoot", "tr" } }; 060 061 private static final String[] OPTIONAL_END = { "body", "colgroup", "dd", 062 "dt", "head", "html", "li", "option", "p", "tbody", "td", "tfoot", "th", 063 "thead", "tr" }; 064 065 private static final String[] HEAD_CHILDREN = { "base", "bgsound", 066 "isindex", "link", "meta", "object", "script", "style", "title" }; 067 068 private String[] stack = new String[48]; 069 070 private int stackIndex = 0; 071 072 private HtmlParser parser; 073 074 private boolean headClosed; 075 076 private static boolean isOptionalEnd(String name) { 077 return Arrays.binarySearch(OPTIONAL_END, name) > -1; 078 } 079 080 private static boolean isHeadChild(String name) { 081 return Arrays.binarySearch(HEAD_CHILDREN, name) > -1; 082 } 083 084 private static boolean startImpliesEnd(String start, String top) { 085 if (top == null) { 086 return false; 087 } 088 if ("head".equals(top)) { 089 return !isHeadChild(start); 090 } 091 int i = Arrays.binarySearch(OPTIONAL_END, top); 092 if (i < 0) { 093 return false; 094 } 095 return Arrays.binarySearch(END_CAUSING_STARTS[i], start) > -1; 096 } 097 098 private void push(String str) { 099 stackIndex++; 100 if (stackIndex == stack.length) { 101 String[] newStack = new String[stack.length + 16]; 102 System.arraycopy(stack, 0, newStack, 0, stack.length); 103 stack = newStack; 104 } 105 stack[stackIndex] = str; 106 } 107 108 private String pop() { 109 String rv = stack[stackIndex]; 110 if (stackIndex > 0) { 111 stackIndex--; 112 } 113 return rv; 114 } 115 116 private String peek() { 117 return stack[stackIndex]; 118 } 119 120 private boolean isEmpty() { 121 return stackIndex == 0; 122 } 123 124 /** 125 * @see org.xml.sax.ContentHandler#endDocument() 126 */ 127 public void flushStack() throws SAXException { 128 for (;;) { 129 String top = pop(); 130 if (top == null) { 131 return; 132 } 133 if (isOptionalEnd(top)) { 134 endElement(top); 135 } else { 136 fatal("Document ended but there were unclosed elements."); 137 } 138 } 139 } 140 141 /** 142 * @see org.xml.sax.ContentHandler#endElement(java.lang.String, 143 * java.lang.String, java.lang.String) 144 */ 145 public void endElement(String uri, String local, String qName) 146 throws SAXException { 147 for (;;) { 148 String top = pop(); 149 if (top == null) { 150 fatal("End tag for an element that was not open."); 151 } else if (top.equals(local)) { 152 endElement(top); 153 break; 154 } else { 155 if (isOptionalEnd(top)) { 156 endElement(top); 157 } else { 158 fatal("Stray end tag: " + local); 159 } 160 } 161 } 162 if (isEmpty()) { 163 super.endPrefixMapping(""); 164 parser.setNonWhiteSpaceAllowed(false); 165 } 166 } 167 168 /** 169 * @param uri 170 * @param name 171 * @throws SAXException 172 */ 173 private void endElement(String name) throws SAXException { 174 super.endElement(XHTML_NS, name, name); 175 if ("head".equals(name)) { 176 headClosed = true; 177 } 178 } 179 180 /** 181 * @see org.xml.sax.ContentHandler#startDocument() 182 */ 183 public void startDocument() throws SAXException { 184 headClosed = false; 185 stackIndex = 0; 186 stack[0] = null; 187 super.startDocument(); 188 } 189 190 /** 191 * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes) 192 */ 193 public void startElement(String uri, String local, String qName, 194 Attributes attrs) throws SAXException { 195 if (isEmpty()) { 196 super.startPrefixMapping("", XHTML_NS); 197 } 198 boolean wereInferences; 199 String top = null; 200 do { 201 wereInferences = false; 202 for (;;) { 203 top = peek(); 204 if (startImpliesEnd(local, top)) { 205 pop(); 206 endElement(top); 207 wereInferences = true; 208 } else { 209 break; 210 } 211 } 212 top = peek(); 213 if ("table".equals(top) && "tr".equals(local)) { 214 startElement("tbody"); 215 wereInferences = true; 216 } else if (top == null && !"html".equals(local)) { 217 startElement("html"); 218 wereInferences = true; 219 } else if ("html".equals(top) && !"head".equals(local) 220 && !headClosed) { 221 startElement("head"); 222 wereInferences = true; 223 } else if ("html".equals(top) && !"body".equals(local) 224 && headClosed) { 225 startElement("body"); 226 wereInferences = true; 227 } 228 } while (wereInferences); 229 230 push(local); 231 super.startElement(uri, local, qName, attrs); 232 parser.setNonWhiteSpaceAllowed(true); 233 } 234 235 /** 236 * @param string 237 * @throws SAXException 238 */ 239 private void startElement(String name) throws SAXException { 240 push(name); 241 super.startElement(XHTML_NS, name, name, 242 EmptyAttributes.EMPTY_ATTRIBUTES); 243 parser.setNonWhiteSpaceAllowed(true); 244 } 245 246 /** 247 * @param parser 248 */ 249 public TagInferenceFilter(HtmlParser parser) { 250 this.parser = parser; 251 } 252 }