001    /*
002     * Copyright (c) 2005, 2006 Henri Sivonen
003     *
004     * Permission is hereby granted, free of charge, to any person obtaining a 
005     * copy of this software and associated documentation files (the "Software"), 
006     * to deal in the Software without restriction, including without limitation 
007     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
008     * and/or sell copies of the Software, and to permit persons to whom the 
009     * Software is furnished to do so, subject to the following conditions:
010     *
011     * The above copyright notice and this permission notice shall be included in 
012     * all copies or substantial portions of the Software.
013     *
014     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
015     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
016     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
017     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
018     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
019     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
020     * DEALINGS IN THE SOFTWARE.
021     */
022    
023    package fi.iki.hsivonen.htmlparser;
024    
025    import java.util.Arrays;
026    
027    import org.xml.sax.Attributes;
028    import org.xml.sax.SAXException;
029    
030    import fi.iki.hsivonen.xml.ContentHandlerFilter;
031    import fi.iki.hsivonen.xml.EmptyAttributes;
032    
033    /**
034     * @version $Id: TagInferenceFilter.java,v 1.7 2006/04/18 11:50:40 hsivonen Exp $
035     * @author hsivonen
036     */
037    public final class TagInferenceFilter extends ContentHandlerFilter {
038        private static final String XHTML_NS = "http://www.w3.org/1999/xhtml";
039    
040        private static final String[][] END_CAUSING_STARTS = {
041                /* body */{},
042                /* colgroup */{ "colgroup", "tbody", "tfoot", "thead", "tr" },
043                /* dd */{ "dd", "dt" },
044                /* dt */{ "dd", "dt" },
045                /* head */{}, // handled separately
046                /* html */{},
047                /* li */{ "li" },
048                /* option */{"optgroup", "option"},
049                /* p */{ "address", "blockquote", "center", "dd", "div", "dl",
050                        "dt", "fieldset", "form", "h1", "h2", "h3", "h4", "h5",
051                        "h6", "hr", "isindex", "li", "noframes", "noscript", "ol",
052                        "p", "pre", "table", "tbody", "td", "tfoot", "th", "tr",
053                        "ul" },
054                /* tbody */{ "tbody" },
055                /* td */{ "tbody", "td", "tfoot", "th", "tr" },
056                /* tfoot */{ "tbody" },
057                /* th */{ "tbody", "td", "tfoot", "th", "tr" },
058                /* thead */{ "tbody", "tfoot" },
059                /* tr */{ "tbody", "tfoot", "tr" } };
060    
061        private static final String[] OPTIONAL_END = { "body", "colgroup", "dd",
062                "dt", "head", "html", "li", "option", "p", "tbody", "td", "tfoot", "th",
063                "thead", "tr" };
064    
065        private static final String[] HEAD_CHILDREN = { "base", "bgsound",
066                "isindex", "link", "meta", "object", "script", "style", "title" };
067    
068        private String[] stack = new String[48];
069    
070        private int stackIndex = 0;
071    
072        private HtmlParser parser;
073    
074        private boolean headClosed;
075    
076        private static boolean isOptionalEnd(String name) {
077            return Arrays.binarySearch(OPTIONAL_END, name) > -1;
078        }
079    
080        private static boolean isHeadChild(String name) {
081            return Arrays.binarySearch(HEAD_CHILDREN, name) > -1;
082        }
083    
084        private static boolean startImpliesEnd(String start, String top) {
085            if (top == null) {
086                return false;
087            }
088            if ("head".equals(top)) {
089                return !isHeadChild(start);
090            }
091            int i = Arrays.binarySearch(OPTIONAL_END, top);
092            if (i < 0) {
093                return false;
094            }
095            return Arrays.binarySearch(END_CAUSING_STARTS[i], start) > -1;
096        }
097    
098        private void push(String str) {
099            stackIndex++;
100            if (stackIndex == stack.length) {
101                String[] newStack = new String[stack.length + 16];
102                System.arraycopy(stack, 0, newStack, 0, stack.length);
103                stack = newStack;
104            }
105            stack[stackIndex] = str;
106        }
107    
108        private String pop() {
109            String rv = stack[stackIndex];
110            if (stackIndex > 0) {
111                stackIndex--;
112            }
113            return rv;
114        }
115    
116        private String peek() {
117            return stack[stackIndex];
118        }
119    
120        private boolean isEmpty() {
121            return stackIndex == 0;
122        }
123    
124        /**
125         * @see org.xml.sax.ContentHandler#endDocument()
126         */
127        public void flushStack() throws SAXException {
128            for (;;) {
129                String top = pop();
130                if (top == null) {
131                    return;
132                }
133                if (isOptionalEnd(top)) {
134                    endElement(top);
135                } else {
136                    fatal("Document ended but there were unclosed elements.");
137                }
138            }
139        }
140    
141        /**
142         * @see org.xml.sax.ContentHandler#endElement(java.lang.String,
143         *      java.lang.String, java.lang.String)
144         */
145        public void endElement(String uri, String local, String qName)
146                throws SAXException {
147            for (;;) {
148                String top = pop();
149                if (top == null) {
150                    fatal("End tag for an element that was not open.");
151                } else if (top.equals(local)) {
152                    endElement(top);
153                    break;
154                } else {
155                    if (isOptionalEnd(top)) {
156                        endElement(top);
157                    } else {
158                        fatal("Stray end tag: " + local);
159                    }
160                }
161            }
162            if (isEmpty()) {
163                super.endPrefixMapping("");
164                parser.setNonWhiteSpaceAllowed(false);
165            }
166        }
167    
168        /**
169         * @param uri
170         * @param name
171         * @throws SAXException
172         */
173        private void endElement(String name) throws SAXException {
174            super.endElement(XHTML_NS, name, name);
175            if ("head".equals(name)) {
176                headClosed = true;
177            }
178        }
179    
180        /**
181         * @see org.xml.sax.ContentHandler#startDocument()
182         */
183        public void startDocument() throws SAXException {
184            headClosed = false;
185            stackIndex = 0;
186            stack[0] = null;
187            super.startDocument();
188        }
189    
190        /**
191         * @see org.xml.sax.ContentHandler#startElement(java.lang.String, java.lang.String, java.lang.String, org.xml.sax.Attributes)
192         */
193        public void startElement(String uri, String local, String qName,
194                Attributes attrs) throws SAXException {
195            if (isEmpty()) {
196                super.startPrefixMapping("", XHTML_NS);
197            }
198            boolean wereInferences;
199            String top = null;
200            do {
201                wereInferences = false;
202                for (;;) {
203                    top = peek();
204                    if (startImpliesEnd(local, top)) {
205                        pop();
206                        endElement(top);
207                        wereInferences = true;
208                    } else {
209                        break;
210                    }
211                }
212                top = peek();
213                if ("table".equals(top) && "tr".equals(local)) {
214                    startElement("tbody");
215                    wereInferences = true;
216                } else if (top == null && !"html".equals(local)) {
217                    startElement("html");
218                    wereInferences = true;
219                } else if ("html".equals(top) && !"head".equals(local)
220                        && !headClosed) {
221                    startElement("head");
222                    wereInferences = true;
223                } else if ("html".equals(top) && !"body".equals(local)
224                        && headClosed) {
225                    startElement("body");
226                    wereInferences = true;
227                }
228            } while (wereInferences);
229    
230            push(local);
231            super.startElement(uri, local, qName, attrs);
232            parser.setNonWhiteSpaceAllowed(true);
233        }
234    
235        /**
236         * @param string
237         * @throws SAXException
238         */
239        private void startElement(String name) throws SAXException {
240            push(name);
241            super.startElement(XHTML_NS, name, name,
242                    EmptyAttributes.EMPTY_ATTRIBUTES);
243            parser.setNonWhiteSpaceAllowed(true);
244        }
245    
246        /**
247         * @param parser
248         */
249        public TagInferenceFilter(HtmlParser parser) {
250            this.parser = parser;
251        }
252    }