fi.iki.hsivonen.htmlparser
Class HtmlParser

java.lang.Object
  extended by fi.iki.hsivonen.htmlparser.HtmlParser
All Implemented Interfaces:
Locator, XMLReader

public final class HtmlParser
extends Object
implements XMLReader, Locator

WARNING: This parser is incomplete. It does not yet perform tag inference, and it does not yet perform case folding for attribute values such as method="POST".

Version:
$Id: HtmlParser.java,v 1.20 2006/11/21 10:13:24 hsivonen Exp $
Author:
hsivonen

Field Summary
private  boolean alreadyWarnedAboutPrivateUseCharacters
           
private static char[] APOS
           
private  char[] astralChar
           
private  char[] attrBuf
           
private  int attrBufLen
           
private  AttributesImpl attrs
           
private  char[] bmpChar
           
private  char[] buf
           
private  int bufLen
           
private static int CASE_MASK
           
private  int cdataState
           
private  CharacterEncodingDeclarationFilter cedf
           
private  ContentHandler ch
           
private  int col
           
private  int cstart
           
private  DoctypeHandler doctypeHandler
           
private  int doctypeMode
           
private  boolean doctypeSeen
           
private  DTDHandler dtdHandler
           
private  EmptyElementFilter eef
           
private  ErrorHandler eh
           
private  XhtmlSaxEmitter emitter
           
private  String encoding
           
private  EntityResolver entityResolver
           
private  boolean foldedAttributeValue
           
private  boolean html5
           
private static int LEAD_OFFSET
           
private  int line
           
private static char[] LT
           
private  boolean nonWhiteSpaceAllowed
           
private  NormalizationChecker normalizationChecker
           
private static char[] OCTYPE
           
private static int PCDATA
           
private  ContentHandlerFilter pipelineLast
           
private  int pos
           
private  char prev
           
private  String publicId
           
private  Reader reader
           
private static int SCRIPT
           
private  char[] strBuf
           
private  int strBufLen
           
private  InputStream stream
           
private static int STYLE
           
private static int SURROGATE_OFFSET
           
private  String systemId
           
private  TagInferenceFilter tif
           
private static char[] TML
           
private static char[] UBLIC
           
private  boolean wasLt
           
 
Constructor Summary
HtmlParser()
           
 
Method Summary
private  void appendAttrBuf(char c)
           
private  void appendAttrBuf(char[] cs)
           
private  void appendAttrBufAsciiLowerCase(char c)
           
private  void appendAttrBufAsciiLowerCase(char[] cs)
           
private  void appendStrBuf(char c)
           
private  void appendStrBufAsciiLowerCase(char c)
           
private  String attrBufToString()
           
private  void cannotDetermineEncoding()
           
private  void cdataStateEnd(String gi)
           
private  void checkPublicAndSystemIds(String publicId, String systemId)
           
private  void clearAttrBuf()
           
private  void clearStrBuf()
           
private  char consumeAttribute(char c)
           
private  boolean consumeCaseInsensitiveAsciiLetterString(char[] str)
           
private  char[] consumeCharRef()
           
private  char consumeComment()
           
private  void consumeDoctype()
           
private  void consumeEndTag()
           
private  char[] consumeEntityRef(char c)
           
private  void consumeMarkup()
           
private  void consumeMarkupDecl()
           
private  char[] consumeNCR()
           
private  void consumePI()
           
private  void consumeQuotedAttributeValue(char delim)
           
private  void consumeStartTag(char c)
           
private  void doctypeNotOk()
           
private  Reader draconianInputStreamReader(String encoding, InputStream stream, boolean requireAsciiSuperset)
           
private  void err(String message)
           
private  void fatal(String message)
           
private  void fatalIfAttributeExists(String name)
           
private  void flushChars()
           
 int getColumnNumber()
           
 ContentHandler getContentHandler()
           
 DoctypeHandler getDoctypeHandler()
          Returns the doctypeHandler.
 int getDoctypeMode()
          Returns the doctypeMode.
 DTDHandler getDTDHandler()
           
 EntityResolver getEntityResolver()
           
 ErrorHandler getErrorHandler()
           
 boolean getFeature(String key)
           
 int getLineNumber()
           
 Object getProperty(String key)
           
 String getPublicId()
           
 String getSystemId()
           
private  boolean isAstralPrivateUse(int c)
           
private  boolean isForbidden(char c)
           
private  boolean isNameChar(char c)
           
private  boolean isNameStart(char c)
           
private  boolean isNonCharacter(int c)
           
private  boolean isPrivateUse(char c)
           
private  boolean isUnquotedAttributeChar(char c)
           
private  boolean isWhiteSpace(char c)
           
private  void maybeBeginCdata(String gi)
           
private  char next()
           
private  char nextAfterZeroOrMoreWhiteSpace()
           
private  char nextMayEnd()
           
private  void parse()
           
 void parse(InputSource is)
           
 void parse(String url)
           
 void refireStart()
           
private  void sawHtml5Doctype()
           
 void setContentHandler(ContentHandler ch)
           
 void setDoctypeHandler(DoctypeHandler doctypeHandler)
          Sets the doctypeHandler.
 void setDoctypeMode(int doctypeMode)
          Sets the doctypeMode.
 void setDTDHandler(DTDHandler handler)
           
(package private)  void setEncoding(String enc)
           
 void setEntityResolver(EntityResolver entityResolver)
           
 void setErrorHandler(ErrorHandler eh)
           
 void setFeature(String key, boolean value)
           
(package private)  void setNonWhiteSpaceAllowed(boolean allow)
           
 void setProperty(String key, Object value)
           
private  String strBufToString()
           
private  void streamSetup(InputSource is)
           
private  String unescapedStringUntil(char delim)
           
private  void warn(String message)
           
private  void warnAboutPrivateUseChar()
           
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

CASE_MASK

private static final int CASE_MASK
See Also:
Constant Field Values

LEAD_OFFSET

private static final int LEAD_OFFSET
See Also:
Constant Field Values

SURROGATE_OFFSET

private static final int SURROGATE_OFFSET
See Also:
Constant Field Values

LT

private static final char[] LT

APOS

private static final char[] APOS

OCTYPE

private static final char[] OCTYPE

TML

private static final char[] TML

UBLIC

private static final char[] UBLIC

PCDATA

private static final int PCDATA
See Also:
Constant Field Values

SCRIPT

private static final int SCRIPT
See Also:
Constant Field Values

STYLE

private static final int STYLE
See Also:
Constant Field Values

publicId

private String publicId

systemId

private String systemId

nonWhiteSpaceAllowed

private boolean nonWhiteSpaceAllowed

cdataState

private int cdataState

eh

private ErrorHandler eh

ch

private ContentHandler ch

doctypeHandler

private DoctypeHandler doctypeHandler

emitter

private XhtmlSaxEmitter emitter

reader

private Reader reader

pos

private int pos

cstart

private int cstart

buf

private char[] buf

bufLen

private int bufLen

line

private int line

col

private int col

doctypeSeen

private boolean doctypeSeen

doctypeMode

private int doctypeMode

html5

private boolean html5

prev

private char prev

wasLt

private boolean wasLt

strBuf

private char[] strBuf

strBufLen

private int strBufLen

attrBuf

private char[] attrBuf

attrBufLen

private int attrBufLen

attrs

private AttributesImpl attrs

bmpChar

private char[] bmpChar

astralChar

private char[] astralChar

dtdHandler

private DTDHandler dtdHandler

eef

private EmptyElementFilter eef

tif

private TagInferenceFilter tif

cedf

private CharacterEncodingDeclarationFilter cedf

pipelineLast

private ContentHandlerFilter pipelineLast

entityResolver

private EntityResolver entityResolver

encoding

private String encoding

stream

private InputStream stream

foldedAttributeValue

private boolean foldedAttributeValue

alreadyWarnedAboutPrivateUseCharacters

private boolean alreadyWarnedAboutPrivateUseCharacters

normalizationChecker

private NormalizationChecker normalizationChecker
Constructor Detail

HtmlParser

public HtmlParser()
Method Detail

clearStrBuf

private void clearStrBuf()

appendStrBufAsciiLowerCase

private void appendStrBufAsciiLowerCase(char c)
                                 throws SAXException,
                                        IOException
Throws:
SAXException
IOException

appendStrBuf

private void appendStrBuf(char c)
                   throws SAXException,
                          IOException
Throws:
SAXException
IOException

strBufToString

private String strBufToString()

clearAttrBuf

private void clearAttrBuf()

appendAttrBuf

private void appendAttrBuf(char c)
                    throws SAXException,
                           IOException
Throws:
SAXException
IOException

appendAttrBufAsciiLowerCase

private void appendAttrBufAsciiLowerCase(char c)
                                  throws SAXException,
                                         IOException
Throws:
SAXException
IOException

appendAttrBuf

private void appendAttrBuf(char[] cs)
                    throws SAXException,
                           IOException
Parameters:
cs -
Throws:
SAXException
IOException

appendAttrBufAsciiLowerCase

private void appendAttrBufAsciiLowerCase(char[] cs)
                                  throws SAXException,
                                         IOException
Parameters:
cs -
Throws:
SAXException
IOException

attrBufToString

private String attrBufToString()

parse

private void parse()
            throws SAXException,
                   IOException
Throws:
SAXException
IOException

doctypeNotOk

private void doctypeNotOk()
                   throws SAXException,
                          IOException
Throws:
SAXException
IOException

isWhiteSpace

private boolean isWhiteSpace(char c)
Parameters:
c -
Returns:

consumeCharRef

private char[] consumeCharRef()
                       throws SAXException,
                              IOException
Throws:
SAXException
IOException

consumeEntityRef

private char[] consumeEntityRef(char c)
                         throws SAXException,
                                IOException
Parameters:
c -
Throws:
SAXException
IOException

consumeNCR

private char[] consumeNCR()
                   throws SAXException,
                          IOException
Throws:
SAXException
IOException

consumeMarkup

private void consumeMarkup()
                    throws SAXException,
                           IOException
Throws:
SAXException
IOException

next

private char next()
           throws SAXException,
                  IOException
Returns:
Throws:
SAXException
IOException

consumeStartTag

private void consumeStartTag(char c)
                      throws SAXException,
                             IOException
Parameters:
c -
Throws:
SAXException
IOException

maybeBeginCdata

private void maybeBeginCdata(String gi)
Parameters:
gi -

consumeAttribute

private char consumeAttribute(char c)
                       throws SAXException,
                              IOException
Parameters:
c -
Returns:
Throws:
SAXException
IOException

isUnquotedAttributeChar

private boolean isUnquotedAttributeChar(char c)
Parameters:
c -
Returns:

consumeQuotedAttributeValue

private void consumeQuotedAttributeValue(char delim)
                                  throws SAXException,
                                         IOException
Parameters:
delim -
Throws:
SAXException
IOException

fatalIfAttributeExists

private void fatalIfAttributeExists(String name)
                             throws SAXException,
                                    IOException
Parameters:
name -
Throws:
SAXException
IOException

isNameStart

private boolean isNameStart(char c)
Parameters:
c -
Returns:

isNameChar

private boolean isNameChar(char c)
Parameters:
c -
Returns:

consumeEndTag

private void consumeEndTag()
                    throws SAXException,
                           IOException
Throws:
SAXException
IOException

cdataStateEnd

private void cdataStateEnd(String gi)
                    throws SAXException,
                           IOException
Parameters:
gi -
Throws:
SAXException
IOException

nextAfterZeroOrMoreWhiteSpace

private char nextAfterZeroOrMoreWhiteSpace()
                                    throws SAXException,
                                           IOException
Returns:
Throws:
SAXException
IOException

consumePI

private void consumePI()
                throws SAXException,
                       IOException
Throws:
SAXException
IOException

consumeMarkupDecl

private void consumeMarkupDecl()
                        throws SAXException,
                               IOException
Throws:
SAXException
IOException

consumeDoctype

private void consumeDoctype()
                     throws SAXException,
                            IOException
Throws:
SAXException
IOException

sawHtml5Doctype

private void sawHtml5Doctype()
                      throws SAXException
Throws:
SAXException

checkPublicAndSystemIds

private void checkPublicAndSystemIds(String publicId,
                                     String systemId)
                              throws SAXException,
                                     IOException
Parameters:
publicId -
systemId -
Throws:
SAXException
IOException

unescapedStringUntil

private String unescapedStringUntil(char delim)
                             throws SAXException,
                                    IOException
Parameters:
delim -
Returns:
Throws:
SAXException
IOException

consumeCaseInsensitiveAsciiLetterString

private boolean consumeCaseInsensitiveAsciiLetterString(char[] str)
                                                 throws SAXException,
                                                        IOException
Throws:
SAXException
IOException

consumeComment

private char consumeComment()
                     throws SAXException,
                            IOException
Throws:
SAXException
IOException

nextMayEnd

private char nextMayEnd()
                 throws SAXException,
                        IOException
Throws:
SAXException
IOException

warnAboutPrivateUseChar

private void warnAboutPrivateUseChar()
                              throws SAXException
Throws:
SAXException

isPrivateUse

private boolean isPrivateUse(char c)

isAstralPrivateUse

private boolean isAstralPrivateUse(int c)

isNonCharacter

private boolean isNonCharacter(int c)
Parameters:
c -
Returns:

isForbidden

private boolean isForbidden(char c)
Parameters:
c -
Returns:

flushChars

private void flushChars()
                 throws SAXException,
                        IOException
Throws:
SAXException
IOException

fatal

private void fatal(String message)
            throws SAXException
Throws:
SAXException
SAXParseException

err

private void err(String message)
          throws SAXException
Parameters:
message -
Throws:
SAXException

warn

private void warn(String message)
           throws SAXException
Parameters:
message -
Throws:
SAXException

getPublicId

public String getPublicId()
Specified by:
getPublicId in interface Locator
See Also:
Locator.getPublicId()

getSystemId

public String getSystemId()
Specified by:
getSystemId in interface Locator
See Also:
Locator.getSystemId()

getLineNumber

public int getLineNumber()
Specified by:
getLineNumber in interface Locator
See Also:
Locator.getLineNumber()

getColumnNumber

public int getColumnNumber()
Specified by:
getColumnNumber in interface Locator
See Also:
Locator.getColumnNumber()

getFeature

public boolean getFeature(String key)
                   throws SAXNotRecognizedException,
                          SAXNotSupportedException
Specified by:
getFeature in interface XMLReader
Throws:
SAXNotRecognizedException
SAXNotSupportedException
See Also:
XMLReader.getFeature(java.lang.String)

setFeature

public void setFeature(String key,
                       boolean value)
                throws SAXNotRecognizedException,
                       SAXNotSupportedException
Specified by:
setFeature in interface XMLReader
Throws:
SAXNotRecognizedException
SAXNotSupportedException
See Also:
XMLReader.setFeature(java.lang.String, boolean)

getProperty

public Object getProperty(String key)
                   throws SAXNotRecognizedException,
                          SAXNotSupportedException
Specified by:
getProperty in interface XMLReader
Throws:
SAXNotRecognizedException
SAXNotSupportedException
See Also:
XMLReader.getProperty(java.lang.String)

setProperty

public void setProperty(String key,
                        Object value)
                 throws SAXNotRecognizedException,
                        SAXNotSupportedException
Specified by:
setProperty in interface XMLReader
Throws:
SAXNotRecognizedException
SAXNotSupportedException
See Also:
XMLReader.setProperty(java.lang.String, java.lang.Object)

setEntityResolver

public void setEntityResolver(EntityResolver entityResolver)
Specified by:
setEntityResolver in interface XMLReader
See Also:
XMLReader.setEntityResolver(org.xml.sax.EntityResolver)

getEntityResolver

public EntityResolver getEntityResolver()
Specified by:
getEntityResolver in interface XMLReader
See Also:
XMLReader.getEntityResolver()

setDTDHandler

public void setDTDHandler(DTDHandler handler)
Specified by:
setDTDHandler in interface XMLReader
See Also:
XMLReader.setDTDHandler(org.xml.sax.DTDHandler)

getDTDHandler

public DTDHandler getDTDHandler()
Specified by:
getDTDHandler in interface XMLReader
See Also:
XMLReader.getDTDHandler()

setContentHandler

public void setContentHandler(ContentHandler ch)
Specified by:
setContentHandler in interface XMLReader
See Also:
XMLReader.setContentHandler(org.xml.sax.ContentHandler)

getContentHandler

public ContentHandler getContentHandler()
Specified by:
getContentHandler in interface XMLReader
See Also:
XMLReader.getContentHandler()

setErrorHandler

public void setErrorHandler(ErrorHandler eh)
Specified by:
setErrorHandler in interface XMLReader
See Also:
XMLReader.setErrorHandler(org.xml.sax.ErrorHandler)

getErrorHandler

public ErrorHandler getErrorHandler()
Specified by:
getErrorHandler in interface XMLReader
See Also:
XMLReader.getErrorHandler()

parse

public void parse(InputSource is)
           throws IOException,
                  SAXException
Specified by:
parse in interface XMLReader
Throws:
IOException
SAXException
See Also:
XMLReader.parse(org.xml.sax.InputSource)

streamSetup

private void streamSetup(InputSource is)
                  throws SAXException,
                         IOException
Parameters:
is -
swallowBom -
Throws:
IOException
SAXException

cannotDetermineEncoding

private void cannotDetermineEncoding()
                              throws IOException
Throws:
IOException

draconianInputStreamReader

private Reader draconianInputStreamReader(String encoding,
                                          InputStream stream,
                                          boolean requireAsciiSuperset)
                                   throws SAXException
Throws:
SAXException

parse

public void parse(String url)
           throws IOException,
                  SAXException
Specified by:
parse in interface XMLReader
Throws:
IOException
SAXException
See Also:
XMLReader.parse(java.lang.String)

setEncoding

void setEncoding(String enc)
           throws SAXException
Parameters:
enc -
Throws:
SAXException

setNonWhiteSpaceAllowed

void setNonWhiteSpaceAllowed(boolean allow)

getDoctypeMode

public int getDoctypeMode()
Returns the doctypeMode.

Returns:
the doctypeMode

setDoctypeMode

public void setDoctypeMode(int doctypeMode)
Sets the doctypeMode.

Parameters:
doctypeMode - the doctypeMode to set

getDoctypeHandler

public DoctypeHandler getDoctypeHandler()
Returns the doctypeHandler.

Returns:
the doctypeHandler

setDoctypeHandler

public void setDoctypeHandler(DoctypeHandler doctypeHandler)
Sets the doctypeHandler.

Parameters:
doctypeHandler - the doctypeHandler to set

refireStart

public void refireStart()
                 throws SAXException
Throws:
SAXException