fi.iki.hsivonen.util
Class MetadataExtractor

java.lang.Object
  extended by fi.iki.hsivonen.util.MetadataExtractor

public class MetadataExtractor
extends Object

Author:
hsivonen, ykk

Field Summary
private static char ANY
           
private static Pattern charsetPat
           
private  String contentType
           
private static boolean DEBUG
          Whether debug information is printed.
private static char DIGIT
           
private  Document doc
           
private  String encoding
           
private static String[] extensions
           
private static String[] extensionTypes
           
private  File file
           
private  String fileName
           
private  boolean guessedContentType
           
private  boolean hasCSSCharset
           
private static char[][] magicNumbers
           
private static String[] magicTypes
           
private static int MAX_MAGIC_LENGTH
           
private static String[] mimeTypeNames
           
private static String[] mimeTypes
           
private  String title
           
 
Constructor Summary
MetadataExtractor(File f)
           
MetadataExtractor(File f, String fileName)
          Creates a new instance of MetadataExtractor
 
Method Summary
private  void checkEncoding()
           
private  boolean couldBeUTF8()
           
private  void determineTypeFromExtension()
           
private  void determineTypeFromMagic()
           
private  void determineTypeFromXML()
           
private  void determineTypeFromZip()
           
private  void extractMetaFromHTML()
           
private  void extractMetaFromPDF()
           
 String getContentType()
          Getter for property contentType.
 String getContentTypeWithParams()
           
 Document getDOM()
           
 String getEncoding()
          Getter for property encoding.
 boolean getGuessedContentType()
          Getter for property guessedContentType.
static String[] getMimeTypeNames()
           
static String[] getMimeTypes()
           
 String getTitle()
           
private  void guessEncoding()
           
static void main(String[] args)
           
private  String normalizeWhiteSpace(String text)
           
private  void setTitle(String title)
           
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

DEBUG

private static boolean DEBUG
Whether debug information is printed.


charsetPat

private static Pattern charsetPat

file

private File file

contentType

private String contentType

title

private String title

doc

private Document doc

encoding

private String encoding

fileName

private String fileName

hasCSSCharset

private boolean hasCSSCharset

guessedContentType

private boolean guessedContentType

ANY

private static final char ANY
See Also:
Constant Field Values

DIGIT

private static final char DIGIT
See Also:
Constant Field Values

MAX_MAGIC_LENGTH

private static final int MAX_MAGIC_LENGTH
See Also:
Constant Field Values

magicNumbers

private static char[][] magicNumbers

magicTypes

private static String[] magicTypes

extensions

private static String[] extensions

extensionTypes

private static String[] extensionTypes

mimeTypes

private static String[] mimeTypes

mimeTypeNames

private static String[] mimeTypeNames
Constructor Detail

MetadataExtractor

public MetadataExtractor(File f)
                  throws IOException
Throws:
IOException

MetadataExtractor

public MetadataExtractor(File f,
                         String fileName)
                  throws IOException
Creates a new instance of MetadataExtractor

Throws:
IOException
Method Detail

main

public static void main(String[] args)
                 throws Exception
Throws:
Exception

checkEncoding

private void checkEncoding()
                    throws IOException
Throws:
IOException

determineTypeFromMagic

private void determineTypeFromMagic()
                             throws IOException
Throws:
IOException

guessEncoding

private void guessEncoding()
                    throws IOException
Throws:
IOException

couldBeUTF8

private boolean couldBeUTF8()
                     throws IOException
Throws:
IOException

determineTypeFromXML

private void determineTypeFromXML()
                           throws IOException
Throws:
IOException

determineTypeFromZip

private void determineTypeFromZip()
                           throws IOException
Throws:
IOException

determineTypeFromExtension

private void determineTypeFromExtension()

extractMetaFromHTML

private void extractMetaFromHTML()
                          throws IOException
Throws:
IOException

extractMetaFromPDF

private void extractMetaFromPDF()
                         throws IOException
Throws:
IOException

normalizeWhiteSpace

private String normalizeWhiteSpace(String text)

setTitle

private void setTitle(String title)

getContentType

public String getContentType()
Getter for property contentType.

Returns:
Value of property contentType.

getEncoding

public String getEncoding()
Getter for property encoding.

Returns:
Value of property encoding.

getContentTypeWithParams

public String getContentTypeWithParams()

getGuessedContentType

public boolean getGuessedContentType()
Getter for property guessedContentType. If guessedContentType is true, the content type was derived from the file name and could be wrong. If guessedContentType is false, the content type was derived from the magic bytes of the file and that information can be trusted.

Returns:
the value of guessedContentType field

getTitle

public String getTitle()

getDOM

public Document getDOM()

getMimeTypes

public static String[] getMimeTypes()

getMimeTypeNames

public static String[] getMimeTypeNames()