001    /*
002     * Copyright (c) 2003, 2004 Henri Sivonen, Yrjö Kari-Koskinen
003     *
004     * Permission is hereby granted, free of charge, to any person obtaining a 
005     * copy of this software and associated documentation files (the "Software"), 
006     * to deal in the Software without restriction, including without limitation 
007     * the rights to use, copy, modify, merge, publish, distribute, sublicense, 
008     * and/or sell copies of the Software, and to permit persons to whom the 
009     * Software is furnished to do so, subject to the following conditions:
010     *
011     * The above copyright notice and this permission notice shall be included in 
012     * all copies or substantial portions of the Software.
013     *
014     * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 
015     * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 
016     * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 
017     * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 
018     * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 
019     * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 
020     * DEALINGS IN THE SOFTWARE.
021     */
022    
023    package fi.iki.hsivonen.util;
024    import java.io.BufferedInputStream;
025    import java.io.BufferedReader;
026    import java.io.File;
027    import java.io.FileInputStream;
028    import java.io.IOException;
029    import java.io.InputStreamReader;
030    import java.nio.charset.CharacterCodingException;
031    import java.nio.charset.Charset;
032    import java.nio.charset.CharsetDecoder;
033    import java.nio.charset.CodingErrorAction;
034    import java.util.Arrays;
035    import java.util.regex.Matcher;
036    import java.util.regex.Pattern;
037    import java.util.zip.ZipEntry;
038    import java.util.zip.ZipException;
039    import java.util.zip.ZipFile;
040    
041    import javax.xml.parsers.DocumentBuilder;
042    
043    import multivalent.std.adaptor.pdf.Dict;
044    import multivalent.std.adaptor.pdf.PDFReader;
045    
046    import org.w3c.dom.Document;
047    import org.w3c.dom.Element;
048    import org.w3c.dom.Node;
049    import org.w3c.dom.NodeList;
050    import org.xml.sax.InputSource;
051    import org.xml.sax.XMLReader;
052    
053    import fi.iki.hsivonen.gnu.xml.pipeline.DomConsumer;
054    import fi.iki.hsivonen.xml.DOMUtils;
055    import fi.iki.hsivonen.xml.SAXUtils;
056    
057    /**
058     *
059     * @author hsivonen
060     * @author ykk
061     */
062    public class MetadataExtractor {
063            /** Do we print the debug information */
064            private static boolean DEBUG = false;
065    
066            private static Pattern charsetPat =
067                    Pattern.compile("^.*charset\\s*=\\s*((?:-|\\w)+).*$");
068    
069            private File file;
070            private String contentType;
071            private String title;
072            private Document doc;
073            private String encoding;
074            private String fileName;
075            private boolean hasCSSCharset = false;
076            private boolean guessedContentType = false;
077    
078            private static final char ANY = '\uE000';
079            private static final char DIGIT = '\uE001';
080    
081            private static final int MAX_MAGIC_LENGTH = 12;
082    
083            private static char[][] magicNumbers = {
084                    {'\u0089', 'P', 'N', 'G', '\r', '\n', '\u001A', '\n'}, // PNG 1.0 spec
085                    {'\u008a', 'M', 'N', 'G', '\r', '\n', '\u001A', '\n'}, // MNG spec
086                    {'G', 'I', 'F', '8', '7', 'a'}, // BSD /etc/magic
087                    {'G', 'I', 'F', '8', '9', 'a'}, // BSD /etc/magic
088                    {'%', 'P', 'D', 'F', '-', DIGIT, '.', DIGIT},
089                    {'%', '!'},
090                    {'P', 'K', '\u0003', '\u0004'},
091                    {'F', 'W', 'S'}, // observation and BSD /etc/magic
092                    {'.', 'R', 'M', 'F', '\u0000'}, // observation
093                    {'{', '\\', 'r', 't', 'f'},
094                    {'\u0000', '\u0000', '\u0001', '\u00BA'},
095                    {'\u0000', '\u0000', '\u0001', '\u00B3'},
096                    {ANY, ANY, ANY, ANY, ANY, ANY, 'J', 'F', 'I', 'F', '\u0000'}, // observation, BSD /etc/magic and the JFIF spec http://www.w3.org/Graphics/JPEG/jfif3.pdf
097                    {ANY, ANY, ANY, ANY, 'm', 'o', 'o', 'v'}, // /usr/share/misc/file/magic
098                    {ANY, ANY, ANY, ANY, 'm', 'd', 'a', 't'}, // /usr/share/misc/file/magic
099                    {ANY, ANY, ANY, ANY, 'f', 'r', 'e', 'e'}, // /usr/share/misc/file/magic
100                    {ANY, ANY, ANY, ANY, 'j', 'u', 'n', 'k'}, // /usr/share/misc/file/magic
101                    {ANY, ANY, ANY, ANY, 'p', 'n', 'o', 't'}, // /usr/share/misc/file/magic
102                    {ANY, ANY, ANY, ANY, 's', 'k', 'i', 'p'}, // /usr/share/misc/file/magic
103                    {ANY, ANY, ANY, ANY, 'w', 'i', 'd', 'e'}, // /usr/share/misc/file/magic
104                    {ANY, ANY, ANY, ANY, 'p', 'i', 'c', 't'}, // /usr/share/misc/file/magic
105                    {ANY, ANY, ANY, ANY, 'f', 't', 'y', 'p'}, // observation
106                    {'R', 'I', 'F', 'F', ANY, ANY, ANY, ANY, 'A', 'V', 'I'}, // /usr/share/file/magic.mime
107                    {'R', 'I', 'F', 'F', ANY, ANY, ANY, ANY, 'W', 'A', 'V', 'E'}, // /usr/share/file/magic.mime
108                    {'\u00FF', '\u00FA'}, // /usr/share/file/magic.mime
109                    {'\u00FF', '\u00FB'}, // /usr/share/file/magic.mime
110                    {'I', 'D', '3'}, // /usr/share/file/magic.mime
111                    {'O', 'g', 'g', 'S'}, // /usr/share/file/magic.mime
112                    {'B', 'M'}, // /usr/share/file/magic.mime
113                    {'S', 't', 'u', 'f', 'f', 'I', 't'}, // observation
114                    {'S', 'I', 'T', '!'}, // observation
115                    {'\u00FE', '\u00FF'}, // big endian BOM
116                    {'\u00FF', '\u00FE'}, // little endian BOM
117                    {'@', 'c', 'h', 'a', 'r', 's', 'e', 't'},
118                    {'I', 'I'},
119                    {'M', 'M'},
120                    {'\u0000', '\u0000', '\u0000', '\u000C', '\u006A', '\u0050', '\u0020', '\u0020', '\r', '\n', '\u0087', '\n'} // http://www.iana.org/assignments/media-types/image/jp2
121            };
122    
123    
124        private static String[] magicTypes =
125            {
126                "image/png",
127                "video/x-mng",
128                "image/gif",
129                "image/gif",
130                "application/pdf",
131                "application/postscript",
132                "application/zip",
133                "application/x-shockwave-flash",
134                "audio/x-pn-realaudio",
135                "text/rtf",
136                "video/mpeg",
137                "video/mpeg",
138                "image/jpeg",
139                "video/quicktime",
140                "video/quicktime",
141                "video/quicktime",
142                "video/quicktime",
143                "video/quicktime",
144                "video/quicktime",
145                "video/quicktime",
146                "video/quicktime",
147                "video/mp4",
148                "video/x-msvideo",
149                "audio/x-wav",
150                "audio/mpeg",
151                "audio/mpeg",
152                "audio/mpeg",
153                "application/ogg",
154                "image/x-bmp",
155                "application/x-stuffit",
156                "application/x-stuffit",
157                "utf-16",
158            // see determineTypeFromMagic
159            "utf-16", // see determineTypeFromMagic
160            "CSS_CHARSET", // see determineTypeFromMagic
161            "image/tiff", "image/tiff", "image/jp2" };
162    
163        // must be sorted lexicographically
164        private static String[] extensions =
165            {
166                "asf",
167                "css",
168                "doc",
169                "htm",
170                "html",
171                "ppt",
172                "txt",
173                "wma",
174                "wmv",
175                "xls" };
176    
177        private static String[] extensionTypes =
178            {
179                "video/x-ms-asf",
180                "text/css",
181                "application/msword",
182                "text/html",
183                "text/html",
184                "application/vnd.ms-powerpoint",
185                "text/plain",
186                "audio/x-ms-wma",
187                "video/x-ms-wmv",
188                "application/vnd.ms-excel" };
189    
190        private static String[] mimeTypes = {
191            // teksti
192            "text/plain",
193                "text/html",
194                "application/xhtml+xml",
195                "application/xml",
196                "text/css",
197                "text/rtf",
198                "application/pdf",
199                "application/postscript",
200    
201            // toimisto-ohjelmat
202            "application/msword",
203                "application/vnd.ms-excel",
204                "application/vnd.ms-powerpoint",
205                "application/vnd.sun.xml.writer",
206                "application/vnd.sun.xml.calc",
207                "application/vnd.sun.xml.impress",
208                "application/vnd.sun.xml.draw",
209    
210            // kuvat
211            "image/jpeg",
212                "image/gif",
213                "image/png",
214                "image/x-bmp",
215                "image/svg+xml",
216                "image/tiff",
217                "image/jp2",
218    
219            // video
220            "video/mpeg",
221                "video/mp4",
222                "video/x-msvideo",
223                "video/x-ms-asf",
224                "video/x-ms-wmv",
225                "audio/x-pn-realaudio",
226                "video/quicktime",
227                "video/x-mng",
228    
229            // ääni
230            "audio/mpeg", "audio/x-wav", "audio/x-ms-wma", "application/ogg",
231    
232            // pakkaustyökalut
233            "application/zip",
234                "application/x-stuffit",
235                "application/x-shockwave-flash" };
236    
237        private static String[] mimeTypeNames = {
238            // teksti
239            "Tekstidokumentti",
240                "HTML-dokumentti",
241                "XHTML-dokumentti",
242                "XML-dokumentti",
243                "Tyylisivu (CSS)",
244                "RTF-dokumentti",
245                "PDF-dokumentti",
246                "PostScript-dokumentti",
247    
248            // toimisto-ohjelmat
249            "MSOffice Word -dokumentti",
250                "MSOffice Excel -työkirja",
251                "MSOffice Powerpoint -esitys",
252                "OpenOffice Writer -dokumentti",
253                "OpenOffice Calc -työkirja",
254                "OpenOffice Impress -esitys",
255                "OpenOffice Impress -piirros",
256    
257            // kuvat
258            "JPEG-kuva",
259                "GIF-kuva",
260                "PNG-kuva",
261                "BMP-kuva",
262                "SVG-kuva",
263                "TIFF-kuva",
264                "JPEG 2000 -kuva",
265    
266            // video
267            "MPEG-video",
268                "MPEG4-video",
269                "AVI-video",
270                "ASF-video",
271                "WMV-video",
272                "Real Media -video",
273                "Quicktime-video",
274                "MNG-animaatio",
275    
276            // ääni
277            "MP3-ääni", "WAV-ääni", "WMA-ääni", "OGG Vorbis -ääni",
278    
279            // pakkaustyökalut
280            "ZIP-paketti", "StuffIt-paketti", "Flash-animaatio" };
281    
282        public static void main(String[] args) throws Exception {
283            if (args.length < 1) {
284                System.out.println("Usage: java MetadataExtractor filename");
285                System.exit(0);
286            }
287    
288            File f = new File(args[0]);
289            MetadataExtractor me = new MetadataExtractor(f);
290            System.out.println(
291                "content-type:\t"
292                    + me.getContentType()
293                    + (me.getGuessedContentType() ? " (guessed)" : ""));
294                    System.out.println("encoding:\t\t" + me.getEncoding());
295                    System.out.println("title:\t\t" + me.getTitle());
296            if (DEBUG && me.getTitle() != null) {
297                for (int i = 0; i < me.getTitle().length(); i++) {
298                    System.out.println(
299                        Integer.toHexString((int)me.getTitle().charAt(i)));
300                }
301            }
302        }
303    
304        public MetadataExtractor(File f) throws IOException {
305            this(f, null);
306        }
307    
308        /** Creates a new instance of MetadataExtractor */
309        public MetadataExtractor(File f, String fileName) throws IOException {
310            DEBUG = (System.getProperty("DebugMetadataExtractor") != null);
311            this.file = f;
312            if (fileName == null) {
313                this.fileName = f.getName();
314            } else {
315                this.fileName = fileName;
316            }
317            this.contentType = null;
318            this.determineTypeFromMagic();
319            if ("application/zip".equals(this.contentType)) {
320                this.determineTypeFromZip();
321            }
322            if (this.contentType == null) {
323                this.determineTypeFromXML();
324            }
325            if (this.contentType == null) {
326                this.determineTypeFromExtension();
327            }
328            this.checkEncoding();
329            if ("application/pdf".equals(this.contentType)) {
330                this.extractMetaFromPDF();
331            }
332            if (this.contentType != null
333                && this.contentType.startsWith("text/html")) {
334                this.extractMetaFromHTML();
335            }
336        }
337    
338        private void checkEncoding() throws IOException {
339            if (!("text/css".equals(this.contentType)
340                || "text/html".equals(this.contentType)
341                || "text/plain".equals(this.contentType))) {
342                return;
343            }
344            if ("text/css".equals(this.contentType) && this.hasCSSCharset) {
345                return;
346            }
347            if (this.encoding == null) {
348                this.guessEncoding();
349            }
350    
351            // ei haluta contentType-stringiin vielä tässä vaiheessa,
352            //sillä muuten käyttäjä ei voi valita contentTypeä
353    
354            //this.contentType = this.contentType + "; charset=" + this.encoding;
355        }
356    
357        private void determineTypeFromMagic() throws IOException {
358            byte[] buf = new byte[MAX_MAGIC_LENGTH];
359            FileInputStream stream = new FileInputStream(this.file);
360            int numBytes = stream.read(buf);
361            stream.close();
362    
363            if (DEBUG) {
364                for (int i = 0; i < buf.length; i++) {
365                    System.out.println(Integer.toHexString(((int)buf[i]) & 0xFF));
366                }
367            }
368    
369            for (int i = 0; i < magicNumbers.length; i++) {
370                boolean match = true;
371                for (int j = 0; j < magicNumbers[i].length && j < numBytes; j++) {
372                    if (magicNumbers[i][j] == ANY) {
373                        continue;
374                    } else if (
375                        magicNumbers[i][j] == DIGIT
376                            && buf[j] < '9'
377                            && buf[j] > '0') {
378                        continue;
379                    } else if (
380                        magicNumbers[i][j] == (char) (((int)buf[j]) & 0xFF)) {
381                        continue;
382                    } else {
383                        match = false;
384                        break;
385                    }
386                }
387                if (match) {
388                    if ("utf-16".equals(magicTypes[i])) {
389                        this.encoding = "utf-16";
390                    } else if ("CSS_CHARSET".equals(magicTypes[i])) {
391                        this.hasCSSCharset = true;
392                    } else {
393                        this.contentType = magicTypes[i];
394                    }
395                    return;
396                }
397            }
398        }
399    
400        private void guessEncoding() throws IOException {
401            int b;
402            boolean couldBeASCII = true;
403            boolean couldBeISO = true;
404            BufferedInputStream in =
405                new BufferedInputStream(new FileInputStream(this.file));
406            try {
407                while (((b = in.read()) != -1) && (couldBeASCII || couldBeISO)) {
408                    if (b > 0x7F) {
409                        couldBeASCII = false;
410                        if (b < 0xA0) {
411                            couldBeISO = false;
412                        }
413                    }
414                }
415            } finally {
416                in.close();
417            }
418            if (couldBeASCII) {
419                this.encoding = "us-ascii";
420                return;
421            }
422            if (this.couldBeUTF8()) {
423                this.encoding = "utf-8";
424                return;
425            }
426            if (couldBeISO) {
427                this.encoding = "iso-8859-1";
428                return;
429            }
430            this.encoding = "windows-1252";
431            return;
432        }
433    
434        private boolean couldBeUTF8() throws IOException {
435            BufferedReader in;
436            CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder();
437            decoder.onMalformedInput(CodingErrorAction.REPORT);
438            decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
439            in =
440                new BufferedReader(
441                    new InputStreamReader(new FileInputStream(this.file), decoder));
442            try {
443                while (in.read() != -1) {}
444            } catch (CharacterCodingException e) {
445                return false;
446            } finally {
447                in.close();
448            }
449            return true;
450        }
451    
452        private void determineTypeFromXML() throws IOException {
453            DocumentBuilder builder = DOMUtils.newNonvalidatingDocumentBuilder();
454            // If the file can be parsed as XML without fatal errors, it is by
455            // definition an XML document. OTOH, if there are fatal errors, it
456            // by definition is not.
457            try {
458                InputSource is = new InputSource(new FileInputStream(this.file));
459                is.setSystemId("file:///foo");
460                this.doc = builder.parse(is);
461            } catch (Exception e) {
462                return;
463            }
464            // However, some HTML tag soup docs might by chance be well-formed.
465            Element root = doc.getDocumentElement();
466            if (root.getNamespaceURI() == null
467                && "HTML".equalsIgnoreCase(root.getNodeName())) {
468                this.contentType = "text/html";
469                return;
470            } else if (
471                "http://www.w3.org/1999/xhtml".equals(root.getNamespaceURI())
472                    && "html".equals(root.getLocalName())) {
473                loop : for (
474                    Node n = root.getFirstChild();
475                        n != null;
476                        n = n.getNextSibling()) {
477                    if ("http://www.w3.org/1999/xhtml".equals(n.getNamespaceURI())
478                        && "head".equals(n.getLocalName())) {
479                        for (Node m = n.getFirstChild();
480                            m != null;
481                            m = m.getNextSibling()) {
482                            if ("http://www.w3.org/1999/xhtml"
483                                .equals(m.getNamespaceURI())
484                                && "title".equals(m.getLocalName())) {
485                                this.setTitle(DOMUtils.textContent(m));
486                                break loop;
487                            }
488                        }
489                    }
490                }
491                this.contentType = "application/xhtml+xml";
492                return;
493            } else if (
494                "http://www.w3.org/2000/svg".equals(root.getNamespaceURI())
495                    && "svg".equals(root.getLocalName())) {
496                for (Node n = root.getFirstChild();
497                    n != null;
498                    n = n.getNextSibling()) {
499                    if ("http://www.w3.org/2000/svg".equals(n.getNamespaceURI())
500                        && "title".equals(n.getLocalName())) {
501                        this.setTitle(DOMUtils.textContent(n));
502                        break;
503                    }
504                }
505                this.contentType = "image/svg+xml";
506                return;
507            } else {
508                this.contentType = "application/xml";
509                return;
510            }
511        }
512    
513        private void determineTypeFromZip() throws IOException {
514            ZipFile zf;
515            try {
516                zf = new ZipFile(this.file);
517            } catch (ZipException e) {
518                // doesn't look like a zip file after all
519                this.contentType = null;
520                return;
521            }
522            // no support for encrypted files
523            Document metaDoc;
524            DocumentBuilder builder = DOMUtils.newNonvalidatingDocumentBuilder();
525            ZipEntry metaEntry;
526            if ((metaEntry = zf.getEntry("meta.xml")) != null) {
527                try {
528                    InputSource is = new InputSource(zf.getInputStream(metaEntry));
529                    is.setSystemId("file:///foo");
530                    metaDoc = builder.parse(is);
531                } catch (Exception e) {
532                    return;
533                }
534                Node titleElt =
535                    DOMUtils.findElement(
536                        metaDoc,
537                        "http://purl.org/dc/elements/1.1/",
538                        "title");
539                if (titleElt != null) {
540                    this.setTitle(DOMUtils.textContent(titleElt));
541                }
542            }
543            ZipEntry contentEntry;
544            if ((contentEntry = zf.getEntry("content.xml")) == null) {
545                // not an OOo file
546                return;
547            }
548            try {
549                InputSource is = new InputSource(zf.getInputStream(contentEntry));
550                is.setSystemId("file:///foo");
551                this.doc = builder.parse(is);
552            } catch (Exception e) {
553                return;
554            }
555            Element root = this.doc.getDocumentElement();
556            if ("http://www.w3.org/1998/Math/MathML"
557                .equals(root.getNamespaceURI())) {
558                this.contentType = "application/vnd.sun.xml.math";
559                return;
560            }
561            String docClass =
562                root.getAttributeNS("http://openoffice.org/2000/office", "class");
563            if ("text".equals(docClass)) {
564                this.contentType = "application/vnd.sun.xml.writer";
565                // If we didn't get a title from the metadata, 
566                // let's use the first heading
567                if (this.title == null) {
568                    Node headingElt =
569                        DOMUtils.findElement(
570                            root,
571                            "http://openoffice.org/2000/text",
572                            "h");
573                    if (headingElt != null) {
574                        this.setTitle(DOMUtils.textContent(headingElt));
575                    }
576                }
577                return;
578            } else if ("text-global".equals(docClass)) {
579                this.contentType = "application/vnd.sun.xml.writer.global";
580                return;
581            } else if ("spreadsheet".equals(docClass)) {
582                this.contentType = "application/vnd.sun.xml.calc";
583                return;
584            } else if ("drawing".equals(docClass)) {
585                this.contentType = "application/vnd.sun.xml.draw";
586                return;
587            } else if ("presentation".equals(docClass)) {
588                this.contentType = "application/vnd.sun.xml.impress";
589                // If we didn't get a title from the metadata, 
590                // let's use the first text box
591                if (this.title == null) {
592                    Node headingElt =
593                        DOMUtils.findElement(
594                            root,
595                            "http://openoffice.org/2000/text",
596                            "p");
597                    if (headingElt != null) {
598                        this.setTitle(DOMUtils.textContent(headingElt));
599                    }
600                }
601                return;
602            } else if ("chart".equals(docClass)) {
603                this.contentType = "application/vnd.sun.xml.calc";
604                // XXX is this OK?
605                return;
606            }
607        }
608    
609        private void determineTypeFromExtension() {
610            int dotIndex = this.fileName.lastIndexOf(".");
611            if (dotIndex < 1) {
612                return;
613            }
614            String ext = this.fileName.substring(dotIndex + 1).toLowerCase();
615            int i = Arrays.binarySearch(extensions, ext);
616            if (i < 0) {
617                return;
618            }
619            this.contentType = extensionTypes[i];
620            this.guessedContentType = true;
621        }
622    
623        private void extractMetaFromHTML() throws IOException {
624            try {
625                boolean reparse = false;
626                XMLReader tagSoup = SAXUtils.newTagSoupXMLReader();
627                InputSource is = new InputSource(new FileInputStream(this.file));
628                is.setSystemId("file:///foo");
629                is.setEncoding(this.encoding);
630                DomConsumer builder = DOMUtils.newDomConsumer();
631                tagSoup.setContentHandler(builder.getContentHandler());
632                tagSoup.parse(is);
633                this.doc = builder.getDocument();
634                Element root = doc.getDocumentElement();
635    
636                NodeList nl =
637                    root.getElementsByTagNameNS(
638                        "http://www.w3.org/1999/xhtml",
639                        "meta");
640                int len = nl.getLength();
641                for (int i = 0; i < len; i++) {
642                    Element meta = (Element)nl.item(i);
643                    if ("content-type"
644                        .equalsIgnoreCase(meta.getAttribute("httpequiv"))) {
645                        String enc = meta.getAttribute("content");
646                        if (enc == null) {
647                            break;
648                        }
649                        enc = enc.toLowerCase();
650                        Matcher m = charsetPat.matcher(enc);
651                        if(m.matches()) {
652                                                    enc = m.group(1);
653                                                    if (enc == null) {
654                                                            break;
655                                                    }                       
656                        } else {
657                            break;
658                        }
659                        if (!(enc.startsWith("utf-")
660                            || enc.equals("iso-8859-1")
661                            || enc.equals("windows-1252")
662                            || enc.equals("us-ascii"))) {
663                            this.encoding = enc;
664                            reparse = true;
665                            break;
666                        }
667    
668                    }
669                }
670    
671                if (reparse) {
672                    is.setByteStream(new FileInputStream(this.file));
673                    is.setEncoding(this.encoding);
674                    tagSoup.parse(is);
675                    this.doc = builder.getDocument();
676                    root = doc.getDocumentElement();
677                }
678    
679                loop : for (
680                    Node n = root.getFirstChild();
681                        n != null;
682                        n = n.getNextSibling()) {
683                    if ("http://www.w3.org/1999/xhtml".equals(n.getNamespaceURI())
684                        && "head".equals(n.getLocalName())) {
685                        for (Node m = n.getFirstChild();
686                            m != null;
687                            m = m.getNextSibling()) {
688                            if ("http://www.w3.org/1999/xhtml"
689                                .equals(m.getNamespaceURI())
690                                && "title".equals(m.getLocalName())) {
691                                this.setTitle(DOMUtils.textContent(m));
692                                break loop;
693                            }
694                        }
695                    }
696                }
697            } catch (Exception e) {
698    System.err.println(e);
699            }
700        }
701    
702        private void extractMetaFromPDF() throws IOException {
703            try {
704                PDFReader reader = new PDFReader(this.file);
705                Dict info = reader.getInfo();
706                this.setTitle(reader.getObject(info.get("Title")).toString());
707            } catch (Exception e) {
708    
709            }
710        }
711    
712        private String normalizeWhiteSpace(String text) {
713            StringBuilder buf = new StringBuilder(text.length());
714            boolean lastIsWhitespace = true;
715            for (int i = 0; i < text.length(); i++) {
716                char c = text.charAt(i);
717                if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
718                    if (!lastIsWhitespace) {
719                        buf.append(' ');
720                        lastIsWhitespace = true;
721                    }
722                } else {
723                    buf.append(c);
724                    lastIsWhitespace = false;
725                }
726            }
727            if (buf.charAt(buf.length() - 1) == ' ') {
728                buf.deleteCharAt(buf.length() - 1);
729            }
730            return buf.toString();
731        }
732    
733        private void setTitle(String title) {
734            String collapsed = this.normalizeWhiteSpace(title);
735            if ("".equals(collapsed)) {
736                this.title = null;
737            } else {
738                this.title = collapsed;
739            }
740        }
741    
742        /** 
743         * Getter for property contentType.
744         * @return Value of property contentType.
745         */
746        public java.lang.String getContentType() {
747            return this.contentType;
748        }
749    
750        /** 
751         * Getter for property encoding
752         * @return Value of property encoding.
753         */
754        public java.lang.String getEncoding() {
755            return this.encoding;
756        }
757        
758        public String getContentTypeWithParams() {
759            if(this.encoding == null) {
760                return this.contentType;
761            } else {
762                return this.contentType + "; charset=" + this.encoding;
763            }
764        }
765    
766        /** 
767         * Getter for property guessedContentType. If guessedContentType
768         * is true, the content type was derived from the file name and it
769         * could be wrong. If guessedContentType is false, the content
770         * type was derived from the magic bytes of the file and you can
771         * trust on that information.
772         * @return the value of guessedContentType field
773         */
774        public boolean getGuessedContentType() {
775            return this.guessedContentType;
776        }
777    
778        public java.lang.String getTitle() {
779            return this.title;
780        }
781    
782        public Document getDOM() {
783            return this.doc;
784        }
785    
786        public static String[] getMimeTypes() {
787            return mimeTypes;
788        }
789    
790        public static String[] getMimeTypeNames() {
791            return mimeTypeNames;
792        }
793    }