001 /*
002 * Copyright (c) 2003, 2004 Henri Sivonen, Yrjö Kari-Koskinen
003 *
004 * Permission is hereby granted, free of charge, to any person obtaining a
005 * copy of this software and associated documentation files (the "Software"),
006 * to deal in the Software without restriction, including without limitation
007 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
008 * and/or sell copies of the Software, and to permit persons to whom the
009 * Software is furnished to do so, subject to the following conditions:
010 *
011 * The above copyright notice and this permission notice shall be included in
012 * all copies or substantial portions of the Software.
013 *
014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
020 * DEALINGS IN THE SOFTWARE.
021 */
022
023 package fi.iki.hsivonen.util;
024 import java.io.BufferedInputStream;
025 import java.io.BufferedReader;
026 import java.io.File;
027 import java.io.FileInputStream;
028 import java.io.IOException;
029 import java.io.InputStreamReader;
030 import java.nio.charset.CharacterCodingException;
031 import java.nio.charset.Charset;
032 import java.nio.charset.CharsetDecoder;
033 import java.nio.charset.CodingErrorAction;
034 import java.util.Arrays;
035 import java.util.regex.Matcher;
036 import java.util.regex.Pattern;
037 import java.util.zip.ZipEntry;
038 import java.util.zip.ZipException;
039 import java.util.zip.ZipFile;
040
041 import javax.xml.parsers.DocumentBuilder;
042
043 import multivalent.std.adaptor.pdf.Dict;
044 import multivalent.std.adaptor.pdf.PDFReader;
045
046 import org.w3c.dom.Document;
047 import org.w3c.dom.Element;
048 import org.w3c.dom.Node;
049 import org.w3c.dom.NodeList;
050 import org.xml.sax.InputSource;
051 import org.xml.sax.XMLReader;
052
053 import fi.iki.hsivonen.gnu.xml.pipeline.DomConsumer;
054 import fi.iki.hsivonen.xml.DOMUtils;
055 import fi.iki.hsivonen.xml.SAXUtils;
056
057 /**
058 *
059 * @author hsivonen
060 * @author ykk
061 */
062 public class MetadataExtractor {
063 /** Do we print the debug information */
064 private static boolean DEBUG = false;
065
066 private static Pattern charsetPat =
067 Pattern.compile("^.*charset\\s*=\\s*((?:-|\\w)+).*$");
068
069 private File file;
070 private String contentType;
071 private String title;
072 private Document doc;
073 private String encoding;
074 private String fileName;
075 private boolean hasCSSCharset = false;
076 private boolean guessedContentType = false;
077
078 private static final char ANY = '\uE000';
079 private static final char DIGIT = '\uE001';
080
081 private static final int MAX_MAGIC_LENGTH = 12;
082
083 private static char[][] magicNumbers = {
084 {'\u0089', 'P', 'N', 'G', '\r', '\n', '\u001A', '\n'}, // PNG 1.0 spec
085 {'\u008a', 'M', 'N', 'G', '\r', '\n', '\u001A', '\n'}, // MNG spec
086 {'G', 'I', 'F', '8', '7', 'a'}, // BSD /etc/magic
087 {'G', 'I', 'F', '8', '9', 'a'}, // BSD /etc/magic
088 {'%', 'P', 'D', 'F', '-', DIGIT, '.', DIGIT},
089 {'%', '!'},
090 {'P', 'K', '\u0003', '\u0004'},
091 {'F', 'W', 'S'}, // observation and BSD /etc/magic
092 {'.', 'R', 'M', 'F', '\u0000'}, // observation
093 {'{', '\\', 'r', 't', 'f'},
094 {'\u0000', '\u0000', '\u0001', '\u00BA'},
095 {'\u0000', '\u0000', '\u0001', '\u00B3'},
096 {ANY, ANY, ANY, ANY, ANY, ANY, 'J', 'F', 'I', 'F', '\u0000'}, // observation, BSD /etc/magic and the JFIF spec http://www.w3.org/Graphics/JPEG/jfif3.pdf
097 {ANY, ANY, ANY, ANY, 'm', 'o', 'o', 'v'}, // /usr/share/misc/file/magic
098 {ANY, ANY, ANY, ANY, 'm', 'd', 'a', 't'}, // /usr/share/misc/file/magic
099 {ANY, ANY, ANY, ANY, 'f', 'r', 'e', 'e'}, // /usr/share/misc/file/magic
100 {ANY, ANY, ANY, ANY, 'j', 'u', 'n', 'k'}, // /usr/share/misc/file/magic
101 {ANY, ANY, ANY, ANY, 'p', 'n', 'o', 't'}, // /usr/share/misc/file/magic
102 {ANY, ANY, ANY, ANY, 's', 'k', 'i', 'p'}, // /usr/share/misc/file/magic
103 {ANY, ANY, ANY, ANY, 'w', 'i', 'd', 'e'}, // /usr/share/misc/file/magic
104 {ANY, ANY, ANY, ANY, 'p', 'i', 'c', 't'}, // /usr/share/misc/file/magic
105 {ANY, ANY, ANY, ANY, 'f', 't', 'y', 'p'}, // observation
106 {'R', 'I', 'F', 'F', ANY, ANY, ANY, ANY, 'A', 'V', 'I'}, // /usr/share/file/magic.mime
107 {'R', 'I', 'F', 'F', ANY, ANY, ANY, ANY, 'W', 'A', 'V', 'E'}, // /usr/share/file/magic.mime
108 {'\u00FF', '\u00FA'}, // /usr/share/file/magic.mime
109 {'\u00FF', '\u00FB'}, // /usr/share/file/magic.mime
110 {'I', 'D', '3'}, // /usr/share/file/magic.mime
111 {'O', 'g', 'g', 'S'}, // /usr/share/file/magic.mime
112 {'B', 'M'}, // /usr/share/file/magic.mime
113 {'S', 't', 'u', 'f', 'f', 'I', 't'}, // observation
114 {'S', 'I', 'T', '!'}, // observation
115 {'\u00FE', '\u00FF'}, // big endian BOM
116 {'\u00FF', '\u00FE'}, // little endian BOM
117 {'@', 'c', 'h', 'a', 'r', 's', 'e', 't'},
118 {'I', 'I'},
119 {'M', 'M'},
120 {'\u0000', '\u0000', '\u0000', '\u000C', '\u006A', '\u0050', '\u0020', '\u0020', '\r', '\n', '\u0087', '\n'} // http://www.iana.org/assignments/media-types/image/jp2
121 };
122
123
124 private static String[] magicTypes =
125 {
126 "image/png",
127 "video/x-mng",
128 "image/gif",
129 "image/gif",
130 "application/pdf",
131 "application/postscript",
132 "application/zip",
133 "application/x-shockwave-flash",
134 "audio/x-pn-realaudio",
135 "text/rtf",
136 "video/mpeg",
137 "video/mpeg",
138 "image/jpeg",
139 "video/quicktime",
140 "video/quicktime",
141 "video/quicktime",
142 "video/quicktime",
143 "video/quicktime",
144 "video/quicktime",
145 "video/quicktime",
146 "video/quicktime",
147 "video/mp4",
148 "video/x-msvideo",
149 "audio/x-wav",
150 "audio/mpeg",
151 "audio/mpeg",
152 "audio/mpeg",
153 "application/ogg",
154 "image/x-bmp",
155 "application/x-stuffit",
156 "application/x-stuffit",
157 "utf-16",
158 // see determineTypeFromMagic
159 "utf-16", // see determineTypeFromMagic
160 "CSS_CHARSET", // see determineTypeFromMagic
161 "image/tiff", "image/tiff", "image/jp2" };
162
163 // must be sorted lexicographically
164 private static String[] extensions =
165 {
166 "asf",
167 "css",
168 "doc",
169 "htm",
170 "html",
171 "ppt",
172 "txt",
173 "wma",
174 "wmv",
175 "xls" };
176
177 private static String[] extensionTypes =
178 {
179 "video/x-ms-asf",
180 "text/css",
181 "application/msword",
182 "text/html",
183 "text/html",
184 "application/vnd.ms-powerpoint",
185 "text/plain",
186 "audio/x-ms-wma",
187 "video/x-ms-wmv",
188 "application/vnd.ms-excel" };
189
190 private static String[] mimeTypes = {
191 // teksti
192 "text/plain",
193 "text/html",
194 "application/xhtml+xml",
195 "application/xml",
196 "text/css",
197 "text/rtf",
198 "application/pdf",
199 "application/postscript",
200
201 // toimisto-ohjelmat
202 "application/msword",
203 "application/vnd.ms-excel",
204 "application/vnd.ms-powerpoint",
205 "application/vnd.sun.xml.writer",
206 "application/vnd.sun.xml.calc",
207 "application/vnd.sun.xml.impress",
208 "application/vnd.sun.xml.draw",
209
210 // kuvat
211 "image/jpeg",
212 "image/gif",
213 "image/png",
214 "image/x-bmp",
215 "image/svg+xml",
216 "image/tiff",
217 "image/jp2",
218
219 // video
220 "video/mpeg",
221 "video/mp4",
222 "video/x-msvideo",
223 "video/x-ms-asf",
224 "video/x-ms-wmv",
225 "audio/x-pn-realaudio",
226 "video/quicktime",
227 "video/x-mng",
228
229 // ääni
230 "audio/mpeg", "audio/x-wav", "audio/x-ms-wma", "application/ogg",
231
232 // pakkaustyökalut
233 "application/zip",
234 "application/x-stuffit",
235 "application/x-shockwave-flash" };
236
237 private static String[] mimeTypeNames = {
238 // teksti
239 "Tekstidokumentti",
240 "HTML-dokumentti",
241 "XHTML-dokumentti",
242 "XML-dokumentti",
243 "Tyylisivu (CSS)",
244 "RTF-dokumentti",
245 "PDF-dokumentti",
246 "PostScript-dokumentti",
247
248 // toimisto-ohjelmat
249 "MSOffice Word -dokumentti",
250 "MSOffice Excel -työkirja",
251 "MSOffice Powerpoint -esitys",
252 "OpenOffice Writer -dokumentti",
253 "OpenOffice Calc -työkirja",
254 "OpenOffice Impress -esitys",
255 "OpenOffice Impress -piirros",
256
257 // kuvat
258 "JPEG-kuva",
259 "GIF-kuva",
260 "PNG-kuva",
261 "BMP-kuva",
262 "SVG-kuva",
263 "TIFF-kuva",
264 "JPEG 2000 -kuva",
265
266 // video
267 "MPEG-video",
268 "MPEG4-video",
269 "AVI-video",
270 "ASF-video",
271 "WMV-video",
272 "Real Media -video",
273 "Quicktime-video",
274 "MNG-animaatio",
275
276 // ääni
277 "MP3-ääni", "WAV-ääni", "WMA-ääni", "OGG Vorbis -ääni",
278
279 // pakkaustyökalut
280 "ZIP-paketti", "StuffIt-paketti", "Flash-animaatio" };
281
282 public static void main(String[] args) throws Exception {
283 if (args.length < 1) {
284 System.out.println("Usage: java MetadataExtractor filename");
285 System.exit(0);
286 }
287
288 File f = new File(args[0]);
289 MetadataExtractor me = new MetadataExtractor(f);
290 System.out.println(
291 "content-type:\t"
292 + me.getContentType()
293 + (me.getGuessedContentType() ? " (guessed)" : ""));
294 System.out.println("encoding:\t\t" + me.getEncoding());
295 System.out.println("title:\t\t" + me.getTitle());
296 if (DEBUG && me.getTitle() != null) {
297 for (int i = 0; i < me.getTitle().length(); i++) {
298 System.out.println(
299 Integer.toHexString((int)me.getTitle().charAt(i)));
300 }
301 }
302 }
303
304 public MetadataExtractor(File f) throws IOException {
305 this(f, null);
306 }
307
308 /** Creates a new instance of MetadataExtractor */
309 public MetadataExtractor(File f, String fileName) throws IOException {
310 DEBUG = (System.getProperty("DebugMetadataExtractor") != null);
311 this.file = f;
312 if (fileName == null) {
313 this.fileName = f.getName();
314 } else {
315 this.fileName = fileName;
316 }
317 this.contentType = null;
318 this.determineTypeFromMagic();
319 if ("application/zip".equals(this.contentType)) {
320 this.determineTypeFromZip();
321 }
322 if (this.contentType == null) {
323 this.determineTypeFromXML();
324 }
325 if (this.contentType == null) {
326 this.determineTypeFromExtension();
327 }
328 this.checkEncoding();
329 if ("application/pdf".equals(this.contentType)) {
330 this.extractMetaFromPDF();
331 }
332 if (this.contentType != null
333 && this.contentType.startsWith("text/html")) {
334 this.extractMetaFromHTML();
335 }
336 }
337
338 private void checkEncoding() throws IOException {
339 if (!("text/css".equals(this.contentType)
340 || "text/html".equals(this.contentType)
341 || "text/plain".equals(this.contentType))) {
342 return;
343 }
344 if ("text/css".equals(this.contentType) && this.hasCSSCharset) {
345 return;
346 }
347 if (this.encoding == null) {
348 this.guessEncoding();
349 }
350
351 // ei haluta contentType-stringiin vielä tässä vaiheessa,
352 //sillä muuten käyttäjä ei voi valita contentTypeä
353
354 //this.contentType = this.contentType + "; charset=" + this.encoding;
355 }
356
357 private void determineTypeFromMagic() throws IOException {
358 byte[] buf = new byte[MAX_MAGIC_LENGTH];
359 FileInputStream stream = new FileInputStream(this.file);
360 int numBytes = stream.read(buf);
361 stream.close();
362
363 if (DEBUG) {
364 for (int i = 0; i < buf.length; i++) {
365 System.out.println(Integer.toHexString(((int)buf[i]) & 0xFF));
366 }
367 }
368
369 for (int i = 0; i < magicNumbers.length; i++) {
370 boolean match = true;
371 for (int j = 0; j < magicNumbers[i].length && j < numBytes; j++) {
372 if (magicNumbers[i][j] == ANY) {
373 continue;
374 } else if (
375 magicNumbers[i][j] == DIGIT
376 && buf[j] < '9'
377 && buf[j] > '0') {
378 continue;
379 } else if (
380 magicNumbers[i][j] == (char) (((int)buf[j]) & 0xFF)) {
381 continue;
382 } else {
383 match = false;
384 break;
385 }
386 }
387 if (match) {
388 if ("utf-16".equals(magicTypes[i])) {
389 this.encoding = "utf-16";
390 } else if ("CSS_CHARSET".equals(magicTypes[i])) {
391 this.hasCSSCharset = true;
392 } else {
393 this.contentType = magicTypes[i];
394 }
395 return;
396 }
397 }
398 }
399
400 private void guessEncoding() throws IOException {
401 int b;
402 boolean couldBeASCII = true;
403 boolean couldBeISO = true;
404 BufferedInputStream in =
405 new BufferedInputStream(new FileInputStream(this.file));
406 try {
407 while (((b = in.read()) != -1) && (couldBeASCII || couldBeISO)) {
408 if (b > 0x7F) {
409 couldBeASCII = false;
410 if (b < 0xA0) {
411 couldBeISO = false;
412 }
413 }
414 }
415 } finally {
416 in.close();
417 }
418 if (couldBeASCII) {
419 this.encoding = "us-ascii";
420 return;
421 }
422 if (this.couldBeUTF8()) {
423 this.encoding = "utf-8";
424 return;
425 }
426 if (couldBeISO) {
427 this.encoding = "iso-8859-1";
428 return;
429 }
430 this.encoding = "windows-1252";
431 return;
432 }
433
434 private boolean couldBeUTF8() throws IOException {
435 BufferedReader in;
436 CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder();
437 decoder.onMalformedInput(CodingErrorAction.REPORT);
438 decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
439 in =
440 new BufferedReader(
441 new InputStreamReader(new FileInputStream(this.file), decoder));
442 try {
443 while (in.read() != -1) {}
444 } catch (CharacterCodingException e) {
445 return false;
446 } finally {
447 in.close();
448 }
449 return true;
450 }
451
452 private void determineTypeFromXML() throws IOException {
453 DocumentBuilder builder = DOMUtils.newNonvalidatingDocumentBuilder();
454 // If the file can be parsed as XML without fatal errors, it is by
455 // definition an XML document. OTOH, if there are fatal errors, it
456 // by definition is not.
457 try {
458 InputSource is = new InputSource(new FileInputStream(this.file));
459 is.setSystemId("file:///foo");
460 this.doc = builder.parse(is);
461 } catch (Exception e) {
462 return;
463 }
464 // However, some HTML tag soup docs might by chance be well-formed.
465 Element root = doc.getDocumentElement();
466 if (root.getNamespaceURI() == null
467 && "HTML".equalsIgnoreCase(root.getNodeName())) {
468 this.contentType = "text/html";
469 return;
470 } else if (
471 "http://www.w3.org/1999/xhtml".equals(root.getNamespaceURI())
472 && "html".equals(root.getLocalName())) {
473 loop : for (
474 Node n = root.getFirstChild();
475 n != null;
476 n = n.getNextSibling()) {
477 if ("http://www.w3.org/1999/xhtml".equals(n.getNamespaceURI())
478 && "head".equals(n.getLocalName())) {
479 for (Node m = n.getFirstChild();
480 m != null;
481 m = m.getNextSibling()) {
482 if ("http://www.w3.org/1999/xhtml"
483 .equals(m.getNamespaceURI())
484 && "title".equals(m.getLocalName())) {
485 this.setTitle(DOMUtils.textContent(m));
486 break loop;
487 }
488 }
489 }
490 }
491 this.contentType = "application/xhtml+xml";
492 return;
493 } else if (
494 "http://www.w3.org/2000/svg".equals(root.getNamespaceURI())
495 && "svg".equals(root.getLocalName())) {
496 for (Node n = root.getFirstChild();
497 n != null;
498 n = n.getNextSibling()) {
499 if ("http://www.w3.org/2000/svg".equals(n.getNamespaceURI())
500 && "title".equals(n.getLocalName())) {
501 this.setTitle(DOMUtils.textContent(n));
502 break;
503 }
504 }
505 this.contentType = "image/svg+xml";
506 return;
507 } else {
508 this.contentType = "application/xml";
509 return;
510 }
511 }
512
513 private void determineTypeFromZip() throws IOException {
514 ZipFile zf;
515 try {
516 zf = new ZipFile(this.file);
517 } catch (ZipException e) {
518 // doesn't look like a zip file after all
519 this.contentType = null;
520 return;
521 }
522 // no support for encrypted files
523 Document metaDoc;
524 DocumentBuilder builder = DOMUtils.newNonvalidatingDocumentBuilder();
525 ZipEntry metaEntry;
526 if ((metaEntry = zf.getEntry("meta.xml")) != null) {
527 try {
528 InputSource is = new InputSource(zf.getInputStream(metaEntry));
529 is.setSystemId("file:///foo");
530 metaDoc = builder.parse(is);
531 } catch (Exception e) {
532 return;
533 }
534 Node titleElt =
535 DOMUtils.findElement(
536 metaDoc,
537 "http://purl.org/dc/elements/1.1/",
538 "title");
539 if (titleElt != null) {
540 this.setTitle(DOMUtils.textContent(titleElt));
541 }
542 }
543 ZipEntry contentEntry;
544 if ((contentEntry = zf.getEntry("content.xml")) == null) {
545 // not an OOo file
546 return;
547 }
548 try {
549 InputSource is = new InputSource(zf.getInputStream(contentEntry));
550 is.setSystemId("file:///foo");
551 this.doc = builder.parse(is);
552 } catch (Exception e) {
553 return;
554 }
555 Element root = this.doc.getDocumentElement();
556 if ("http://www.w3.org/1998/Math/MathML"
557 .equals(root.getNamespaceURI())) {
558 this.contentType = "application/vnd.sun.xml.math";
559 return;
560 }
561 String docClass =
562 root.getAttributeNS("http://openoffice.org/2000/office", "class");
563 if ("text".equals(docClass)) {
564 this.contentType = "application/vnd.sun.xml.writer";
565 // If we didn't get a title from the metadata,
566 // let's use the first heading
567 if (this.title == null) {
568 Node headingElt =
569 DOMUtils.findElement(
570 root,
571 "http://openoffice.org/2000/text",
572 "h");
573 if (headingElt != null) {
574 this.setTitle(DOMUtils.textContent(headingElt));
575 }
576 }
577 return;
578 } else if ("text-global".equals(docClass)) {
579 this.contentType = "application/vnd.sun.xml.writer.global";
580 return;
581 } else if ("spreadsheet".equals(docClass)) {
582 this.contentType = "application/vnd.sun.xml.calc";
583 return;
584 } else if ("drawing".equals(docClass)) {
585 this.contentType = "application/vnd.sun.xml.draw";
586 return;
587 } else if ("presentation".equals(docClass)) {
588 this.contentType = "application/vnd.sun.xml.impress";
589 // If we didn't get a title from the metadata,
590 // let's use the first text box
591 if (this.title == null) {
592 Node headingElt =
593 DOMUtils.findElement(
594 root,
595 "http://openoffice.org/2000/text",
596 "p");
597 if (headingElt != null) {
598 this.setTitle(DOMUtils.textContent(headingElt));
599 }
600 }
601 return;
602 } else if ("chart".equals(docClass)) {
603 this.contentType = "application/vnd.sun.xml.calc";
604 // XXX is this OK?
605 return;
606 }
607 }
608
609 private void determineTypeFromExtension() {
610 int dotIndex = this.fileName.lastIndexOf(".");
611 if (dotIndex < 1) {
612 return;
613 }
614 String ext = this.fileName.substring(dotIndex + 1).toLowerCase();
615 int i = Arrays.binarySearch(extensions, ext);
616 if (i < 0) {
617 return;
618 }
619 this.contentType = extensionTypes[i];
620 this.guessedContentType = true;
621 }
622
623 private void extractMetaFromHTML() throws IOException {
624 try {
625 boolean reparse = false;
626 XMLReader tagSoup = SAXUtils.newTagSoupXMLReader();
627 InputSource is = new InputSource(new FileInputStream(this.file));
628 is.setSystemId("file:///foo");
629 is.setEncoding(this.encoding);
630 DomConsumer builder = DOMUtils.newDomConsumer();
631 tagSoup.setContentHandler(builder.getContentHandler());
632 tagSoup.parse(is);
633 this.doc = builder.getDocument();
634 Element root = doc.getDocumentElement();
635
636 NodeList nl =
637 root.getElementsByTagNameNS(
638 "http://www.w3.org/1999/xhtml",
639 "meta");
640 int len = nl.getLength();
641 for (int i = 0; i < len; i++) {
642 Element meta = (Element)nl.item(i);
643 if ("content-type"
644 .equalsIgnoreCase(meta.getAttribute("httpequiv"))) {
645 String enc = meta.getAttribute("content");
646 if (enc == null) {
647 break;
648 }
649 enc = enc.toLowerCase();
650 Matcher m = charsetPat.matcher(enc);
651 if(m.matches()) {
652 enc = m.group(1);
653 if (enc == null) {
654 break;
655 }
656 } else {
657 break;
658 }
659 if (!(enc.startsWith("utf-")
660 || enc.equals("iso-8859-1")
661 || enc.equals("windows-1252")
662 || enc.equals("us-ascii"))) {
663 this.encoding = enc;
664 reparse = true;
665 break;
666 }
667
668 }
669 }
670
671 if (reparse) {
672 is.setByteStream(new FileInputStream(this.file));
673 is.setEncoding(this.encoding);
674 tagSoup.parse(is);
675 this.doc = builder.getDocument();
676 root = doc.getDocumentElement();
677 }
678
679 loop : for (
680 Node n = root.getFirstChild();
681 n != null;
682 n = n.getNextSibling()) {
683 if ("http://www.w3.org/1999/xhtml".equals(n.getNamespaceURI())
684 && "head".equals(n.getLocalName())) {
685 for (Node m = n.getFirstChild();
686 m != null;
687 m = m.getNextSibling()) {
688 if ("http://www.w3.org/1999/xhtml"
689 .equals(m.getNamespaceURI())
690 && "title".equals(m.getLocalName())) {
691 this.setTitle(DOMUtils.textContent(m));
692 break loop;
693 }
694 }
695 }
696 }
697 } catch (Exception e) {
698 System.err.println(e);
699 }
700 }
701
702 private void extractMetaFromPDF() throws IOException {
703 try {
704 PDFReader reader = new PDFReader(this.file);
705 Dict info = reader.getInfo();
706 this.setTitle(reader.getObject(info.get("Title")).toString());
707 } catch (Exception e) {
708
709 }
710 }
711
712 private String normalizeWhiteSpace(String text) {
713 StringBuilder buf = new StringBuilder(text.length());
714 boolean lastIsWhitespace = true;
715 for (int i = 0; i < text.length(); i++) {
716 char c = text.charAt(i);
717 if (c == ' ' || c == '\t' || c == '\n' || c == '\r') {
718 if (!lastIsWhitespace) {
719 buf.append(' ');
720 lastIsWhitespace = true;
721 }
722 } else {
723 buf.append(c);
724 lastIsWhitespace = false;
725 }
726 }
727 if (buf.charAt(buf.length() - 1) == ' ') {
728 buf.deleteCharAt(buf.length() - 1);
729 }
730 return buf.toString();
731 }
732
733 private void setTitle(String title) {
734 String collapsed = this.normalizeWhiteSpace(title);
735 if ("".equals(collapsed)) {
736 this.title = null;
737 } else {
738 this.title = collapsed;
739 }
740 }
741
742 /**
743 * Getter for property contentType.
744 * @return Value of property contentType.
745 */
746 public java.lang.String getContentType() {
747 return this.contentType;
748 }
749
750 /**
751 * Getter for property encoding
752 * @return Value of property encoding.
753 */
754 public java.lang.String getEncoding() {
755 return this.encoding;
756 }
757
758 public String getContentTypeWithParams() {
759 if(this.encoding == null) {
760 return this.contentType;
761 } else {
762 return this.contentType + "; charset=" + this.encoding;
763 }
764 }
765
766 /**
767 * Getter for property guessedContentType. If guessedContentType
768 * is true, the content type was derived from the file name and it
769 * could be wrong. If guessedContentType is false, the content
770 * type was derived from the magic bytes of the file and you can
771 * trust on that information.
772 * @return the value of guessedContentType field
773 */
774 public boolean getGuessedContentType() {
775 return this.guessedContentType;
776 }
777
778 public java.lang.String getTitle() {
779 return this.title;
780 }
781
782 public Document getDOM() {
783 return this.doc;
784 }
785
786 public static String[] getMimeTypes() {
787 return mimeTypes;
788 }
789
790 public static String[] getMimeTypeNames() {
791 return mimeTypeNames;
792 }
793 }