001 /* 002 * Copyright (c) 2003, 2004 Henri Sivonen, Yrjö Kari-Koskinen 003 * 004 * Permission is hereby granted, free of charge, to any person obtaining a 005 * copy of this software and associated documentation files (the "Software"), 006 * to deal in the Software without restriction, including without limitation 007 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 008 * and/or sell copies of the Software, and to permit persons to whom the 009 * Software is furnished to do so, subject to the following conditions: 010 * 011 * The above copyright notice and this permission notice shall be included in 012 * all copies or substantial portions of the Software. 013 * 014 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 015 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 016 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 017 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 018 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 019 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 020 * DEALINGS IN THE SOFTWARE. 021 */ 022 023 package fi.iki.hsivonen.util; 024 import java.io.BufferedInputStream; 025 import java.io.BufferedReader; 026 import java.io.File; 027 import java.io.FileInputStream; 028 import java.io.IOException; 029 import java.io.InputStreamReader; 030 import java.nio.charset.CharacterCodingException; 031 import java.nio.charset.Charset; 032 import java.nio.charset.CharsetDecoder; 033 import java.nio.charset.CodingErrorAction; 034 import java.util.Arrays; 035 import java.util.regex.Matcher; 036 import java.util.regex.Pattern; 037 import java.util.zip.ZipEntry; 038 import java.util.zip.ZipException; 039 import java.util.zip.ZipFile; 040 041 import javax.xml.parsers.DocumentBuilder; 042 043 import multivalent.std.adaptor.pdf.Dict; 044 import multivalent.std.adaptor.pdf.PDFReader; 045 046 import org.w3c.dom.Document; 047 import org.w3c.dom.Element; 048 import org.w3c.dom.Node; 049 import org.w3c.dom.NodeList; 050 import org.xml.sax.InputSource; 051 import org.xml.sax.XMLReader; 052 053 import fi.iki.hsivonen.gnu.xml.pipeline.DomConsumer; 054 import fi.iki.hsivonen.xml.DOMUtils; 055 import fi.iki.hsivonen.xml.SAXUtils; 056 057 /** 058 * 059 * @author hsivonen 060 * @author ykk 061 */ 062 public class MetadataExtractor { 063 /** Do we print the debug information */ 064 private static boolean DEBUG = false; 065 066 private static Pattern charsetPat = 067 Pattern.compile("^.*charset\\s*=\\s*((?:-|\\w)+).*$"); 068 069 private File file; 070 private String contentType; 071 private String title; 072 private Document doc; 073 private String encoding; 074 private String fileName; 075 private boolean hasCSSCharset = false; 076 private boolean guessedContentType = false; 077 078 private static final char ANY = '\uE000'; 079 private static final char DIGIT = '\uE001'; 080 081 private static final int MAX_MAGIC_LENGTH = 12; 082 083 private static char[][] magicNumbers = { 084 {'\u0089', 'P', 'N', 'G', '\r', '\n', '\u001A', '\n'}, // PNG 1.0 spec 085 {'\u008a', 'M', 'N', 'G', '\r', '\n', '\u001A', '\n'}, // MNG spec 086 {'G', 'I', 'F', '8', '7', 'a'}, // BSD /etc/magic 087 {'G', 'I', 'F', '8', '9', 'a'}, // BSD /etc/magic 088 {'%', 'P', 'D', 'F', '-', DIGIT, '.', DIGIT}, 089 {'%', '!'}, 090 {'P', 'K', '\u0003', '\u0004'}, 091 {'F', 'W', 'S'}, // observation and BSD /etc/magic 092 {'.', 'R', 'M', 'F', '\u0000'}, // observation 093 {'{', '\\', 'r', 't', 'f'}, 094 {'\u0000', '\u0000', '\u0001', '\u00BA'}, 095 {'\u0000', '\u0000', '\u0001', '\u00B3'}, 096 {ANY, ANY, ANY, ANY, ANY, ANY, 'J', 'F', 'I', 'F', '\u0000'}, // observation, BSD /etc/magic and the JFIF spec http://www.w3.org/Graphics/JPEG/jfif3.pdf 097 {ANY, ANY, ANY, ANY, 'm', 'o', 'o', 'v'}, // /usr/share/misc/file/magic 098 {ANY, ANY, ANY, ANY, 'm', 'd', 'a', 't'}, // /usr/share/misc/file/magic 099 {ANY, ANY, ANY, ANY, 'f', 'r', 'e', 'e'}, // /usr/share/misc/file/magic 100 {ANY, ANY, ANY, ANY, 'j', 'u', 'n', 'k'}, // /usr/share/misc/file/magic 101 {ANY, ANY, ANY, ANY, 'p', 'n', 'o', 't'}, // /usr/share/misc/file/magic 102 {ANY, ANY, ANY, ANY, 's', 'k', 'i', 'p'}, // /usr/share/misc/file/magic 103 {ANY, ANY, ANY, ANY, 'w', 'i', 'd', 'e'}, // /usr/share/misc/file/magic 104 {ANY, ANY, ANY, ANY, 'p', 'i', 'c', 't'}, // /usr/share/misc/file/magic 105 {ANY, ANY, ANY, ANY, 'f', 't', 'y', 'p'}, // observation 106 {'R', 'I', 'F', 'F', ANY, ANY, ANY, ANY, 'A', 'V', 'I'}, // /usr/share/file/magic.mime 107 {'R', 'I', 'F', 'F', ANY, ANY, ANY, ANY, 'W', 'A', 'V', 'E'}, // /usr/share/file/magic.mime 108 {'\u00FF', '\u00FA'}, // /usr/share/file/magic.mime 109 {'\u00FF', '\u00FB'}, // /usr/share/file/magic.mime 110 {'I', 'D', '3'}, // /usr/share/file/magic.mime 111 {'O', 'g', 'g', 'S'}, // /usr/share/file/magic.mime 112 {'B', 'M'}, // /usr/share/file/magic.mime 113 {'S', 't', 'u', 'f', 'f', 'I', 't'}, // observation 114 {'S', 'I', 'T', '!'}, // observation 115 {'\u00FE', '\u00FF'}, // big endian BOM 116 {'\u00FF', '\u00FE'}, // little endian BOM 117 {'@', 'c', 'h', 'a', 'r', 's', 'e', 't'}, 118 {'I', 'I'}, 119 {'M', 'M'}, 120 {'\u0000', '\u0000', '\u0000', '\u000C', '\u006A', '\u0050', '\u0020', '\u0020', '\r', '\n', '\u0087', '\n'} // http://www.iana.org/assignments/media-types/image/jp2 121 }; 122 123 124 private static String[] magicTypes = 125 { 126 "image/png", 127 "video/x-mng", 128 "image/gif", 129 "image/gif", 130 "application/pdf", 131 "application/postscript", 132 "application/zip", 133 "application/x-shockwave-flash", 134 "audio/x-pn-realaudio", 135 "text/rtf", 136 "video/mpeg", 137 "video/mpeg", 138 "image/jpeg", 139 "video/quicktime", 140 "video/quicktime", 141 "video/quicktime", 142 "video/quicktime", 143 "video/quicktime", 144 "video/quicktime", 145 "video/quicktime", 146 "video/quicktime", 147 "video/mp4", 148 "video/x-msvideo", 149 "audio/x-wav", 150 "audio/mpeg", 151 "audio/mpeg", 152 "audio/mpeg", 153 "application/ogg", 154 "image/x-bmp", 155 "application/x-stuffit", 156 "application/x-stuffit", 157 "utf-16", 158 // see determineTypeFromMagic 159 "utf-16", // see determineTypeFromMagic 160 "CSS_CHARSET", // see determineTypeFromMagic 161 "image/tiff", "image/tiff", "image/jp2" }; 162 163 // must be sorted lexicographically 164 private static String[] extensions = 165 { 166 "asf", 167 "css", 168 "doc", 169 "htm", 170 "html", 171 "ppt", 172 "txt", 173 "wma", 174 "wmv", 175 "xls" }; 176 177 private static String[] extensionTypes = 178 { 179 "video/x-ms-asf", 180 "text/css", 181 "application/msword", 182 "text/html", 183 "text/html", 184 "application/vnd.ms-powerpoint", 185 "text/plain", 186 "audio/x-ms-wma", 187 "video/x-ms-wmv", 188 "application/vnd.ms-excel" }; 189 190 private static String[] mimeTypes = { 191 // teksti 192 "text/plain", 193 "text/html", 194 "application/xhtml+xml", 195 "application/xml", 196 "text/css", 197 "text/rtf", 198 "application/pdf", 199 "application/postscript", 200 201 // toimisto-ohjelmat 202 "application/msword", 203 "application/vnd.ms-excel", 204 "application/vnd.ms-powerpoint", 205 "application/vnd.sun.xml.writer", 206 "application/vnd.sun.xml.calc", 207 "application/vnd.sun.xml.impress", 208 "application/vnd.sun.xml.draw", 209 210 // kuvat 211 "image/jpeg", 212 "image/gif", 213 "image/png", 214 "image/x-bmp", 215 "image/svg+xml", 216 "image/tiff", 217 "image/jp2", 218 219 // video 220 "video/mpeg", 221 "video/mp4", 222 "video/x-msvideo", 223 "video/x-ms-asf", 224 "video/x-ms-wmv", 225 "audio/x-pn-realaudio", 226 "video/quicktime", 227 "video/x-mng", 228 229 // ääni 230 "audio/mpeg", "audio/x-wav", "audio/x-ms-wma", "application/ogg", 231 232 // pakkaustyökalut 233 "application/zip", 234 "application/x-stuffit", 235 "application/x-shockwave-flash" }; 236 237 private static String[] mimeTypeNames = { 238 // teksti 239 "Tekstidokumentti", 240 "HTML-dokumentti", 241 "XHTML-dokumentti", 242 "XML-dokumentti", 243 "Tyylisivu (CSS)", 244 "RTF-dokumentti", 245 "PDF-dokumentti", 246 "PostScript-dokumentti", 247 248 // toimisto-ohjelmat 249 "MSOffice Word -dokumentti", 250 "MSOffice Excel -työkirja", 251 "MSOffice Powerpoint -esitys", 252 "OpenOffice Writer -dokumentti", 253 "OpenOffice Calc -työkirja", 254 "OpenOffice Impress -esitys", 255 "OpenOffice Impress -piirros", 256 257 // kuvat 258 "JPEG-kuva", 259 "GIF-kuva", 260 "PNG-kuva", 261 "BMP-kuva", 262 "SVG-kuva", 263 "TIFF-kuva", 264 "JPEG 2000 -kuva", 265 266 // video 267 "MPEG-video", 268 "MPEG4-video", 269 "AVI-video", 270 "ASF-video", 271 "WMV-video", 272 "Real Media -video", 273 "Quicktime-video", 274 "MNG-animaatio", 275 276 // ääni 277 "MP3-ääni", "WAV-ääni", "WMA-ääni", "OGG Vorbis -ääni", 278 279 // pakkaustyökalut 280 "ZIP-paketti", "StuffIt-paketti", "Flash-animaatio" }; 281 282 public static void main(String[] args) throws Exception { 283 if (args.length < 1) { 284 System.out.println("Usage: java MetadataExtractor filename"); 285 System.exit(0); 286 } 287 288 File f = new File(args[0]); 289 MetadataExtractor me = new MetadataExtractor(f); 290 System.out.println( 291 "content-type:\t" 292 + me.getContentType() 293 + (me.getGuessedContentType() ? " (guessed)" : "")); 294 System.out.println("encoding:\t\t" + me.getEncoding()); 295 System.out.println("title:\t\t" + me.getTitle()); 296 if (DEBUG && me.getTitle() != null) { 297 for (int i = 0; i < me.getTitle().length(); i++) { 298 System.out.println( 299 Integer.toHexString((int)me.getTitle().charAt(i))); 300 } 301 } 302 } 303 304 public MetadataExtractor(File f) throws IOException { 305 this(f, null); 306 } 307 308 /** Creates a new instance of MetadataExtractor */ 309 public MetadataExtractor(File f, String fileName) throws IOException { 310 DEBUG = (System.getProperty("DebugMetadataExtractor") != null); 311 this.file = f; 312 if (fileName == null) { 313 this.fileName = f.getName(); 314 } else { 315 this.fileName = fileName; 316 } 317 this.contentType = null; 318 this.determineTypeFromMagic(); 319 if ("application/zip".equals(this.contentType)) { 320 this.determineTypeFromZip(); 321 } 322 if (this.contentType == null) { 323 this.determineTypeFromXML(); 324 } 325 if (this.contentType == null) { 326 this.determineTypeFromExtension(); 327 } 328 this.checkEncoding(); 329 if ("application/pdf".equals(this.contentType)) { 330 this.extractMetaFromPDF(); 331 } 332 if (this.contentType != null 333 && this.contentType.startsWith("text/html")) { 334 this.extractMetaFromHTML(); 335 } 336 } 337 338 private void checkEncoding() throws IOException { 339 if (!("text/css".equals(this.contentType) 340 || "text/html".equals(this.contentType) 341 || "text/plain".equals(this.contentType))) { 342 return; 343 } 344 if ("text/css".equals(this.contentType) && this.hasCSSCharset) { 345 return; 346 } 347 if (this.encoding == null) { 348 this.guessEncoding(); 349 } 350 351 // ei haluta contentType-stringiin vielä tässä vaiheessa, 352 //sillä muuten käyttäjä ei voi valita contentTypeä 353 354 //this.contentType = this.contentType + "; charset=" + this.encoding; 355 } 356 357 private void determineTypeFromMagic() throws IOException { 358 byte[] buf = new byte[MAX_MAGIC_LENGTH]; 359 FileInputStream stream = new FileInputStream(this.file); 360 int numBytes = stream.read(buf); 361 stream.close(); 362 363 if (DEBUG) { 364 for (int i = 0; i < buf.length; i++) { 365 System.out.println(Integer.toHexString(((int)buf[i]) & 0xFF)); 366 } 367 } 368 369 for (int i = 0; i < magicNumbers.length; i++) { 370 boolean match = true; 371 for (int j = 0; j < magicNumbers[i].length && j < numBytes; j++) { 372 if (magicNumbers[i][j] == ANY) { 373 continue; 374 } else if ( 375 magicNumbers[i][j] == DIGIT 376 && buf[j] < '9' 377 && buf[j] > '0') { 378 continue; 379 } else if ( 380 magicNumbers[i][j] == (char) (((int)buf[j]) & 0xFF)) { 381 continue; 382 } else { 383 match = false; 384 break; 385 } 386 } 387 if (match) { 388 if ("utf-16".equals(magicTypes[i])) { 389 this.encoding = "utf-16"; 390 } else if ("CSS_CHARSET".equals(magicTypes[i])) { 391 this.hasCSSCharset = true; 392 } else { 393 this.contentType = magicTypes[i]; 394 } 395 return; 396 } 397 } 398 } 399 400 private void guessEncoding() throws IOException { 401 int b; 402 boolean couldBeASCII = true; 403 boolean couldBeISO = true; 404 BufferedInputStream in = 405 new BufferedInputStream(new FileInputStream(this.file)); 406 try { 407 while (((b = in.read()) != -1) && (couldBeASCII || couldBeISO)) { 408 if (b > 0x7F) { 409 couldBeASCII = false; 410 if (b < 0xA0) { 411 couldBeISO = false; 412 } 413 } 414 } 415 } finally { 416 in.close(); 417 } 418 if (couldBeASCII) { 419 this.encoding = "us-ascii"; 420 return; 421 } 422 if (this.couldBeUTF8()) { 423 this.encoding = "utf-8"; 424 return; 425 } 426 if (couldBeISO) { 427 this.encoding = "iso-8859-1"; 428 return; 429 } 430 this.encoding = "windows-1252"; 431 return; 432 } 433 434 private boolean couldBeUTF8() throws IOException { 435 BufferedReader in; 436 CharsetDecoder decoder = Charset.forName("UTF-8").newDecoder(); 437 decoder.onMalformedInput(CodingErrorAction.REPORT); 438 decoder.onUnmappableCharacter(CodingErrorAction.REPORT); 439 in = 440 new BufferedReader( 441 new InputStreamReader(new FileInputStream(this.file), decoder)); 442 try { 443 while (in.read() != -1) {} 444 } catch (CharacterCodingException e) { 445 return false; 446 } finally { 447 in.close(); 448 } 449 return true; 450 } 451 452 private void determineTypeFromXML() throws IOException { 453 DocumentBuilder builder = DOMUtils.newNonvalidatingDocumentBuilder(); 454 // If the file can be parsed as XML without fatal errors, it is by 455 // definition an XML document. OTOH, if there are fatal errors, it 456 // by definition is not. 457 try { 458 InputSource is = new InputSource(new FileInputStream(this.file)); 459 is.setSystemId("file:///foo"); 460 this.doc = builder.parse(is); 461 } catch (Exception e) { 462 return; 463 } 464 // However, some HTML tag soup docs might by chance be well-formed. 465 Element root = doc.getDocumentElement(); 466 if (root.getNamespaceURI() == null 467 && "HTML".equalsIgnoreCase(root.getNodeName())) { 468 this.contentType = "text/html"; 469 return; 470 } else if ( 471 "http://www.w3.org/1999/xhtml".equals(root.getNamespaceURI()) 472 && "html".equals(root.getLocalName())) { 473 loop : for ( 474 Node n = root.getFirstChild(); 475 n != null; 476 n = n.getNextSibling()) { 477 if ("http://www.w3.org/1999/xhtml".equals(n.getNamespaceURI()) 478 && "head".equals(n.getLocalName())) { 479 for (Node m = n.getFirstChild(); 480 m != null; 481 m = m.getNextSibling()) { 482 if ("http://www.w3.org/1999/xhtml" 483 .equals(m.getNamespaceURI()) 484 && "title".equals(m.getLocalName())) { 485 this.setTitle(DOMUtils.textContent(m)); 486 break loop; 487 } 488 } 489 } 490 } 491 this.contentType = "application/xhtml+xml"; 492 return; 493 } else if ( 494 "http://www.w3.org/2000/svg".equals(root.getNamespaceURI()) 495 && "svg".equals(root.getLocalName())) { 496 for (Node n = root.getFirstChild(); 497 n != null; 498 n = n.getNextSibling()) { 499 if ("http://www.w3.org/2000/svg".equals(n.getNamespaceURI()) 500 && "title".equals(n.getLocalName())) { 501 this.setTitle(DOMUtils.textContent(n)); 502 break; 503 } 504 } 505 this.contentType = "image/svg+xml"; 506 return; 507 } else { 508 this.contentType = "application/xml"; 509 return; 510 } 511 } 512 513 private void determineTypeFromZip() throws IOException { 514 ZipFile zf; 515 try { 516 zf = new ZipFile(this.file); 517 } catch (ZipException e) { 518 // doesn't look like a zip file after all 519 this.contentType = null; 520 return; 521 } 522 // no support for encrypted files 523 Document metaDoc; 524 DocumentBuilder builder = DOMUtils.newNonvalidatingDocumentBuilder(); 525 ZipEntry metaEntry; 526 if ((metaEntry = zf.getEntry("meta.xml")) != null) { 527 try { 528 InputSource is = new InputSource(zf.getInputStream(metaEntry)); 529 is.setSystemId("file:///foo"); 530 metaDoc = builder.parse(is); 531 } catch (Exception e) { 532 return; 533 } 534 Node titleElt = 535 DOMUtils.findElement( 536 metaDoc, 537 "http://purl.org/dc/elements/1.1/", 538 "title"); 539 if (titleElt != null) { 540 this.setTitle(DOMUtils.textContent(titleElt)); 541 } 542 } 543 ZipEntry contentEntry; 544 if ((contentEntry = zf.getEntry("content.xml")) == null) { 545 // not an OOo file 546 return; 547 } 548 try { 549 InputSource is = new InputSource(zf.getInputStream(contentEntry)); 550 is.setSystemId("file:///foo"); 551 this.doc = builder.parse(is); 552 } catch (Exception e) { 553 return; 554 } 555 Element root = this.doc.getDocumentElement(); 556 if ("http://www.w3.org/1998/Math/MathML" 557 .equals(root.getNamespaceURI())) { 558 this.contentType = "application/vnd.sun.xml.math"; 559 return; 560 } 561 String docClass = 562 root.getAttributeNS("http://openoffice.org/2000/office", "class"); 563 if ("text".equals(docClass)) { 564 this.contentType = "application/vnd.sun.xml.writer"; 565 // If we didn't get a title from the metadata, 566 // let's use the first heading 567 if (this.title == null) { 568 Node headingElt = 569 DOMUtils.findElement( 570 root, 571 "http://openoffice.org/2000/text", 572 "h"); 573 if (headingElt != null) { 574 this.setTitle(DOMUtils.textContent(headingElt)); 575 } 576 } 577 return; 578 } else if ("text-global".equals(docClass)) { 579 this.contentType = "application/vnd.sun.xml.writer.global"; 580 return; 581 } else if ("spreadsheet".equals(docClass)) { 582 this.contentType = "application/vnd.sun.xml.calc"; 583 return; 584 } else if ("drawing".equals(docClass)) { 585 this.contentType = "application/vnd.sun.xml.draw"; 586 return; 587 } else if ("presentation".equals(docClass)) { 588 this.contentType = "application/vnd.sun.xml.impress"; 589 // If we didn't get a title from the metadata, 590 // let's use the first text box 591 if (this.title == null) { 592 Node headingElt = 593 DOMUtils.findElement( 594 root, 595 "http://openoffice.org/2000/text", 596 "p"); 597 if (headingElt != null) { 598 this.setTitle(DOMUtils.textContent(headingElt)); 599 } 600 } 601 return; 602 } else if ("chart".equals(docClass)) { 603 this.contentType = "application/vnd.sun.xml.calc"; 604 // XXX is this OK? 605 return; 606 } 607 } 608 609 private void determineTypeFromExtension() { 610 int dotIndex = this.fileName.lastIndexOf("."); 611 if (dotIndex < 1) { 612 return; 613 } 614 String ext = this.fileName.substring(dotIndex + 1).toLowerCase(); 615 int i = Arrays.binarySearch(extensions, ext); 616 if (i < 0) { 617 return; 618 } 619 this.contentType = extensionTypes[i]; 620 this.guessedContentType = true; 621 } 622 623 private void extractMetaFromHTML() throws IOException { 624 try { 625 boolean reparse = false; 626 XMLReader tagSoup = SAXUtils.newTagSoupXMLReader(); 627 InputSource is = new InputSource(new FileInputStream(this.file)); 628 is.setSystemId("file:///foo"); 629 is.setEncoding(this.encoding); 630 DomConsumer builder = DOMUtils.newDomConsumer(); 631 tagSoup.setContentHandler(builder.getContentHandler()); 632 tagSoup.parse(is); 633 this.doc = builder.getDocument(); 634 Element root = doc.getDocumentElement(); 635 636 NodeList nl = 637 root.getElementsByTagNameNS( 638 "http://www.w3.org/1999/xhtml", 639 "meta"); 640 int len = nl.getLength(); 641 for (int i = 0; i < len; i++) { 642 Element meta = (Element)nl.item(i); 643 if ("content-type" 644 .equalsIgnoreCase(meta.getAttribute("httpequiv"))) { 645 String enc = meta.getAttribute("content"); 646 if (enc == null) { 647 break; 648 } 649 enc = enc.toLowerCase(); 650 Matcher m = charsetPat.matcher(enc); 651 if(m.matches()) { 652 enc = m.group(1); 653 if (enc == null) { 654 break; 655 } 656 } else { 657 break; 658 } 659 if (!(enc.startsWith("utf-") 660 || enc.equals("iso-8859-1") 661 || enc.equals("windows-1252") 662 || enc.equals("us-ascii"))) { 663 this.encoding = enc; 664 reparse = true; 665 break; 666 } 667 668 } 669 } 670 671 if (reparse) { 672 is.setByteStream(new FileInputStream(this.file)); 673 is.setEncoding(this.encoding); 674 tagSoup.parse(is); 675 this.doc = builder.getDocument(); 676 root = doc.getDocumentElement(); 677 } 678 679 loop : for ( 680 Node n = root.getFirstChild(); 681 n != null; 682 n = n.getNextSibling()) { 683 if ("http://www.w3.org/1999/xhtml".equals(n.getNamespaceURI()) 684 && "head".equals(n.getLocalName())) { 685 for (Node m = n.getFirstChild(); 686 m != null; 687 m = m.getNextSibling()) { 688 if ("http://www.w3.org/1999/xhtml" 689 .equals(m.getNamespaceURI()) 690 && "title".equals(m.getLocalName())) { 691 this.setTitle(DOMUtils.textContent(m)); 692 break loop; 693 } 694 } 695 } 696 } 697 } catch (Exception e) { 698 System.err.println(e); 699 } 700 } 701 702 private void extractMetaFromPDF() throws IOException { 703 try { 704 PDFReader reader = new PDFReader(this.file); 705 Dict info = reader.getInfo(); 706 this.setTitle(reader.getObject(info.get("Title")).toString()); 707 } catch (Exception e) { 708 709 } 710 } 711 712 private String normalizeWhiteSpace(String text) { 713 StringBuilder buf = new StringBuilder(text.length()); 714 boolean lastIsWhitespace = true; 715 for (int i = 0; i < text.length(); i++) { 716 char c = text.charAt(i); 717 if (c == ' ' || c == '\t' || c == '\n' || c == '\r') { 718 if (!lastIsWhitespace) { 719 buf.append(' '); 720 lastIsWhitespace = true; 721 } 722 } else { 723 buf.append(c); 724 lastIsWhitespace = false; 725 } 726 } 727 if (buf.charAt(buf.length() - 1) == ' ') { 728 buf.deleteCharAt(buf.length() - 1); 729 } 730 return buf.toString(); 731 } 732 733 private void setTitle(String title) { 734 String collapsed = this.normalizeWhiteSpace(title); 735 if ("".equals(collapsed)) { 736 this.title = null; 737 } else { 738 this.title = collapsed; 739 } 740 } 741 742 /** 743 * Getter for property contentType. 744 * @return Value of property contentType. 745 */ 746 public java.lang.String getContentType() { 747 return this.contentType; 748 } 749 750 /** 751 * Getter for property encoding 752 * @return Value of property encoding. 753 */ 754 public java.lang.String getEncoding() { 755 return this.encoding; 756 } 757 758 public String getContentTypeWithParams() { 759 if(this.encoding == null) { 760 return this.contentType; 761 } else { 762 return this.contentType + "; charset=" + this.encoding; 763 } 764 } 765 766 /** 767 * Getter for property guessedContentType. If guessedContentType 768 * is true, the content type was derived from the file name and it 769 * could be wrong. If guessedContentType is false, the content 770 * type was derived from the magic bytes of the file and you can 771 * trust on that information. 772 * @return the value of guessedContentType field 773 */ 774 public boolean getGuessedContentType() { 775 return this.guessedContentType; 776 } 777 778 public java.lang.String getTitle() { 779 return this.title; 780 } 781 782 public Document getDOM() { 783 return this.doc; 784 } 785 786 public static String[] getMimeTypes() { 787 return mimeTypes; 788 } 789 790 public static String[] getMimeTypeNames() { 791 return mimeTypeNames; 792 } 793 }