<?php

/////////////////////////////////////////////////////////////////
/// getID3() by James Heinrich <info@getid3.org>               //
//  available at https://github.com/JamesHeinrich/getID3       //
//            or https://www.getid3.org                        //
//            or http://getid3.sourceforge.net                 //
//  see readme.txt for more details                            //
/////////////////////////////////////////////////////////////////
//                                                             //
// module.misc.pdf.php                                         //
// module for analyzing PDF files                              //
// dependencies: NONE                                          //
//                                                            ///
/////////////////////////////////////////////////////////////////

if (!defined('GETID3_INCLUDEPATH')) { // prevent path-exposing attacks that access modules directly on public webservers
	exit;
}

/**
 * Minimal PDF analyzer: reads the header version, locates the Cross-Reference
 * Table (XREF) via the trailing "startxref" pointer, and walks the referenced
 * objects looking for the page-tree node that carries the page count.
 */
class getid3_pdf extends getid3_handler
{
	/**
	 * Return full details of PDF Cross-Reference Table (XREF).
	 * When false, the per-object 'offset', 'generation' and 'entry' arrays are
	 * removed from the result once parsing has finished.
	 *
	 * @var bool
	 */
	public $returnXREF = false;

	/**
	 * Parse the PDF and populate $this->getid3->info['pdf'].
	 *
	 * Sets ['pdf']['header']['version'], ['fileformat'], ['pdf']['trailer']['startxref']
	 * and, when a matching page-tree object is found, ['pdf']['pages'].
	 *
	 * @return bool true if the file starts with a "%PDF-x.y" header (even if later
	 *              parsing reported errors), false otherwise
	 */
	public function Analyze() {
		$info = &$this->getid3->info;

		$this->fseek(0);
		if (preg_match('#^%PDF-([0-9\\.]+)$#', rtrim($this->fgets()), $matches)) {
			$info['pdf']['header']['version'] = floatval($matches[1]);
			$info['fileformat'] = 'pdf';

			// the PDF Cross-Reference Table (XREF) is located near the end of the file
			// the starting offset is specified in the penultimate section, on the two lines just before "%%EOF"
			// the first line is "startxref", the second line is the byte offset of the XREF.
			// We know the length of "%%EOF" and "startxref", but the offset could be 2-10 bytes,
			// and we're not sure if the line ends are one or two bytes, so we might find "startxref" as little as 18(?) bytes
			// from EOF, but it could be 30 bytes, so we start 40 bytes back just to be safe and do a search for the data we want.
			$this->fseek(-40, SEEK_END);
			if (preg_match('#[\r\n]startxref[ \r\n]+([0-9]+)[ \r\n]+#', $this->fread(40), $matches)) {
				$info['pdf']['trailer']['startxref'] = intval($matches[1]);
				$this->parseXREF($info['pdf']['trailer']['startxref']);
				if (!empty($info['pdf']['xref']['offset'])) {
					// if the highest object number seen exceeds the number of entries parsed so far,
					// suspect that there may be another XREF entry somewhere in the file, brute-force scan for it
					while (!$this->feof() && (max(array_keys($info['pdf']['xref']['offset'])) > $info['pdf']['xref']['count'])) {
						/*
						// starting at last known entry of main XREF table
						$this->fseek(max($info['pdf']['xref']['offset']));
						*/
						// starting at the beginning of the file
						$this->fseek(0);
						while (!$this->feof()) {
							$XREFoffset = $this->ftell();
							if (rtrim($this->fgets()) == 'xref') {
								// only parse tables that have not already been recorded by parseXREF()
								if (empty($info['pdf']['xref']['xref_offsets']) || !in_array($XREFoffset, $info['pdf']['xref']['xref_offsets'])) {
									$this->parseXREF($XREFoffset);
									break;
								}
							}
						}
					}

					foreach ($info['pdf']['xref']['offset'] as $objectNumber => $offset) {
						if ($info['pdf']['xref']['entry'][$objectNumber] == 'f') {
							// "free" object means "deleted", ignore
							continue;
						}
						$this->fseek($offset);
						$line = rtrim($this->fgets());
						if (preg_match('#^'.$objectNumber.' ([0-9]+) obj#', $line, $matches)) {
							if (strlen($line) > strlen($matches[0])) {
								// object header line not actually on its own line, rewind file pointer to start reading data
								$this->fseek($offset + strlen($matches[0]));
							}

							// collect the object body up to (not including) the "endobj" line.
							// Stop at end-of-file as well: a truncated or malformed PDF may never
							// contain "endobj", and an unbounded loop here would never terminate.
							$objectData = '';
							while (!$this->feof()) {
								$line = $this->fgets();
								if (rtrim($line) == 'endobj') {
									break;
								}
								$objectData .= $line;
							}

							// look for a dictionary made up solely of page-tree keys (/Type /Pages /Parent /Count /Kids)
							if (preg_match('#^<<[\r\n\s]*(/Type|/Pages|/Parent [0-9]+ [0-9]+ [A-Z]|/Count [0-9]+|/Kids *\\[[0-9A-Z ]+\\]|[\r\n\s])+[\r\n\s]*>>#', $objectData, $matches)) {
								if (preg_match('#/Count ([0-9]+)#', $objectData, $matches)) {
									$info['pdf']['pages'] = (int) $matches[1];
									// for now this is the only data we're looking for in the PDF, no need to loop through
									// every object in the file (and a large PDF may contain MANY objects).
									// And it MAY be possible that there are other objects elsewhere in the file
									// that define additional (or removed?) pages
									break;
								}
							}
						} else {
							$this->error('Unexpected structure "'.$line.'" at offset '.$offset);
							break;
						}
					}
					if (!$this->returnXREF) {
						unset($info['pdf']['xref']['offset'], $info['pdf']['xref']['generation'], $info['pdf']['xref']['entry']);
					}

				} else {
					$this->error('Did not find "xref" at offset '.$info['pdf']['trailer']['startxref']);
				}
			} else {
				$this->error('Did not find "startxref" in the last 40 bytes of the PDF');
			}

			$this->warning('PDF parsing incomplete in this version of getID3() ['.$this->getid3->version().']');
			return true;
		}
		$this->error('Did not find "%PDF" at the beginning of the PDF');
		return false;

	}

	/**
	 * Parse one XREF table starting at the given byte offset and merge its
	 * entries into $this->getid3->info['pdf']['xref'].
	 *
	 * Reads the "xref" keyword line, one "<first object number> <entry count>"
	 * subsection header, and then that many "<offset> <generation> <n|f>" entries.
	 * Only that single subsection is read by this method.
	 *
	 * @param int $XREFoffset byte offset at which the "xref" keyword is expected
	 *
	 * @return bool true if the table header and all its entries were parsed, false otherwise
	 */
	private function parseXREF($XREFoffset) {
		$info = &$this->getid3->info;

		$this->fseek($XREFoffset);
		if (rtrim($this->fgets()) == 'xref') {

			// record the offset before any further validation so that the brute-force scan
			// in Analyze() never revisits this table, even if parsing below fails
			$info['pdf']['xref']['xref_offsets'][$XREFoffset] = $XREFoffset;

			// validate the subsection header rather than blindly splitting it: a malformed line
			// would otherwise trigger undefined-offset notices or non-numeric arithmetic errors
			$subsectionHeader = rtrim($this->fgets());
			if (!preg_match('#^([0-9]+) +([0-9]+)#', $subsectionHeader, $matches)) {
				$this->error('failed to parse XREF subsection header "'.$subsectionHeader.'" in XREF table at offset '.$XREFoffset);
				return false;
			}
			$firstObjectNumber = (int) $matches[1];
			$XREFcount         = (int) $matches[2];

			// 'count' accumulates across every XREF table parsed for this file
			$info['pdf']['xref']['count'] = $XREFcount + (!empty($info['pdf']['xref']['count']) ? $info['pdf']['xref']['count'] : 0);
			for ($i = 0; $i < $XREFcount; $i++) {
				$line = rtrim($this->fgets());
				if (preg_match('#^([0-9]+) ([0-9]+) ([nf])$#', $line, $matches)) {
					$info['pdf']['xref']['offset'][($firstObjectNumber + $i)]     = (int) $matches[1];
					$info['pdf']['xref']['generation'][($firstObjectNumber + $i)] = (int) $matches[2];
					$info['pdf']['xref']['entry'][($firstObjectNumber + $i)]      =       $matches[3];
				} else {
					$this->error('failed to parse XREF entry #'.$i.' in XREF table at offset '.$XREFoffset);
					return false;
				}
			}
			sort($info['pdf']['xref']['xref_offsets']);
			return true;

		}
		$this->warning('failed to find expected XREF structure at offset '.$XREFoffset);
		return false;
	}

}