<?php
/**
 * DokuWiki Plugin elasticsearch (DocParser Helper Component)
 *
 * @license GPL 2 http://www.gnu.org/licenses/gpl-2.0.html
 * @author  Andreas Gohr <gohr@cosmocode.de>
 */

require_once dirname(__FILE__) . '/../vendor/autoload.php';

/**
 * Convert a file to text and metainfos
 *
 * The file is handed to an external parser (selected by file extension), the
 * parser's plain text or JSON answer is normalized into a flat array and the
 * language of the extracted content is detected.
 */
class helper_plugin_elasticsearch_docparser extends DokuWiki_Plugin
{
    /** Path of the configuration file that maps file extensions to parsers */
    const CONFFILE = DOKU_CONF . 'elasticsearch.conf';

    /**
     * @var array maps extensions to parsers. A parser may be a local cli tool (file is passed as argument)
     * or an URL accepting input by PUT (like Apache Tika). They need to return plain text or a JSON response.
     */
    protected $parsers;

    /**
     * Maps fields returned by Tika or other JSON returning parsers to our own field names.
     * Order does matter. Last non-empty field wins.
     *
     * The constant name (sic) is kept as is because it is publicly accessible.
     */
    const FILEDMAP = [
        'title' => 'title',
        'dc:title' => 'title',
        'content' => 'content',
        'body' => 'content',
        'dc:description' => 'content',
        'X-TIKA:content' => 'content',
        'Creation-Date' => 'created',
        'dcterms:created' => 'created',
        'meta:creation-date' => 'created',
    ];

    /**
     * Load the parser setup
     *
     * @throws RuntimeException when the configuration file is missing or defines no parsers
     */
    public function __construct()
    {
        $parsers = confToHash(self::CONFFILE);

        if (empty($parsers)) {
            throw new RuntimeException(
                'Cannot process media, the parser configuration in ' . self::CONFFILE . ' is missing.'
            );
        }

        $this->parsers = $parsers;
    }

    /**
     * Parse the given file
     *
     * Returns an array with the following keys
     *
     * title    - will be filled with the basename if no title could be extracted
     * content  - the content to index
     * mime     - the mime type as determined by us
     * ext      - the extension of the file
     * language - the language code the file is written in
     * created  - creation time
     *
     * A file that can not be parsed (and thus should not be indexed) is signalled
     * by an exception, never by a return value.
     *
     * @param string $file
     * @return array
     * @throws RuntimeException if the file is missing, has no known extension, has no
     *                          configured parser or the parser gave no response
     * @fixme throw smarter exceptions
     */
    public function parse($file)
    {
        if (!file_exists($file)) {
            throw new RuntimeException('File ' . $file . ' does not exist');
        }
        list($ext, $mime) = mimetype($file);
        if (!$ext) {
            throw new RuntimeException('Cannot parse file with unidentified extension');
        }
        if (!isset($this->parsers[$ext])) {
            throw new RuntimeException('No parser configured for files of type ' . $ext);
        }

        $result = $this->runParser($file, $this->parsers[$ext]);
        if ($result === false) {
            throw new RuntimeException('No response from parser');
        }

        // defaults, used for everything the parser did not provide
        $data = [
            'title' => basename($file),
            'content' => '',
            'mime' => $mime,
            'ext' => $ext,
            'language' => '',
            // NOTE(review): date() formats in the server's timezone while the literal 'Z'
            // suffix denotes UTC - confirm whether gmdate() is what is wanted here
            'created' => date('Y-m-d\TH:i:s\Z', filemtime($file)),
        ];

        // add what we got from the parser
        $data = array_merge($data, $this->processParserResult($result));

        // add language info
        $data['language'] = $this->detectLanguage($data['content']);

        return $data;
    }

    /**
     * Execute the parser on the given file
     *
     * The parser can be an URL accepting a PUT request or a local command. For a local
     * command every occurrence of %in% is replaced by the shell escaped file name and the
     * first word of the command has to be an executable file.
     *
     * @param string $file
     * @param string $parser
     * @return bool|string the parser's output, false if the parser failed or is not usable
     */
    protected function runParser($file, $parser)
    {
        // remote parser: send the raw file contents via PUT
        if (preg_match('/^https?:\/\//', $parser)) {
            $http = new DokuHTTPClient();
            $http->timeout = 90;
            $ok = $http->sendRequest($parser, io_readFile($file, false), 'PUT');
            if ($ok) {
                return $http->resp_body;
            }
            return false;
        }

        // local parser: only the first word of the command is checked for executability
        if (is_executable(strtok($parser, ' '))) {
            $output = [];
            $ok = 0;
            exec(str_replace('%in%', escapeshellarg($file), $parser), $output, $ok);
            // a non-zero exit code is treated as failure
            if ($ok === 0) {
                // output lines are joined by a single space, line breaks are not preserved
                return join(' ', $output);
            }
            return false;
        }

        return false;
    }

    /**
     * Normalize the parser's answer
     *
     * Anything that does not start with [ or { or that can not be decoded as JSON is
     * treated as plain text content. JSON answers are mapped through self::FILEDMAP.
     *
     * @param string $result The string returned by the parser, might be json
     * @return array
     */
    protected function processParserResult($result)
    {
        $result = (string) $result;

        // an empty response has no first character to inspect
        $first = ($result === '') ? '' : $result[0];

        // decode json responses, everything else is plain text
        if ($first !== '[' && $first !== '{') {
            return ['content' => $result];
        }
        $decoded = json_decode($result, true);
        if (!is_array($decoded)) {
            return ['content' => $result];
        }

        // we only want the first result from an Apache Tika response
        if (isset($decoded[0]) && is_array($decoded[0])) {
            $decoded = $decoded[0];
        }

        $data = [];
        foreach (self::FILEDMAP as $from => $to) {
            if (!isset($decoded[$from])) continue;
            $value = $decoded[$from];

            // multi-valued metadata arrives as an array, trim() can not handle that.
            // Flatten the scalar members into a single string instead.
            if (is_array($value)) {
                $value = join(' ', array_filter($value, 'is_scalar'));
            }

            if (blank($value)) continue;
            $data[$to] = trim((string) $value);
        }
        return $data;
    }

    /**
     * Return the language the given body was written in
     *
     * Will always return the wiki default language unless the translation plugin is installed.
     * The wiki default language is also returned when none of the languages configured in the
     * translation plugin could be detected (e.g. for an empty body).
     *
     * @param string $body
     * @return string The detected language
     * @fixme handle languages like 'pt-br' correctly
     * @fixme maybe make this optional in favor of the namespace method
     */
    protected function detectLanguage($body)
    {
        global $conf;

        /** @var helper_plugin_translation $trans */
        $trans = plugin_load('helper', 'translation');
        if ($trans === null) return $conf['lang'];

        $ld = new \LanguageDetection\Language();

        // candidates restricted to the languages configured in the translation plugin
        $langs = array_keys($ld->detect($body)->whitelist(...$trans->translations)->close());
        $lang = array_shift($langs);

        // array_shift() gives null when there was no candidate at all
        if ($lang === null) return $conf['lang'];
        return (string) $lang;
    }
}