<?php

/**
 * DokuWiki Plugin elasticsearch (DocParser Helper Component)
 *
 * @license GPL 2 http://www.gnu.org/licenses/gpl-2.0.html
 * @author  Andreas Gohr <gohr@cosmocode.de>
 */

use dokuwiki\Extension\Plugin;
use LanguageDetection\Language;
use dokuwiki\HTTP\DokuHTTPClient;

require_once __DIR__ . '/../vendor/autoload.php';

/**
 * Convert a file to text and metainfos
 *
 * The parser to use is looked up by file extension in the configuration file
 * named by CONFFILE. The parser output is normalized into a flat array that
 * can be indexed.
 */
class helper_plugin_elasticsearch_docparser extends Plugin
{
    public const CONFFILE = DOKU_CONF . 'elasticsearch.conf';

    /**
     * @var array maps extensions to parsers. A parser may be a local cli tool (file is passed as argument)
     * or an URL accepting input by PUT (like Apache Tika). They need to return plain text or a JSON response.
     */
    protected $parsers;

    /**
     * Maps fields returned by Tika or other JSON returning parsers to our own field names.
     * Order does matter. Last non-empty field wins.
     */
    protected const FIELDMAP = [
        'title' => 'title',
        'dc:title' => 'title',
        'content' => 'content',
        'body' => 'content',
        'dc:description' => 'content',
        'X-TIKA:content' => 'content',
        'Creation-Date' => 'created',
        'dcterms:created' => 'created',
        'meta:creation-date' => 'created',
    ];

    /**
     * Load the parser setup
     *
     * @throws RuntimeException when the configuration file yields no parsers
     */
    public function __construct()
    {
        $parsers = confToHash(self::CONFFILE);

        if (empty($parsers)) {
            throw new RuntimeException(
                'Cannot process media, the parser configuration in ' . self::CONFFILE . ' is missing.'
            );
        }

        $this->parsers = $parsers;
    }

    /**
     * Parse the given file
     *
     * Returns an array with the following keys
     *
     * title - will be filled with the basename if no title could be extracted
     * content - the content to index
     * mime - the mime type as determined by us
     * ext - the extension of the file
     * language - the language code the file is written in
     * created - creation time, defaults to the file's modification time
     *
     * @param string $file
     * @return array
     * @throws RuntimeException if the file does not exist, has no known extension, has no
     *                          configured parser, or the parser gave no response
     * @fixme throw smarter exceptions
     */
    public function parse($file)
    {
        if (!file_exists($file)) {
            throw new RuntimeException('File ' . $file . ' does not exist');
        }
        [$ext, $mime] = mimetype($file);
        if (!$ext) {
            throw new RuntimeException('Cannot parse file with unidentified extension');
        }
        if (!isset($this->parsers[$ext])) {
            throw new RuntimeException('No parser configured for files of type ' . $ext);
        }

        $result = $this->runParser($file, $this->parsers[$ext]);
        if ($result === false) {
            throw new RuntimeException('No response from parser');
        }

        // defaults
        // NOTE(review): date() formats in the default timezone although the literal 'Z'
        // suffix denotes UTC - confirm whether gmdate() is intended here.
        $data = [
            'title' => basename($file),
            'content' => '',
            'mime' => $mime,
            'ext' => $ext,
            'language' => '',
            'created' => date('Y-m-d\TH:i:s\Z', filemtime($file)),
        ];

        // add what we got from the parser, parser fields override the defaults
        $data = array_merge($data, $this->processParserResult($result));

        // add language info
        $data['language'] = $this->detectLanguage($data['content']);

        return $data;
    }

    /**
     * Execute the parser on the given file
     *
     * The parser can be an URL accepting a PUT request or a local command.
     * For a local command, the first space separated token has to be an
     * executable and the placeholder %in% is replaced by the shell escaped file name.
     *
     * @param string $file
     * @param string $parser
     * @return bool|string the parser output, false when the parser failed or is not usable
     */
    protected function runParser($file, $parser)
    {
        if (preg_match('/^https?:\/\//', $parser)) {
            // remote parser: send the raw file contents via PUT
            $http = new DokuHTTPClient();
            $http->timeout = 90;
            $ok = $http->sendRequest($parser, io_readFile($file, false), 'PUT');
            if ($ok) {
                return $http->resp_body;
            }
            return false;
        } elseif (is_executable(strtok($parser, ' '))) {
            // local parser: run the command, a non-zero exit code counts as failure
            $output = [];
            $ok = 0;
            exec(str_replace('%in%', escapeshellarg($file), $parser), $output, $ok);
            if ($ok === 0) {
                return implode(' ', $output);
            }
            return false;
        }

        return false;
    }

    /**
     * Normalize the raw parser output
     *
     * Output that does not start with [ or { or that can not be decoded as JSON is
     * returned as plain text content. JSON output is mapped through FIELDMAP.
     *
     * @param string $result The string returned by the parser, might be json
     * @return array
     */
    protected function processParserResult($result)
    {
        // an empty response has no first character to inspect, it can only be plain text
        $looksLikeJson = $result !== '' && ($result[0] === '[' || $result[0] === '{');
        $decoded = $looksLikeJson ? json_decode($result, true) : null;

        if (!is_array($decoded)) {
            return [
                'content' => $result,
            ];
        }

        // we only want the first result from an Apache Tika response
        if (isset($decoded[0]) && is_array($decoded[0])) {
            $decoded = $decoded[0];
        }

        $data = [];
        foreach (self::FIELDMAP as $from => $to) {
            $value = $decoded[$from] ?? null;
            // trim() only accepts scalars, skip missing fields and nested structures
            if (!is_scalar($value)) continue;
            if (blank($value)) continue;
            $data[$to] = trim($value);
        }
        return $data;
    }

    /**
     * Return the language the given body was written in
     *
     * Will always return the wiki default language unless the translation plugin is installed.
     * The wiki default language is also returned when none of the languages configured in
     * the translation plugin could be detected.
     *
     * @param string $body
     * @return string The detected language
     * @fixme handle languages like 'pt-br' correctly
     * @fixme maybe make this optional in favor of the namespace method
     */
    protected function detectLanguage($body)
    {
        global $conf;

        /** @var helper_plugin_translation $trans */
        $trans = plugin_load('helper', 'translation');
        if ($trans === null) return $conf['lang'];

        $ld = new Language();

        $langs = array_keys($ld->detect($body)->whitelist(...$trans->translations)->close());

        // array_shift() yields null on an empty list, fall back to the wiki default then
        return array_shift($langs) ?? $conf['lang'];
    }
}