*/

use dokuwiki\Extension\Plugin;
use LanguageDetection\Language;
use dokuwiki\HTTP\DokuHTTPClient;

require_once __DIR__ . '/../vendor/autoload.php';

/**
 * Convert a file to text and metainfos
 *
 * The parser used for a file is looked up by file extension in the configuration file
 * referenced by CONFFILE. The parser output (plain text or JSON) is normalized into a
 * flat array of fields by parse().
 */
class helper_plugin_elasticsearch_docparser extends Plugin
{
    public const CONFFILE = DOKU_CONF . 'elasticsearch.conf';

    /**
     * @var array maps extensions to parsers. A parser may be a local cli tool (file is passed as argument)
     * or an URL accepting input by PUT (like Apache Tika). They need to return plain text or a JSON response.
     */
    protected $parsers;

    /**
     * Maps fields returned by Tika or other JSON returning parsers to our own field names.
     * Order does matter. Last non-empty field wins.
     */
    protected const FIELDMAP = [
        'title' => 'title',
        'dc:title' => 'title',
        'content' => 'content',
        'body' => 'content',
        'dc:description' => 'content',
        'X-TIKA:content' => 'content',
        'Creation-Date' => 'created',
        'dcterms:created' => 'created',
        'meta:creation-date' => 'created',
    ];

    /**
     * Load the parser setup
     *
     * @throws RuntimeException when the configuration file yields no parser definitions
     */
    public function __construct()
    {
        $parsers = confToHash(self::CONFFILE);

        if (empty($parsers)) {
            throw new RuntimeException(
                'Cannot process media, the parser configuration in ' . self::CONFFILE . ' is missing.'
            );
        }

        $this->parsers = $parsers;
    }

    /**
     * Parse the given file
     *
     * Returns an array with the following keys
     *
     * title - will be filled with the basename if no title could be extracted
     * content - the content to index
     * mime - the mime type as determined by us
     * ext - the extension of the file
     * language - the language code the file is written in
     * created - creation time
     *
     * @param string $file
     * @return array
     * @throws RuntimeException if the file does not exist, has no identifiable extension,
     *                          has no parser configured or the parser gave no response
     * @fixme throw smarter exceptions
     */
    public function parse($file)
    {
        if (!file_exists($file)) {
            throw new RuntimeException('File ' . $file . ' does not exist');
        }

        [$ext, $mime] = mimetype($file);
        if (!$ext) {
            throw new RuntimeException('Cannot parse file with unidentified extension');
        }
        if (!isset($this->parsers[$ext])) {
            throw new RuntimeException('No parser configured for files of type ' . $ext);
        }

        $result = $this->runParser($file, $this->parsers[$ext]);
        if ($result === false) {
            throw new RuntimeException('No response from parser');
        }

        // defaults, used for every field the parser did not provide
        $data = [
            'title' => basename($file),
            'content' => '',
            'mime' => $mime,
            'ext' => $ext,
            'language' => '',
            // NOTE(review): date() formats in the server's default timezone while the literal
            // "Z" suffix denotes UTC - confirm whether gmdate() is intended here
            'created' => date('Y-m-d\TH:i:s\Z', filemtime($file)),
        ];

        // add what we got from the parser
        $data = array_merge($data, $this->processParserResult($result));

        // add language info, based on the final content
        $data['language'] = $this->detectLanguage($data['content']);

        return $data;
    }

    /**
     * Execute the parser on the given file
     *
     * The parser can be an URL accepting a PUT request or a local command. For local commands
     * the placeholder %in% is replaced by the shell-escaped file name and the output lines are
     * joined by a single space.
     *
     * @param string $file
     * @param string $parser
     * @return bool|string the parser output, false if the parser failed or is neither URL nor executable
     */
    protected function runParser($file, $parser)
    {
        if (preg_match('/^https?:\/\//', $parser)) {
            $http = new DokuHTTPClient();
            $http->timeout = 90;
            $ok = $http->sendRequest($parser, io_readFile($file, false), 'PUT');
            if ($ok) {
                return $http->resp_body;
            }
            return false;
        }

        // only the first word of the configured command is the binary, the rest are arguments
        if (is_executable(strtok($parser, ' '))) {
            $output = [];
            $exitCode = 0;
            exec(str_replace('%in%', escapeshellarg($file), $parser), $output, $exitCode);
            if ($exitCode === 0) {
                return implode(' ', $output);
            }
            return false;
        }

        return false;
    }

    /**
     * Normalize the parser response into our own field names
     *
     * Responses that are not JSON arrays/objects are returned as-is in the content field.
     * JSON responses are mapped through FIELDMAP; fields that are missing or empty after
     * trimming are ignored so they never overwrite an earlier non-empty field.
     *
     * @param string $result The string returned by the parser, might be json
     * @return array
     */
    protected function processParserResult($result)
    {
        // an empty response can not be JSON; checking first avoids reading offset 0 of ''
        if ($result === '' || ($result[0] !== '[' && $result[0] !== '{')) {
            return ['content' => $result];
        }

        // decode json responses, fall back to plain text when decoding fails
        $decoded = json_decode($result, true);
        if (!is_array($decoded)) {
            return ['content' => $result];
        }

        // we only want the first result from an Apache Tika response
        if (isset($decoded[0]) && is_array($decoded[0])) {
            $decoded = $decoded[0];
        }

        $data = [];
        foreach (self::FIELDMAP as $from => $to) {
            $value = $decoded[$from] ?? null;

            // a field may hold a list of values (presumably multi-valued metadata);
            // use the first scalar entry instead of passing an array to trim()
            if (is_array($value)) {
                $scalars = array_filter($value, 'is_scalar');
                $value = $scalars ? reset($scalars) : null;
            }
            if (!is_scalar($value)) {
                continue;
            }

            $value = trim((string) $value);
            if ($value === '') {
                continue;
            }

            $data[$to] = $value;
        }

        return $data;
    }

    /**
     * Return the language the given body was written in
     *
     * Will always return the wiki default language unless the translation plugin is installed.
     * The wiki default language is also returned when no language could be detected among the
     * languages configured in the translation plugin.
     *
     * @param string $body
     * @return string The detected language
     * @fixme handle languages like 'pt-br' correctly
     * @fixme maybe make this optional in favor of the namespace method
     */
    protected function detectLanguage($body)
    {
        global $conf;

        /** @var helper_plugin_translation $trans */
        $trans = plugin_load('helper', 'translation');
        if ($trans === null) {
            return $conf['lang'];
        }

        $ld = new Language();
        $langs = array_keys($ld->detect($body)->whitelist(...$trans->translations)->close());

        // the first key is the best match; without any candidate use the wiki default
        return $langs[0] ?? $conf['lang'];
    }
}