1<?php
2
3/**
4 * DokuWiki Plugin elasticsearch (DocParser Helper Component)
5 *
6 * @license GPL 2 http://www.gnu.org/licenses/gpl-2.0.html
7 * @author  Andreas Gohr <gohr@cosmocode.de>
8 */
9
10use dokuwiki\Extension\Plugin;
11use LanguageDetection\Language;
12use dokuwiki\HTTP\DokuHTTPClient;
13
14require_once __DIR__ . '/../vendor/autoload.php';
15
/**
 * Convert a file to text and metainfos
 *
 * The actual text extraction is delegated to external parsers (local command line tools or
 * HTTP services) that are configured per file extension in CONFFILE.
 */
class helper_plugin_elasticsearch_docparser extends Plugin
{
    /** Path of the configuration file mapping file extensions to parsers */
    public const CONFFILE = DOKU_CONF . 'elasticsearch.conf';

    /**
     * @var array maps extensions to parsers. A parser may be a local cli tool (file is passed as argument)
     * or an URL accepting input by PUT (like Apache Tika). They need to return plain text or a JSON response.
     */
    protected $parsers;

    /**
     * Maps fields returned by Tika or other JSON returning parsers to our own field names.
     * Order does matter. Last non-empty field wins.
     */
    protected const FIELDMAP = [
        'title' => 'title',
        'dc:title' => 'title',
        'content' => 'content',
        'body' => 'content',
        'dc:description' => 'content',
        'X-TIKA:content' => 'content',
        'Creation-Date' => 'created',
        'dcterms:created' => 'created',
        'meta:creation-date' => 'created',
    ];

    /**
     * Load the parser setup
     *
     * @throws RuntimeException when the parser configuration is missing or empty
     */
    public function __construct()
    {
        $parsers = confToHash(self::CONFFILE);

        if (empty($parsers)) {
            throw new RuntimeException(
                'Cannot process media, the parser configuration in ' . self::CONFFILE . ' is missing.'
            );
        }

        $this->parsers = $parsers;
    }

    /**
     * Parse the given file
     *
     * Returns an array with the following keys
     *
     * title - will be filled with the basename if no title could be extracted
     * content - the content to index
     * mime - the mime type as determined by us
     * ext - the extension of the file
     * language - the language code the file is written in
     * created - creation time, defaults to the file's modification time
     *
     * @param string $file
     * @return array
     * @throws RuntimeException if the file can not be parsed and thus should not be indexed
     * @fixme throw smarter exceptions
     */
    public function parse($file)
    {
        if (!file_exists($file)) {
            throw new RuntimeException('File ' . $file . ' does not exist');
        }
        [$ext, $mime] = mimetype($file);
        if (!$ext) {
            throw new RuntimeException('Cannot parse file with unidentified extension');
        }
        if (!isset($this->parsers[$ext])) {
            throw new RuntimeException('No parser configured for files of type ' . $ext);
        }

        $result = $this->runParser($file, $this->parsers[$ext]);
        if ($result === false) {
            throw new RuntimeException('No response from parser');
        }

        // defaults
        // NOTE(review): date() formats in the default timezone while the literal "Z" suffix denotes
        // UTC; gmdate() may be intended here - confirm against how other timestamps are indexed.
        $data = [
            'title' => basename($file),
            'content' => '',
            'mime' => $mime,
            'ext' => $ext,
            'language' => '',
            'created' => date('Y-m-d\TH:i:s\Z', filemtime($file)),
        ];

        // add what we got from the parser, parser values override the defaults
        $data = array_merge($data, $this->processParserResult($result));

        // add language info
        $data['language'] = $this->detectLanguage($data['content']);

        return $data;
    }

    /**
     * Execute the parser on the given file
     *
     * The parser can be an URL accepting a PUT request or a local command. For local commands
     * the placeholder %in% is replaced by the shell escaped file name.
     *
     * @param string $file
     * @param string $parser
     * @return bool|string the parser's output, false if the parser failed or is not usable
     */
    protected function runParser($file, $parser)
    {
        // remote parser: send the raw file contents via PUT
        if (preg_match('/^https?:\/\//', $parser)) {
            $http = new DokuHTTPClient();
            $http->timeout = 90;
            if ($http->sendRequest($parser, io_readFile($file, false), 'PUT')) {
                return $http->resp_body;
            }
            return false;
        }

        // local parser: the first word of the configured command has to be an executable
        if (is_executable(strtok($parser, ' '))) {
            $output = [];
            $exitCode = 0;
            exec(str_replace('%in%', escapeshellarg($file), $parser), $output, $exitCode);
            if ($exitCode === 0) {
                return implode(' ', $output);
            }
            return false;
        }

        return false;
    }

    /**
     * Convert the raw parser response into our own field names
     *
     * Responses that are not a JSON array or object are used as plain text content. For JSON
     * responses the fields listed in FIELDMAP are extracted.
     *
     * @param string $result The string returned by the parser, might be json
     * @return array
     */
    protected function processParserResult($result)
    {
        $result = (string) $result;

        // decode json responses; an empty response must not be accessed by offset
        $decoded = null;
        if ($result !== '' && ($result[0] === '[' || $result[0] === '{')) {
            $decoded = json_decode($result, true);
        }
        if (!is_array($decoded)) {
            return [
                'content' => $result,
            ];
        }

        // we only want the first result from an Apache Tika response
        if (isset($decoded[0]) && is_array($decoded[0])) {
            $decoded = $decoded[0];
        }

        $data = [];
        foreach (self::FIELDMAP as $from => $to) {
            if (!isset($decoded[$from])) {
                continue;
            }

            $value = $decoded[$from];
            if (is_array($value)) {
                // multi-valued metadata is joined into a single string, nested structures are dropped
                $value = implode(' ', array_filter($value, 'is_scalar'));
            } elseif (!is_scalar($value)) {
                continue;
            }

            // whitespace-only values count as empty and must not overwrite earlier fields
            $value = trim((string) $value);
            if ($value !== '') {
                $data[$to] = $value;
            }
        }
        return $data;
    }

    /**
     * Return the language the given body was written in
     *
     * Will always return the wiki default language unless the translation plugin is installed.
     * The wiki default language is also returned when none of the languages configured in the
     * translation plugin could be detected.
     *
     * @param string $body
     * @return string The detected language
     * @fixme handle languages like 'pt-br' correctly
     * @fixme maybe make this optional in favor of the namespace method
     */
    protected function detectLanguage($body)
    {
        global $conf;

        /** @var helper_plugin_translation $trans */
        $trans = plugin_load('helper', 'translation');
        if ($trans === null) {
            return $conf['lang'];
        }

        $ld = new Language();

        // restrict the detection result to the languages the translation plugin knows about
        $langs = array_keys($ld->detect($body)->whitelist(...$trans->translations)->close());

        // array_shift() yields null for an empty list, fall back to the default language then
        return array_shift($langs) ?? $conf['lang'];
    }
}
204