1<?php
2/**
3 * DokuWiki Plugin elasticsearch (DocParser Helper Component)
4 *
5 * @license GPL 2 http://www.gnu.org/licenses/gpl-2.0.html
6 * @author  Andreas Gohr <gohr@cosmocode.de>
7 */
8
9require_once dirname(__FILE__) . '/../vendor/autoload.php';
10
11/**
12 * Convert a file to text and metainfos
13 */
class helper_plugin_elasticsearch_docparser extends DokuWiki_Plugin
{
    /** Location of the config file that maps file extensions to parsers */
    const CONFFILE = DOKU_CONF . 'elasticsearch.conf';

    /**
     * @var array maps extensions to parsers. A parser may be a local cli tool (the escaped file name is
     * substituted for the %in% placeholder of the command line) or an URL accepting input by PUT
     * (like Apache Tika). They need to return plain text or a JSON response.
     */
    protected $parsers;

    /**
     * Maps fields returned by Tika or other JSON returning parsers to our own field names.
     * Order does matter. Last non-empty field wins.
     */
    const FILEDMAP = [
        'title' => 'title',
        'dc:title' => 'title',
        'content' => 'content',
        'body' => 'content',
        'dc:description' => 'content',
        'X-TIKA:content' => 'content',
        'Creation-Date' => 'created',
        'dcterms:created' => 'created',
        'meta:creation-date' => 'created',
    ];

    /**
     * Load the parser setup
     *
     * @throws RuntimeException when the parser configuration is missing or empty
     */
    public function __construct()
    {
        $parsers = confToHash(self::CONFFILE);

        if (empty($parsers)) {
            throw new RuntimeException(
                'Cannot process media, the parser configuration in ' . self::CONFFILE . ' is missing.'
            );
        }

        $this->parsers = $parsers;
    }

    /**
     * Parse the given file
     *
     * Returns an array with the following keys
     *
     * title - will be filled with the basename if no title could be extracted
     * content - the content to index
     * mime - the mime type as determined by us
     * ext - the extension of the file
     * language - the language code the file is written in
     * created - creation time (defaults to the file's modification time in UTC)
     *
     * @param string $file
     * @return array
     * @throws RuntimeException if the file does not exist, has no known extension, has no
     *                          parser configured or the parser did not return a usable response
     * @fixme throw smarter exceptions
     */
    public function parse($file)
    {
        if (!file_exists($file)) {
            throw new RuntimeException('File ' . $file . ' does not exist');
        }
        list($ext, $mime) = mimetype($file);
        if (!$ext) {
            throw new RuntimeException('Cannot parse file with unidentified extension');
        }
        if (!isset($this->parsers[$ext])) {
            throw new RuntimeException('No parser configured for files of type ' . $ext);
        }

        $result = $this->runParser($file, $this->parsers[$ext]);
        if ($result === false) {
            throw new RuntimeException('No response from parser');
        }

        // defaults
        $data = [
            'title' => basename($file),
            'content' => '',
            'mime' => $mime,
            'ext' => $ext,
            'language' => '',
            // the literal Z suffix declares UTC, so the time has to be formatted in UTC, too
            'created' => gmdate('Y-m-d\TH:i:s\Z', filemtime($file)),
        ];

        // add what we got from the parser
        $data = array_merge($data, $this->processParserResult($result));

        // add language info
        $data['language'] = $this->detectLanguage($data['content']);

        return $data;
    }

    /**
     * Execute the parser on the given file
     *
     * The parser can be an URL accepting a PUT request or a local command. For local commands
     * the first word has to be an executable file, the %in% placeholder is replaced by the
     * shell escaped file name.
     *
     * @param string $file
     * @param string $parser
     * @return bool|string the parser output, false if the parser could not be run or failed
     */
    protected function runParser($file, $parser)
    {
        if (preg_match('/^https?:\/\//', $parser)) {
            $http = new DokuHTTPClient();
            $http->timeout = 90;
            $ok = $http->sendRequest($parser, io_readFile($file, false), 'PUT');
            // sendRequest() only tells us that the HTTP exchange itself worked. Error responses
            // (e.g. unsupported media type) must not be indexed as if they were file content
            if ($ok && $http->status >= 200 && $http->status < 300) {
                return $http->resp_body;
            }
            return false;
        }

        if (is_executable(strtok($parser, ' '))) {
            $output = [];
            $exitCode = 0;
            exec(str_replace('%in%', escapeshellarg($file), $parser), $output, $exitCode);
            if ($exitCode === 0) {
                // output lines are joined by a single space
                return join(' ', $output);
            }
            return false;
        }

        return false;
    }

    /**
     * Convert the raw parser response into our own field names
     *
     * Responses that do not look like JSON (or can not be decoded) are used as plain text
     * content. JSON responses are mapped through self::FILEDMAP.
     *
     * @param string $result The string returned by the parser, might be json
     * @return array
     */
    protected function processParserResult($result)
    {
        // decode json responses, an empty response must not be accessed by offset
        $decoded = null;
        if ($result !== '' && ($result[0] === '[' || $result[0] === '{')) {
            $decoded = json_decode($result, true);
        }
        if (!is_array($decoded)) {
            return [
                'content' => $result,
            ];
        }

        // we only want the first result from an Apache Tika response
        if (isset($decoded[0]) && is_array($decoded[0])) {
            $decoded = $decoded[0];
        }

        $data = [];
        foreach (self::FILEDMAP as $from => $to) {
            if (!isset($decoded[$from])) continue;
            $value = $decoded[$from];

            // multi-valued meta data is returned as list, we use the first entry only
            if (is_array($value)) {
                $value = reset($value);
            }
            if (!is_scalar($value)) continue;

            // whitespace-only values count as empty and must not replace an earlier match
            $value = trim((string)$value);
            if ($value === '') continue;

            $data[$to] = $value;
        }
        return $data;
    }

    /**
     * Return the language the given body was written in
     *
     * Will always return the wiki default language unless the translation plugin is installed.
     * The wiki default language is also returned when none of the configured translation
     * languages could be detected.
     *
     * @param string $body
     * @return string The detected language
     * @fixme handle languages like 'pt-br' correctly
     * @fixme maybe make this optional in favor of the namespace method
     */
    protected function detectLanguage($body)
    {
        global $conf;

        /** @var helper_plugin_translation $trans */
        $trans = plugin_load('helper', 'translation');
        if ($trans === null) return $conf['lang'];

        $ld = new \LanguageDetection\Language();

        // only languages configured in the translation plugin are considered
        $langs = array_keys($ld->detect($body)->whitelist(...$trans->translations)->close());
        $detected = array_shift($langs);

        // array_shift() gives null when nothing matched
        if ($detected === null) return $conf['lang'];
        return $detected;
    }
}
199