1<?php
2
3use dokuwiki\Extension\Event;
4
5/**
6 * DokuWiki Plugin elasticsearch (Action Component)
7 *
8 * @license GPL 2 http://www.gnu.org/licenses/gpl-2.0.html
9 * @author  Kieback&Peter IT <it-support@kieback-peter.de>
10 * @author  Andreas Gohr <gohr@cosmocode.de>
11 */
12
class action_plugin_elasticsearch_indexing extends DokuWiki_Action_Plugin {

    /** MIME type stored in the index for wiki pages */
    const MIME_DOKUWIKI = 'text/dokuwiki';
    /** Document type of wiki pages; also used as prefix of the document id */
    const DOCTYPE_PAGE = 'page';
    /** Document type of media files; also used as prefix of the document id */
    const DOCTYPE_MEDIA = 'media';

    /**
     * Registers the event callbacks that keep the index in sync
     *
     * @param Doku_Event_Handler $controller DokuWiki's event controller object
     * @return void
     */
    public function register(Doku_Event_Handler $controller) {
        $controller->register_hook('TPL_CONTENT_DISPLAY', 'BEFORE', $this, 'handle_tpl_content_display');
        $controller->register_hook('IO_WIKIPAGE_WRITE', 'BEFORE', $this, 'handle_delete');
        $controller->register_hook('MEDIA_UPLOAD_FINISH', 'AFTER', $this, 'handle_media_upload');
        $controller->register_hook('MEDIA_DELETE_FILE', 'AFTER', $this, 'handle_media_delete');
    }

    /**
     * Add the currently displayed page to the index if it is out of date
     *
     * @param Doku_Event $event event object by reference
     * @return void
     */
    public function handle_tpl_content_display(Doku_Event $event) {
        global $ID, $INFO;

        // Evaluate only once: needs_indexing() can itself remove a stale index
        // entry and its state file, so a second call would repeat that work.
        $needsIndexing = $this->needs_indexing($ID);

        $this->log(
            'content display',
            [
                metaFN($ID, '.elasticsearch_indexed'),
                wikiFN($ID),
                wikiFN($INFO['id']),
                $needsIndexing ? 'needs indexing' : 'no indexing needed',
            ]
        );

        if ($needsIndexing) {
            $this->index_page($ID);
        }
    }

    /**
     * Update index on media upload
     *
     * The third element of the event data is passed on as the media id.
     *
     * @param Doku_Event $event
     * @throws Exception
     */
    public function handle_media_upload(Doku_Event $event)
    {
        $this->index_file($event->data[2]);
    }

    /**
     * Remove pages from index
     *
     * @param Doku_Event $event event object by reference
     * @return void
     */
    public function handle_delete(Doku_Event $event) {
        if ($event->data[3]) return; // is old revision stuff
        if (!empty($event->data[0][1])) return; // page still exists

        // IO_WIKIPAGE_WRITE hands over the namespace (data[1], false for the
        // root namespace) and the bare page name (data[2]) separately, so the
        // full page id has to be reassembled before touching the index.
        $ns   = (string) $event->data[1];
        $name = (string) $event->data[2];
        $id   = ($ns === '') ? $name : $ns . ':' . $name;

        // still here? delete from index
        $this->delete_entry($id, self::DOCTYPE_PAGE);
    }

    /**
     * Remove deleted media from index
     *
     * Only acts when the event data flags the file as actually unlinked ('unl').
     *
     * @param Doku_Event $event
     * @param $param
     */
    public function handle_media_delete(Doku_Event $event, $param)
    {
        if ($event->data['unl']) {
            $this->delete_entry($event->data['id'], self::DOCTYPE_MEDIA);
        }
    }

    /**
     * Check if the page $id has changed since the last indexing.
     *
     * Pages without a data file and hidden pages are never indexed; if such a
     * page still has a state file, its index entry is removed as a side effect.
     *
     * @param string $id
     * @return boolean
     */
    protected function needs_indexing($id) {
        $indexStateFile = metaFN($id, '.elasticsearch_indexed');
        $refreshStateFile = metaFN($id, '.elasticsearch_refresh');
        $dataFile = wikiFN($id);

        // no data file or page is hidden ('hidepages' configuration option) -> no indexing
        if (!file_exists($dataFile) || isHiddenPage($id)) {
            // page should not be indexed but has a state file, try to remove from index
            if (file_exists($indexStateFile)) {
                $this->delete_entry($id, self::DOCTYPE_PAGE);
            }
            return false;
        }

        // force indexing if we're called via cli (e.g. cron)
        if (php_sapi_name() == 'cli') {
            return true;
        }

        // never indexed before
        if (!file_exists($indexStateFile)) {
            return true;
        }

        // check if latest indexing attempt is done after page update
        // and after other updates related to the page made by plugins
        $indexedAt = filemtime($indexStateFile);
        $upToDate = ($indexedAt > filemtime($dataFile)) &&
            (!file_exists($refreshStateFile) || $indexedAt > filemtime($refreshStateFile));

        return !$upToDate;
    }

    /**
     * Write a prepared document to the index and record the indexed state
     *
     * The document is updated in place if it exists and added otherwise.
     * Response errors other than "not found" are rethrown; any other failure
     * is reported via msg() and the state file is left untouched.
     *
     * @param array $data document fields; 'doctype' and 'uri' are required
     */
    protected function write_index($data)
    {
        /** @var helper_plugin_elasticsearch_client $hlp */
        $hlp = plugin_load('helper', 'elasticsearch_client');

        $indexName    = $this->getConf('indexname');
        $client       = $hlp->connect();
        $index        = $client->getIndex($indexName);
        $documentId   = $data['doctype'] . '_' . $data['uri'];

        // check if the document still exists to update it or add it as a new one
        try {
            $client->updateDocument($documentId, ['doc' => $data], $index->getName());
        } catch (\Elastica\Exception\NotFoundException $e) {
            $document = new \Elastica\Document($documentId, $data);
            $index->addDocument($document);
        } catch (\Elastica\Exception\ResponseException $e) {
            if ($e->getResponse()->getStatus() == 404) {
                $document = new \Elastica\Document($documentId, $data);
                $index->addDocument($document);
            } else {
                throw $e;
            }
        } catch (Exception $e) {
            msg(
                'Something went wrong on indexing please try again later or ask an admin for help.<br /><pre>' .
                hsc(get_class($e) . ' ' . $e->getMessage()) . '</pre>',
                -1
            );
            return;
        }
        $index->refresh();

        // pass the doctype on so media state files end up next to the media
        // meta data, which is where delete_entry() looks for them
        $this->update_indexstate($data['uri'], $data['doctype']);
    }

    /**
     * Path of the file that marks a page or media file as indexed
     *
     * @param string $id
     * @param string $doctype one of the DOCTYPE_* constants
     * @return string
     */
    protected function index_state_file($id, $doctype = self::DOCTYPE_PAGE) {
        return ($doctype === self::DOCTYPE_MEDIA) ?
            mediaMetaFN($id, '.elasticsearch_indexed') :
            metaFN($id, '.elasticsearch_indexed');
    }

    /**
     * Save indexed state for a page or a media file
     *
     * @param string $id
     * @param string $doctype
     * @return bool
     */
    protected function update_indexstate($id, $doctype = self::DOCTYPE_PAGE) {
        return io_saveFile($this->index_state_file($id, $doctype), '');
    }

    /**
     * Remove the given document from the index
     *
     * Failures while talking to the index are logged and otherwise ignored;
     * the local state file is removed in either case.
     *
     * @param $id
     * @param $doctype
     */
    public function delete_entry($id, $doctype) {
        /** @var helper_plugin_elasticsearch_client $hlp */
        $hlp          = plugin_load('helper', 'elasticsearch_client');
        $indexName    = $this->getConf('indexname');
        $client       = $hlp->connect();
        $index        = $client->getIndex($indexName);
        $documentId   = $doctype . '_' . $id;

        try {
            $index->deleteById($documentId);
            $index->refresh();
            $this->log($documentId.' deleted ');
        } catch(Exception $e) {
            // we ignore this
            $this->log($documentId.' not deleted '.$e->getMessage());
        }

        // delete state file (best effort, it may not exist)
        @unlink($this->index_state_file($id, $doctype));
    }

    /**
     * Index a page
     *
     * Collects metadata, raw syntax, rendered text, language, namespace, ACL
     * information and plugin supplied fields and writes them to the index.
     *
     * @param $id
     * @return void
     */
    public function index_page($id) {
        global $conf;

        $this->log('Indexing page ' . $id);

        // collect the data which should be indexed
        $meta = p_get_metadata($id, '', METADATA_RENDER_UNLIMITED);

        $data             = [];
        $data['uri']      = $id;
        // NOTE(review): date() formats in the server's timezone while the
        // literal 'Z' suffix denotes UTC - confirm whether gmdate() is intended
        $data['created']  = date('Y-m-d\TH:i:s\Z', $meta['date']['created']);
        $data['modified'] = date('Y-m-d\TH:i:s\Z', $meta['date']['modified']);
        // these keys may be absent from the metadata; null keeps the indexed
        // value as before without raising undefined index warnings
        $data['user']     = $meta['user'] ?? null;
        $data['title']    = $meta['title'] ?? $id;
        $data['abstract'] = $meta['description']['abstract'] ?? null;
        $data['syntax']   = rawWiki($id);
        $data['mime']     = self::MIME_DOKUWIKI;
        $data['doctype']  = self::DOCTYPE_PAGE;

        // prefer rendered plaintext over raw syntax output
        /** @var \renderer_plugin_text $textRenderer */
        $textRenderer = plugin_load('renderer', 'text');
        if ($textRenderer) {
            $data['content'] = p_cached_output(wikiFN($id),'text');
        } else {
            $data['content']  = $data['syntax'];
        }

        /** @var helper_plugin_translation $trans */
        $trans = plugin_load('helper', 'translation');
        if($trans) {
            // translation plugin available
            $lc               = $trans->getLangPart($id);
            $data['language'] = $trans->realLC($lc);
        } else {
            // no translation plugin
            $data['language'] = $conf['lang'];
        }

        // pages in the root namespace get no namespace field at all
        $data['namespace'] = getNS($id);
        if(trim($data['namespace']) == '') {
            unset($data['namespace']);
        }

        /** @var helper_plugin_elasticsearch_acl $hlpAcl */
        $hlpAcl = plugin_load('helper', 'elasticsearch_acl');

        $fullACL = $hlpAcl->getPageACL($id);
        $queryACL = $hlpAcl->splitRules($fullACL);
        $data = array_merge($data, $queryACL);

        // let plugins add their own data to index
        $pluginData = $this->getPluginData($data['uri']);
        $data = array_merge($data, $pluginData);

        $this->write_index($data);
    }

    /**
     * Index a file
     *
     * Files for which the document parser throws a RuntimeException are
     * skipped with a log entry.
     *
     * @param string $fileId
     * @return void
     * @throws Exception
     */
    public function index_file($fileId) {
        $this->log('Indexing file ' . $fileId);

        $docparser = new \helper_plugin_elasticsearch_docparser();

        $file = mediaFN($fileId);

        try {
            $data = $docparser->parse($file);
            $data['uri'] = $fileId;
            $data['doctype'] = self::DOCTYPE_MEDIA;
            $data['modified'] = date('Y-m-d\TH:i:s\Z', filemtime($file));

            // files in the root namespace get no namespace field at all
            $data['namespace'] = getNS($fileId);
            if (trim($data['namespace']) == '') {
                unset($data['namespace']);
            }

            /** @var helper_plugin_elasticsearch_acl $hlpAcl */
            $hlpAcl = plugin_load('helper', 'elasticsearch_acl');

            $fullACL = $hlpAcl->getPageACL($fileId);
            $queryACL = $hlpAcl->splitRules($fullACL);
            $data = array_merge($data, $queryACL);

            $this->write_index($data);
        } catch (RuntimeException $e) {
            $this->log('Skipping ' . $fileId . ': ' . $e->getMessage());
        }
    }


    /**
     * Get plugin data to feed into the index.
     * If data does not match previously defined mappings, it will be ignored.
     *
     * Triggers PLUGIN_ELASTICSEARCH_INDEXPAGE with an array that initially
     * holds only the page id under 'uri'; handlers may add further fields.
     *
     * @param $id
     * @return array
     */
    protected function getPluginData($id): array
    {
        $pluginData = ['uri' => $id];
        Event::createAndTrigger('PLUGIN_ELASTICSEARCH_INDEXPAGE', $pluginData);
        return $pluginData;
    }

    /**
     * Log something to the debug log
     *
     * @param string $txt message, automatically prefixed with 'ElasticSearch: '
     * @param mixed $info optional additional details
     */
    protected function log($txt, $info=null) {
        $txt = 'ElasticSearch: '.$txt;
        \dokuwiki\Logger::debug($txt, $info);
    }
}
335