1<?php
2
// This file is only meant to be loaded by DokuWiki itself; abort on direct access.
defined('DOKU_INC') or die();

// Filesystem locations used by the subject index: the index data lives in
// DokuWiki's configured index directory, everything else below the plugin folder.
global $conf;
define('SUBJ_IDX_DIR', "{$conf['indexdir']}/");
define('SUBJ_IDX_FILE', SUBJ_IDX_DIR . 'subjectindex.idx');
define('SUBJ_IDX_PLUGINS', DOKU_PLUGIN . 'subjectindex/plugins/');
define('SUBJ_IDX_DEFAULT_TARGETS', DOKU_PLUGIN . 'subjectindex/conf/default_targets');
define('SUBJ_IDX_LAST_CLEANUP', DOKU_PLUGIN . 'subjectindex/conf/last_cleanup');
define('SUBJ_IDX_HONOUR_COLS', 10);
13
14
15
/**
 * Class SI_Index
 * Contains and manages the basic index data structure, stored as two parallel
 * arrays that always share the same keys:
 * $paths : key => section/path/to/entry
 * $pids  : key => page id where this entry is found
 *
 * The class is iterable (over $paths); note that current() and next() return
 * an array(path, pid) pair rather than a single value.
 */
class SI_Index implements Iterator {
    public $paths = array();
    public $pids = array();


    /**
     * @param array|null $paths  key => "section/path/to/entry" (null => empty index)
     * @param array|null $pids   key => page id, using the same keys as $paths
     */
    function __construct(array $paths = null, array $pids = null) {
        if ($paths === null) {
            // no paths means an empty index, whatever was passed as $pids
            $paths = array();
            $pids  = array();
        } elseif ($pids === null) {
            // never leave $pids as null: the array functions used below need an array
            $pids = array();
        }
        $this->paths = $paths;
        $this->pids  = $pids;
    }


    /**
     * Appends one entry; the path and its page id are stored under the same new key.
     */
    function add($path, $pid) {
        $this->paths[] = $path;
        $this->pids[]  = $pid;
    }


    /**
     * Removes the entry stored under $key from both arrays.
     */
    function remove($key) {
        unset($this->paths[$key], $this->pids[$key]);
    }


    function is_empty() {
        return empty($this->paths);
    }


    /**
     * Sorts the paths (root paths before their leaves) while keeping the keys,
     * so every path still maps to its page id in $pids.
     */
    function sort() {
        uasort($this->paths, array($this, '_pathcmp'));
    }


    // The attribute below is parsed as a plain "#" comment by PHP < 8; on PHP 8.1+
    // it silences the return type deprecation for the Iterator methods, whose
    // historic return values (see current()/next()/rewind()) are kept for callers.
    #[\ReturnTypeWillChange]
    function rewind() {
        reset($this->paths);
        return $this;
    }

    /**
     * Returns the entry under the internal pointer.
     *
     * @param bool $get_section  true => keep the leading "section/" part of the path
     * @return array             array(path, pid), or array(null, null) when past the end
     */
    #[\ReturnTypeWillChange]
    function current($get_section = false) {
        if ( ! $this->valid()) {
            return array(null, null);
        }
        $key  = key($this->paths);
        $path = $this->paths[$key];
        if ( ! $get_section) {
            // strip the "section/" prefix; a path without any "/" has no entry part
            $parts = explode('/', $path, 2);
            $path  = isset($parts[1]) ? $parts[1] : null;
        }
        $pid = isset($this->pids[$key]) ? $this->pids[$key] : null;
        return array($path, $pid);
    }

    #[\ReturnTypeWillChange]
    function key() {
        return key($this->paths);
    }

    /**
     * Advances the internal pointer and returns the new current entry.
     *
     * @param bool $get_section  passed through to current()
     * @return array             array(path, pid), or array(null, null) when past the end
     */
    #[\ReturnTypeWillChange]
    function next($get_section = false) {
        next($this->paths);
        return $this->current($get_section);
    }

    #[\ReturnTypeWillChange]
    function valid() {
        // key() is null only when the pointer is beyond the last element
        return key($this->paths) !== null;
    }


    /**
     * Filter the index by section, regex (on path) and/or pid
     * and return a new Index instance. All given filters must match.
     *
     * @param null|int|string $section  numeric section; non-numeric values are ignored
     * @param null|string     $regex    PCRE pattern body (without delimiters) matched against the path
     * @param null|int|string $pid      page id, compared for exact equality
     * @return SI_Index
     */
    function filtered($section = null, $regex = null, $pid = null) {
        $fpaths = $this->paths;
        if ($section !== null && is_numeric($section)) {
            $fpaths = preg_grep('`^' . preg_quote($section, '`') . '/`', $fpaths);
        }
        if ($fpaths !== false && ! empty($regex)) {
            $fpaths = preg_grep('`' . $regex . '`', $fpaths);
        }
        if ($fpaths === false) {
            // preg_grep() failed (e.g. invalid pattern): treat as "nothing matched"
            $fpaths = array();
        }
        if ($pid !== null) {
            // Exact comparison on purpose: a regex/substring match would make page 1
            // also select the entries of pages 10, 11, 21, ...
            // Narrow $fpaths (not $this->paths) so earlier filters stay in effect.
            $wanted = (string) $pid;
            foreach (array_keys($fpaths) as $key) {
                if ( ! isset($this->pids[$key]) || (string) $this->pids[$key] !== $wanted) {
                    unset($fpaths[$key]);
                }
            }
        }
        $fpids = array_intersect_key($this->pids, $fpaths);
        return new SI_Index($fpaths, $fpids);
    }




    /**
     * String compare function: sorts index "paths" correctly
     * i.e. root paths come before leaves
     *
     * Only the text before the first "|" of each path is used for the
     * root/leaf test; otherwise the full strings are compared naturally,
     * ignoring case.
     */
    private function _pathcmp($a, $b) {
        $a_txt = strtok($a, '|');
        $b_txt = strtok($b, '|');
        if (strnatcasecmp($a_txt, $b_txt) != 0) {
            if (strpos($b_txt, $a_txt) === 0) {
                // $a is a prefix of $b => $a is the root, it goes first
                return -1;
            } elseif (strpos($a_txt, $b_txt) === 0) {
                return 1;
            }
        }
        return strnatcasecmp($a, $b);
    }
}
141
142
/**
 * Handles updating and cleaning the SI index
 *
 * All public methods return $this so calls can be chained,
 * e.g. cleanup(...)->update(...)->save().
 */
class SI_Indexer {

    private $index;

    /**
     * @param SI_Index|null $index  index to work on; when omitted it is loaded via SI_Utils::get_index()
     */
    function __construct(SI_Index $index = null) {
        $this->index = ($index === null) ? SI_Utils::get_index() : $index;
    }


    /**
     * Drops index entries whose page is no longer valid, at most once per day.
     *
     * @param array $all_pages  The Dokuwiki page index (pid => page name)
     * @return SI_Indexer
     */
    function cleanup(array $all_pages) {
        // first remove any entries that reference non-existent files (currently once a day!)
        if ($this->_cleanup_time()) {
            $this->_remove_invalid_entries($this->index, $all_pages);
        }
        return $this;
    }


    /**
     * Updates a list of matched entries in a page
     * Creates, deletes or updates as necessary
     *
     * @param integer $pid              The page id of the page being updated; needed to reference the correct page's entries
     * @param array   $matched_entries  List of all current entries in the page (could be existing or new);
     *                                  each item is read via its 'section' and 'entry' keys
     * @return SI_Indexer
     */
    function update($pid, array $matched_entries) {

        // grab all existing subject index entries for this page
        $page_index = $this->index->filtered(null, null, $pid);
        // entries still left in this list after the loop are no longer on the page
        $page_paths = $page_index->paths;
        $updated = false;
        foreach ($matched_entries as $match) {
            $matched_path = $match['section'] . '/' . $match['entry'];
            // compare the previous entries with the current entries (matched), does it exist already?
            $key = array_search($matched_path, $page_paths, true);
            if ($key !== false) {
                // EXISTS:
                unset($page_paths[$key]);
            } else {
                // CREATE:
                $this->index->add($matched_path, $pid);
                $updated = true;
            }
        }
        // DELETE: remove index entries that no longer exist on this page
        foreach (array_keys($page_paths) as $key) {
            $this->index->remove($key);
            $updated = true;
        }

        if ($updated) {
            $this->index->sort();
        }
        return $this;
    }


    /**
     * Persists the index via SI_Utils::save_index().
     *
     * @return SI_Indexer
     */
    function save() {
        SI_Utils::save_index($this->index);
        return $this;
    }


    /**
     * Removes any entries that point to non-existing or otherwise invalid pages.
     * Currently once a day.
     *
     * NOTE(review): SI_Utils::is_valid_page() also applies an ACL read check, so the
     * outcome may depend on the user whose request triggers the cleanup - confirm
     * this is intended.
     *
     * @param SI_Index $index   The SI entry index
     * @param $all_pages        The Dokuwiki page index
     */
    private function _remove_invalid_entries(SI_Index $index, $all_pages) {
        $checked = array();    // pid => bool, so every page is checked only once
        $stale_keys = array();
        foreach ($index->pids as $key => $pid) {
            if ( ! isset($checked[$pid])) {
                // a pid unknown to the page index is invalid by definition
                $checked[$pid] = isset($all_pages[$pid])
                    && SI_Utils::is_valid_page($all_pages[$pid]);
            }
            if ( ! $checked[$pid]) {
                $stale_keys[] = $key;
            }
        }
        foreach ($stale_keys as $key) {
            $index->remove($key);
        }
    }


    /**
     * Returns true if a full day has passed since last cleanup
     * (or if no usable timestamp is stored), and records the current
     * time as the new last-cleanup timestamp in that case.
     *
     * @return bool true => time to do clean up
     */
    private function _cleanup_time() {
        $last_cleanup = 0;
        if (is_file(SUBJ_IDX_LAST_CLEANUP)) {
            $stamp = file_get_contents(SUBJ_IDX_LAST_CLEANUP);
            if ($stamp !== false) {
                // an empty or garbled stamp casts to 0, i.e. "never cleaned up"
                $last_cleanup = (int) trim($stamp);
            }
        }
        $now = time();
        if ($last_cleanup === 0 || $now > $last_cleanup + 60 * 60 * 24) {
            file_put_contents(SUBJ_IDX_LAST_CLEANUP, $now);
            return true;
        }
        return false;
    }
}
253
254
255
/**
 * Static helpers: loading/saving the subject index, default target pages,
 * HTML id clean-up and page validity checks.
 */
class SI_Utils {

    /**
     * Get the subject index.
     * Simply stored as a serialized pair of arrays,
     * Tested up to 32,000 entries and still quicker than text files...
     *
     * A legacy 'subject.idx' text index, if present, is converted to the
     * current format first. A missing or unreadable index file yields an
     * empty index rather than false.
     *
     * @return SI_Index
     */
    static function get_index() {

        // first check for old index format (deprecated, was slower)
        $fn = SUBJ_IDX_DIR . 'subject.idx';
        if (file_exists($fn)) {
            list($paths, $pids) = self::import_old($fn);
            $index = new SI_Index($paths, $pids);
            // only drop the old file once the converted index is safely on disk
            if (self::save_index($index)) {
                unlink($fn);
            }
            return $index;
        }

        $index = null;
        if (file_exists(SUBJ_IDX_FILE)) {
            $data = file_get_contents(SUBJ_IDX_FILE);
            if ($data !== false) {
                // NOTE(review): unserialize() must only ever see files written by
                // save_index(); never point SUBJ_IDX_FILE at untrusted data.
                $index = unserialize($data);
            }
        }
        if ( ! ($index instanceof SI_Index)) {
            // missing or corrupt file: callers always get a usable index object
            $index = new SI_Index();
        }
        return $index;
    }


    /**
     * Reads an index in the old text format: one "path|pid" entry per line,
     * split at the first "|". Lines without a "|" are skipped.
     *
     * @param string $fn  path of the old index file
     * @return array      array($paths, $pids) as two parallel lists
     */
    private static function import_old($fn) {
        $path = array();
        $pid = array();
        $entries = file($fn, FILE_IGNORE_NEW_LINES);
        if ($entries === false) {
            return array($path, $pid);
        }
        foreach ($entries as $entry) {
            $delim = strpos($entry, '|');
            if ($delim === false) {
                continue;
            }
            $path[] = substr($entry, 0, $delim);
            $pid[] = substr($entry, $delim + 1);
        }
        return array($path, $pid);
    }


    /**
     * Writes the serialized index to SUBJ_IDX_FILE.
     *
     * @param SI_Index $index
     * @return bool true if the file was written
     */
    static function save_index(SI_Index $index) {
        return file_put_contents(SUBJ_IDX_FILE, serialize($index)) !== false;
    }

    /**
     * Gets target wiki page name based on a section number.
     *
     * @param int $section  index section number
     * @return string       page name | empty string ('') if missing
     */
    static function get_target_page($section = 0) {
        $pages = self::load_target_pages();
        return isset($pages[$section]) ? $pages[$section] : '';
    }


    /**
     * Adds/Updates default target wiki page for entry links in a given section.
     *
     * @param string $page     target page name
     * @param int    $section  index section number
     */
    static function set_target_page($page, $section = 0) {
        // starts from an empty list if the file is missing or unreadable
        $pages = self::load_target_pages();
        $pages[$section] = $page;
        file_put_contents(SUBJ_IDX_DEFAULT_TARGETS, serialize($pages));
    }


    /**
     * Loads the section => page map from SUBJ_IDX_DEFAULT_TARGETS.
     *
     * @return array  empty when the file is missing, unreadable or not a serialized array
     */
    private static function load_target_pages() {
        if ( ! is_file(SUBJ_IDX_DEFAULT_TARGETS)) {
            return array();
        }
        $data = file_get_contents(SUBJ_IDX_DEFAULT_TARGETS);
        $pages = ($data === false) ? false : unserialize($data);
        return is_array($pages) ? $pages : array();
    }


    /**
     * Removes invalid chars from any string to make it suitable for use as a HTML id attribute:
     * lower-cases the text, turns "/" into "-" and drops everything except
     * digits, letters, "-" and "_".
     *
     * @param string $text Any text string
     * @return string A string suitable for a HTML id attribute
     */
    static function valid_id($text) {
        $text = strtolower($text);
        $text = str_replace('/', '-', $text);
        // hyphen placed last so it is unambiguously a literal
        $text = preg_replace('/[^0-9a-zA-Z_-]/', '', $text);
        return $text;
    }


    /**
     * Does this page: exist, is it visible, and does user have rights to see it?
     *
     * @param string $id  wiki page id (surrounding whitespace is ignored)
     * @return bool
     */
    static function is_valid_page($id) {
        $id = trim($id);
        return (page_exists($id) && isVisiblePage($id) && auth_quickaclcheck($id) >= AUTH_READ);
    }
}