<?php

// must be run within Dokuwiki
if ( ! defined('DOKU_INC')) die();

global $conf;
// Location of the serialized subject index inside Dokuwiki's index directory
define('SUBJ_IDX_DIR', $conf['indexdir'] . '/');
define('SUBJ_IDX_FILE', SUBJ_IDX_DIR . 'subjectindex.idx');
// Plugin-local resources
define('SUBJ_IDX_PLUGINS', DOKU_PLUGIN . 'subjectindex/plugins/');
define('SUBJ_IDX_DEFAULT_TARGETS', DOKU_PLUGIN . 'subjectindex/conf/default_targets');
define('SUBJ_IDX_LAST_CLEANUP', DOKU_PLUGIN . 'subjectindex/conf/last_cleanup');
define('SUBJ_IDX_HONOUR_COLS', 10);


/**
 * Class SI_Index
 * Contains and manages the basic index data structure, held as two
 * parallel arrays that share the same keys:
 *      $paths : key => section/path/to/entry
 *      $pids  : key => page id where this entry is found
 */
class SI_Index implements Iterator {
    public $paths = array();
    public $pids = array();


    /**
     * @param array|null $paths key => "section/path/to/entry"
     * @param array|null $pids  key => page id (same keys as $paths)
     */
    public function __construct(array $paths = null, array $pids = null) {
        // a missing array always becomes an empty one, so the properties are never null
        $this->paths = ($paths === null) ? array() : $paths;
        $this->pids = ($paths === null || $pids === null) ? array() : $pids;
    }


    /**
     * Appends a new entry to the index.
     *
     * @param string $path "section/path/to/entry"
     * @param mixed  $pid  id of the page holding the entry
     */
    public function add($path, $pid) {
        $this->paths[] = $path;
        $this->pids[] = $pid;
    }


    /**
     * Removes the entry stored under the given key from both arrays.
     *
     * @param int|string $key
     */
    public function remove($key) {
        unset($this->paths[$key], $this->pids[$key]);
    }


    /**
     * @return bool true if the index holds no entries
     */
    public function is_empty() {
        return empty($this->paths);
    }


    /**
     * Sorts the paths (keys are preserved, so $pids stays aligned).
     */
    public function sort() {
        uasort($this->paths, array($this, '_pathcmp'));
    }


    // ---- Iterator implementation -------------------------------------------
    // The "#[\ReturnTypeWillChange]" lines are attributes on PHP 8+ (silencing
    // the 8.1 return-type deprecation) and plain comments on older versions.

    #[\ReturnTypeWillChange]
    public function rewind() {
        reset($this->paths);
        return $this;
    }

    /**
     * Returns the entry at the internal pointer.
     *
     * @param bool $get_section false => strip the leading "section/" from the path
     * @return array array(path, pid), or array(null, null) when past the end
     */
    #[\ReturnTypeWillChange]
    public function current($get_section = false) {
        $key = key($this->paths);
        if ($key === null) {
            return array(null, null);
        }
        $path = $this->paths[$key];
        if ( ! $get_section) {
            // a path without any '/' has no entry part: yield null without raising a notice
            $parts = explode('/', $path, 2);
            $path = isset($parts[1]) ? $parts[1] : null;
        }
        $pid = isset($this->pids[$key]) ? $this->pids[$key] : null;
        return array($path, $pid);
    }

    #[\ReturnTypeWillChange]
    public function key() {
        return key($this->paths);
    }

    /**
     * Advances the pointer and returns the new current entry (see current()).
     *
     * @param bool $get_section passed through to current()
     * @return array
     */
    #[\ReturnTypeWillChange]
    public function next($get_section = false) {
        next($this->paths);
        return $this->current($get_section);
    }

    #[\ReturnTypeWillChange]
    public function valid() {
        // key() is null only when the pointer is beyond the last element
        return key($this->paths) !== null;
    }


    /**
     * Filter the index by section, regex (on path) and/or pid
     * and return a new Index instance. All supplied filters must match.
     *
     * @param int|string|null $section numeric section; ignored when null or non-numeric
     * @param string|null     $regex   regex body (no delimiters) applied to the full path
     * @param int|string|null $pid     page id; matched exactly
     * @return SI_Index
     */
    public function filtered($section = null, $regex = null, $pid = null) {
        $fpaths = $this->paths;
        if ($section !== null && is_numeric($section)) {
            $fpaths = preg_grep('`^' . preg_quote((string) $section, '`') . '/`', $fpaths);
        }
        if ( ! empty($regex) && is_array($fpaths)) {
            $fpaths = preg_grep('`' . $regex . '`', $fpaths);
        }
        if ( ! is_array($fpaths)) {
            // preg_grep failed (e.g. invalid caller-supplied regex): nothing matches
            $fpaths = array();
        }
        if ($pid !== null) {
            // Exact match only: an unanchored pattern would let pid 1 also select 10, 21, ...
            $pid_keys = preg_grep('/^' . preg_quote((string) $pid, '/') . '$/D', $this->pids);
            $fpaths = array_intersect_key($fpaths, is_array($pid_keys) ? $pid_keys : array());
        }
        $fpids = array_intersect_key($this->pids, $fpaths);
        return new SI_Index($fpaths, $fpids);
    }


    /**
     * String compare function: sorts index "paths" correctly
     * i.e. root paths come before leaves
     */
    private function _pathcmp($a, $b) {
        // only the text before the first '|' takes part in the root/leaf check
        $a_txt = strtok($a, '|');
        $b_txt = strtok($b, '|');
        if (strnatcasecmp($a_txt, $b_txt) != 0) {
            $a_sub = strpos($b_txt, $a_txt);
            $b_sub = strpos($a_txt, $b_txt);
            if ($a_sub !== false && $a_sub == 0) {
                // $a is a prefix of $b => $a is the root, sort it first
                return -1;
            } elseif ($b_sub !== false && $b_sub == 0) {
                return 1;
            }
        }
        return strnatcasecmp($a, $b);
    }
}


/**
 * Handles updating and cleaning the SI index
 */
class SI_Indexer {

    private $index;

    /**
     * @param SI_Index|null $index index to work on; loaded from disk when omitted
     */
    public function __construct(SI_Index $index = null) {
        $this->index = ($index === null) ? SI_Utils::get_index() : $index;
    }


    /**
     * Removes entries that reference non-existent pages (at most once a day).
     *
     * @param array $all_pages The Dokuwiki page index (pid => page name)
     * @return SI_Indexer
     */
    public function cleanup(array $all_pages) {
        if ($this->_cleanup_time()) {
            $this->_remove_invalid_entries($this->index, $all_pages);
        }
        return $this;
    }


    /**
     * Updates a list of matched entries in a page
     * Creates, deletes or updates as necessary
     *
     * @param integer $pid              The page id of the page being updated; needed to reference the correct page's entries
     * @param array   $matched_entries  List of all current entries in the page (could be existing or new);
     *                                  each item provides 'section' and 'entry'
     * @return SI_Indexer
     */
    public function update($pid, array $matched_entries) {

        // grab all existing subject index entries for this page
        $page_index = $this->index->filtered(null, null, $pid);
        $page_paths = $page_index->paths;
        $updated = false;
        foreach ($matched_entries as $match) {
            $matched_path = $match['section'] . '/' . $match['entry'];
            // compare the previous entries with the current entries (matched), does it exist already?
            $key = array_search($matched_path, $page_paths, true);
            if ($key !== false) {
                // EXISTS: tick it off, whatever is left afterwards is stale
                unset($page_paths[$key]);
            } else {
                // CREATE:
                $this->index->add($matched_path, $pid);
                $updated = true;
            }
        }
        // DELETE: remove index entries that no longer exist on this page
        foreach (array_keys($page_paths) as $key) {
            $this->index->remove($key);
            $updated = true;
        }

        if ($updated) {
            $this->index->sort();
        }
        return $this;
    }


    /**
     * Writes the index back to disk.
     *
     * @return SI_Indexer
     */
    public function save() {
        SI_Utils::save_index($this->index);
        return $this;
    }


    /**
     * Removes any entries that point to non-existing or otherwise invalid pages.
     * Currently once a day.
     *
     * NOTE(review): validity includes an ACL check for the current user
     * (see SI_Utils::is_valid_page), so the result depends on who triggers it.
     *
     * @param SI_Index $index       The SI entry index
     * @param array    $all_pages   The Dokuwiki page index
     */
    private function _remove_invalid_entries(SI_Index $index, $all_pages) {
        $is_valid = array();    // pid => bool, so every page is checked only once
        $stale_keys = array();
        foreach ($index->pids as $key => $pid) {
            if ( ! isset($is_valid[$pid])) {
                // a pid missing from the page index is invalid by definition
                $is_valid[$pid] = isset($all_pages[$pid]) && SI_Utils::is_valid_page($all_pages[$pid]);
            }
            if ( ! $is_valid[$pid]) {
                $stale_keys[] = $key;
            }
        }
        // remove only after the scan, so the array being iterated is not modified
        foreach ($stale_keys as $key) {
            $index->remove($key);
        }
    }


    /**
     * Returns true if a full day has passed since last cleanup
     * (and records the current time as the new "last cleanup").
     *
     * @return bool true => time to do clean up
     */
    private function _cleanup_time() {
        $last_cleanup = 0;
        if (is_file(SUBJ_IDX_LAST_CLEANUP)) {
            $stored = file_get_contents(SUBJ_IDX_LAST_CLEANUP);
            if ($stored !== false) {
                // cast: an empty or garbled file must count as "never", not break the arithmetic
                $last_cleanup = (int) trim($stored);
            }
        }
        $now = time();
        if ($last_cleanup === 0 || $now > $last_cleanup + 60 * 60 * 24) {
            file_put_contents(SUBJ_IDX_LAST_CLEANUP, $now);
            return true;
        }
        return false;
    }
}



class SI_Utils {

    /**
     * Get the subject index.
     * Simply stored as a serialized pair of arrays,
     * Tested up to 32,000 entries and still quicker than text files...
     *
     * @return SI_Index
     */
    public static function get_index() {

        // first check for old index format (deprecated, was slower)
        $fn = SUBJ_IDX_DIR . 'subject.idx';
        if (file_exists($fn)) {
            list($paths, $pids) = self::import_old($fn);
            $index = new SI_Index($paths, $pids);
            // only drop the old file once the converted index is safely on disk
            if (self::save_index($index)) {
                unlink($fn);
            }
            return $index;
        }

        $index = null;
        if (file_exists(SUBJ_IDX_FILE)) {
            $data = file_get_contents(SUBJ_IDX_FILE);
            if ($data !== false) {
                // NOTE(review): unserialize() must only ever see this plugin's own index file
                $index = unserialize($data);
            }
        }
        // missing, unreadable or corrupt file => start with an empty index
        if ( ! ($index instanceof SI_Index)) {
            $index = new SI_Index();
        }
        return $index;
    }


    /**
     * Reads the deprecated text index: one "path|pid" entry per line.
     *
     * @param string $fn file name of the old index
     * @return array array(paths, pids)
     */
    private static function import_old($fn) {
        $paths = array();
        $pids = array();
        $entries = file($fn, FILE_IGNORE_NEW_LINES);
        if ($entries === false) {
            return array($paths, $pids);
        }
        foreach ($entries as $entry) {
            $delim = strpos($entry, '|');
            if ($delim === false) {
                continue;   // not a "path|pid" line
            }
            $paths[] = substr($entry, 0, $delim);
            $pids[] = substr($entry, $delim + 1);
        }
        return array($paths, $pids);
    }


    /**
     * Serializes the index to SUBJ_IDX_FILE.
     *
     * @param SI_Index $index
     * @return bool true if the file was written
     */
    public static function save_index(SI_Index $index) {
        return file_put_contents(SUBJ_IDX_FILE, serialize($index)) !== false;
    }

    /**
     * Gets target wiki page name based on a section number.
     *
     * @param int $section index section number
     * @return string   page name | empty string ('') if missing
     */
    public static function get_target_page($section = 0) {
        $pages = self::load_target_pages();
        return isset($pages[$section]) ? $pages[$section] : '';
    }


    /**
     * Adds/Updates default target wiki page for entry links in a given section.
     *
     * @param string $page    target wiki page name
     * @param int    $section index section number
     */
    public static function set_target_page($page, $section = 0) {
        $pages = self::load_target_pages();
        $pages[$section] = $page;
        file_put_contents(SUBJ_IDX_DEFAULT_TARGETS, serialize($pages));
    }


    /**
     * Loads the section => target page map.
     *
     * @return array empty if the file is missing, unreadable or corrupt
     */
    private static function load_target_pages() {
        if ( ! is_file(SUBJ_IDX_DEFAULT_TARGETS)) {
            return array();
        }
        $data = file_get_contents(SUBJ_IDX_DEFAULT_TARGETS);
        $pages = ($data === false) ? false : unserialize($data);
        return is_array($pages) ? $pages : array();
    }


    /**
     * Removes invalid chars from any string to make it suitable for use as a HTML id attribute
     * @param string $text  Any text string
     * @return string A string suitable for a HTML id attribute
     */
    public static function valid_id($text) {
        $text = strtolower($text);
        $text = str_replace('/', '-', $text);
        // keep letters, digits, underscore and hyphen only
        $text = preg_replace('/[^0-9a-zA-Z_-]/', '', $text);
        return $text;
    }


    /**
     * Does this page: exist, is it visible, and does user have rights to see it?
     *
     * @param string $id wiki page id
     * @return bool
     */
    public static function is_valid_page($id) {
        $id = trim($id);
        return (page_exists($id) && isVisiblePage($id) && ! (auth_quickaclcheck($id) < AUTH_READ));
    }
}