<?php
/**
 * Functions to create the fulltext search index
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author  Andreas Gohr <andi@splitbrain.org>
 * @author  Tom N Harris <tnharris@whoopdedo.org>
 */

if(!defined('DOKU_INC')) die('meh.');

// Version tag used to force rebuild on upgrade
define('INDEXER_VERSION', 3);

// set the minimum token length to use in the index (note, this doesn't apply to numeric tokens)
if (!defined('IDX_MINWORDLENGTH')) define('IDX_MINWORDLENGTH',2);

// Asian characters are handled as words. The following regexp defines the
// Unicode-Ranges for Asian characters
// Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
// I'm no language expert. If you think some ranges are wrongly chosen or
// a range is missing, please contact me
define('IDX_ASIAN1','[\x{0E00}-\x{0E7F}]'); // Thai
define('IDX_ASIAN2','['.
                   '\x{2E80}-\x{3040}'.  // CJK -> Hangul
                   '\x{309D}-\x{30A0}'.
                   '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}'.
                   '\x{F900}-\x{FAFF}'.  // CJK Compatibility Ideographs
                   '\x{FE30}-\x{FE4F}'.  // CJK Compatibility Forms
                   ']');
define('IDX_ASIAN3','['.                // Hiragana/Katakana (can be two characters)
                   '\x{3042}\x{3044}\x{3046}\x{3048}'.
                   '\x{304A}-\x{3062}\x{3064}-\x{3082}'.
                   '\x{3084}\x{3086}\x{3088}-\x{308D}'.
                   '\x{308F}-\x{3094}'.
                   '\x{30A2}\x{30A4}\x{30A6}\x{30A8}'.
                   '\x{30AA}-\x{30C2}\x{30C4}-\x{30E2}'.
                   '\x{30E4}\x{30E6}\x{30E8}-\x{30ED}'.
                   '\x{30EF}-\x{30F4}\x{30F7}-\x{30FA}'.
                   ']['.
                   '\x{3041}\x{3043}\x{3045}\x{3047}\x{3049}'.
                   '\x{3063}\x{3083}\x{3085}\x{3087}\x{308E}\x{3095}-\x{309C}'.
                   '\x{30A1}\x{30A3}\x{30A5}\x{30A7}\x{30A9}'.
                   '\x{30C3}\x{30E3}\x{30E5}\x{30E7}\x{30EE}\x{30F5}\x{30F6}\x{30FB}\x{30FC}'.
                   '\x{31F0}-\x{31FF}'.
                   ']?');
// Any single "Asian word": one alternative per range group defined above
define('IDX_ASIAN', '(?:'.IDX_ASIAN1.'|'.IDX_ASIAN2.'|'.IDX_ASIAN3.')');

/**
 * Version of the indexer taking into consideration the external tokenizer.
 * The indexer is only compatible with data written by the same version.
 *
 * @return int|string INDEXER_VERSION alone, or INDEXER_VERSION followed by
 *                    '+' and the trimmed tokenizer command when the
 *                    external tokenizer is enabled
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_get_version(){
    global $conf;

    // internal tokenizer: the plain version tag is enough
    if (!$conf['external_tokenizer']) {
        return INDEXER_VERSION;
    }

    // external tokenizer: a different command must yield a different version
    return INDEXER_VERSION . '+' . trim($conf['tokenizer_cmd']);
}

/**
 * Measure the length of a string.
 * Differs from strlen in handling of asian characters.
*
 * The byte length is taken first; every byte in the range 0xE2-0xEF found
 * in the string then adds (byte value - 0xE1) on top of it.
 *
 * @param string $w the word to measure
 * @return int      the (faked) length used to pick the index file
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function wordlen($w){
    $length = strlen($w);

    // If left alone, all chinese "words" will get put into w3.idx
    // So the "length" of a "word" is faked
    if (preg_match_all('/[\xE2-\xEF]/', $w, $found)) {
        foreach ($found[0] as $lead) {
            $length += ord($lead) - 0xE1;
        }
    }

    return $length;
}

/**
 * Class that encapsulates operations on the indexer database.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
class Doku_Indexer {

    /**
     * Adds the contents of a page to the fulltext index
     *
     * The added text replaces previous words for the same page.
     * An empty value erases the page.
*
     * Returns false when the indexer lock cannot be obtained or when any
     * of the index files cannot be written.
     *
     * @param string    $page   a page name
     * @param string    $text   the body of the page
     * @return boolean          the function completed successfully
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function addPageWords($page, $text) {
        // never touch the index files without holding the lock; on failure
        // the lock belongs to someone else, so it must not be released here
        if (!$this->_lock())
            return false;

        // load known documents
        $pid = $this->_addIndexKey('page', '', $page);
        if ($pid === false) {
            $this->_unlock();
            return false;
        }

        $pagewords = array();
        // get word usage in page
        $words = $this->_getPageWords($text);
        if ($words === false) {
            $this->_unlock();
            return false;
        }

        if (!empty($words)) {
            foreach (array_keys($words) as $wlen) {
                $index = $this->_getIndex('i', $wlen);
                foreach ($words[$wlen] as $wid => $freq) {
                    // a word id beyond the current index has no tuples yet
                    $idx = ($wid<count($index)) ? $index[$wid] : '';
                    $index[$wid] = $this->_updateTuple($idx, $pid, $freq);
                    $pagewords[] = "$wlen*$wid";
                }
                if (!$this->_saveIndex('i', $wlen, $index)) {
                    $this->_unlock();
                    return false;
                }
            }
        }

        // Remove obsolete index entries
        $pageword_idx = $this->_getIndexKey('pageword', '', $pid);
        if ($pageword_idx !== '') {
            $oldwords = explode(':',$pageword_idx);
            $delwords = array_diff($oldwords, $pagewords);
            $upwords = array();
            foreach ($delwords as $word) {
                if ($word != '') {
                    list($wlen,$wid) = explode('*', $word);
                    $wid = (int)$wid;
                    $upwords[$wlen][] = $wid;
                }
            }
            foreach ($upwords as $wlen => $widx) {
                $index = $this->_getIndex('i', $wlen);
                foreach ($widx as $wid) {
                    // nothing to clear if the index has no row for this word
                    if (!isset($index[$wid])) continue;
                    // NOTE(review): a frequency of 0 presumably removes the
                    // page from the tuple list; _updateTuple is not visible here
                    $index[$wid] = $this->_updateTuple($index[$wid], $pid, 0);
                }
                if (!$this->_saveIndex('i', $wlen, $index)) {
                    $this->_unlock();
                    return false;
                }
            }
        }
        // Save the reverse index
        $pageword_idx = join(':', $pagewords);
        if (!$this->_saveIndexKey('pageword', '', $pid, $pageword_idx)) {
            $this->_unlock();
            return false;
        }

        $this->_unlock();
        return true;
    }

    /**
     * Split the words in a page and add them to the index.
     *
     * Words that are not yet known are appended to the word index of their
     * length ('w' index) and that index is written back.
     *
     * @param string $text  the body of the page
     * @return array|false  array(wordlen => array(word id => frequency)),
     *                      or false if a word index could not be saved
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Christopher Smith <chris@jalakai.co.uk>
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _getPageWords($text) {
        $tokens = $this->tokenizer($text);
        $tokens = array_count_values($tokens);  // count the frequency of each token

        // group the tokens by their (faked) length
        $words = array();
        foreach ($tokens as $w=>$c) {
            // numeric tokens come back from array_count_values as int keys
            $l = wordlen((string)$w);
            $words[$l][$w] = $c;
        }

        // arrive here with $words = array(wordlen => array(word => frequency))
        $index = array(); //resulting index
        foreach (array_keys($words) as $wlen) {
            // track modifications per length so untouched indexes are not rewritten
            $word_idx_modified = false;
            $word_idx = $this->_getIndex('w', $wlen);
            foreach ($words[$wlen] as $word => $freq) {
                // compare as strings: a loose search would let the int key 123
                // match an index entry such as "123abc"
                $word = (string)$word;
                $wid = array_search($word, $word_idx, true);
                if ($wid === false) {
                    $wid = count($word_idx);
                    $word_idx[] = $word;
                    $word_idx_modified = true;
                }
                if (!isset($index[$wlen]))
                    $index[$wlen] = array();
                $index[$wlen][$wid] = $freq;
            }
            // save back the word index
            if ($word_idx_modified && !$this->_saveIndex('w', $wlen, $word_idx))
                return false;
        }

        return $index;
    }

    /**
     * Add keys to the metadata index.
     *
     * Adding new keys does not remove other keys for the page.
     * An empty value will erase the key.
     * The $key parameter can be an array to add multiple keys. $value will
     * not be used if $key is an array.
*
     * Returns false when the indexer lock cannot be obtained or when one of
     * the metadata index files cannot be written.
     *
     * @param string    $page   a page name
     * @param mixed     $key    a key string or array of key=>value pairs
     * @param mixed     $value  the value or list of values
     * @return boolean          the function completed successfully
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function addMetaKeys($page, $key, $value=null) {
        if (!is_array($key)) {
            $key = array($key => $value);
        } elseif (!is_null($value)) {
            // $key is array, but $value is not null
            trigger_error("array passed to addMetaKeys but value is not null", E_USER_WARNING);
        }

        // on failure the lock belongs to someone else: do not release it
        if (!$this->_lock())
            return false;

        // load known documents
        $pid = $this->_addIndexKey('page', '', $page);
        if ($pid === false) {
            $this->_unlock();
            return false;
        }

        $ok = true;
        foreach ($key as $name => $values) {
            $metaname = idx_cleanName($name);
            $metaidx = $this->_getIndex($metaname, '_i');
            $metawords = $this->_getIndex($metaname, '_w');
            $addwords = false;
            $update = array();
            if (!is_array($values)) $values = array($values);
            foreach ($values as $val) {
                $val = (string)$val;
                // values are compared as exact strings
                $id = array_search($val, $metawords, true);
                if ($val !== "") {
                    if ($id === false) {
                        $id = count($metawords);
                        $metawords[$id] = $val;
                        $addwords = true;
                    }
                    // a freshly added value has no tuples yet
                    $tuple = isset($metaidx[$id]) ? $metaidx[$id] : '';
                    $metaidx[$id] = $this->_updateTuple($tuple, $pid, 1);
                    $update[$id] = 1;
                } elseif ($id !== false) {
                    // NOTE(review): an empty value only clears an index entry
                    // whose stored value is itself the empty string; it does
                    // not walk the page's existing values for this key.
                    // Confirm this is the intended "erase" behaviour.
                    $tuple = isset($metaidx[$id]) ? $metaidx[$id] : '';
                    $metaidx[$id] = $this->_updateTuple($tuple, $pid, 0);
                    $update[$id] = 0;
                }
            }
            if (!empty($update)) {
                if ($addwords && !$this->_saveIndex($metaname.'_w', '', $metawords)) {
                    $ok = false;
                    break;
                }
                if (!$this->_saveIndex($metaname.'_i', '', $metaidx)) {
                    $ok = false;
                    break;
                }
                // reverse index: the value ids used by this page, ':' separated
                $val_idx = $this->_getIndexKey($metaname, '_p', $pid);
                $val_idx = ($val_idx === '') ? array() : array_flip(explode(':', $val_idx));
                foreach ($update as $id => $add) {
                    if ($add) $val_idx[$id] = 1;
                    else unset($val_idx[$id]);
                }
                // _saveIndexKey stores a single text line, not an array
                $val_idx = join(':', array_keys($val_idx));
                if (!$this->_saveIndexKey($metaname.'_p', '', $pid, $val_idx)) {
                    $ok = false;
                    break;
                }
            }
            unset($metaidx);
            unset($metawords);
        }

        // always release the lock taken above
        $this->_unlock();
        return $ok;
    }

    /**
     * Remove a page from the index
     *
     * Erases entries in all known indexes.
*
     * @param string    $page   a page name
     * @return boolean          the function completed successfully
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function deletePage($page) {
        // TODO: not implemented yet; the body is empty and returns nothing
    }

    /**
     * Split the text into words for fulltext search
     *
     * The text is either passed through the configured external tokenizer
     * command or, for the internal tokenizer, gets every match of IDX_ASIAN
     * surrounded by spaces. Line breaks and tabs become spaces, the text is
     * split on spaces and lowercased; words shorter than IDX_MINWORDLENGTH
     * (unless numeric) and stopwords are dropped.
     *
     * TODO: does this also need &$stopwords ?
     *
     * @param string    $text   plain text
     * @param boolean   $wc     are wildcards allowed?
     * @return array            list of words in the text
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function tokenizer($text, $wc=false) {
        global $conf;
        $words = array();
        // when wildcards are not allowed, '*' is left out of the list passed
        // to utf8_stripspecials below
        $wc = ($wc) ? '' : '\*';
        $stopwords =& idx_get_stopwords();

        $external = !empty($conf['external_tokenizer'])
                    && isset($conf['tokenizer_cmd'])
                    && $conf['tokenizer_cmd'] != '';
        if ($external) {
            if (0 == io_exec($conf['tokenizer_cmd'], $text, $output))
                $text = $output;
        } else {
            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
                // handle asian chars as single words (may fail on older PHP version)
                $asia = @preg_replace('/('.IDX_ASIAN.')/u', ' \1 ', $text);
                if (!is_null($asia)) $text = $asia; // recover from regexp failure
            }
        }

        // turn every line break and tab into a space; strtr() with a
        // one-character replacement string would only translate "\r"
        $text = str_replace(array("\r", "\n", "\t"), ' ', $text);
        if (preg_match('/[^0-9A-Za-z ]/u', $text))
            $text = utf8_stripspecials($text, ' ', '\._\-:'.$wc);

        $wordlist = explode(' ', $text);
        foreach ($wordlist as $word) {
            // plain ASCII words can use the cheaper strtolower()
            $word = (preg_match('/[^0-9A-Za-z]/u', $word)) ?
                utf8_strtolower($word) : strtolower($word);
            if (!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH) continue;
            if (array_search($word, $stopwords) !== false) continue;
            $words[] = $word;
        }
        return $words;
    }

    /**
     * Find pages in the fulltext index containing the words,
     *
     * The search words must be pre-tokenized, meaning only letters and
     * numbers with an optional wildcard
     *
     * The returned array will have the original tokens as key.
* The values
     * in the returned list is an array with the page names as keys and the
     * number of times that token appears on the page as value.
     *
     * Pages for which page_exists() reports false are left out. An empty
     * array is returned when none of the tokens is found in the word index.
     *
     * @param array     $tokens list of words to search for
     * @return array            list of page names with usage counts
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function lookup($tokens) {
        $result = array();
        $wids = $this->_getIndexWords($tokens, $result);
        if (empty($wids)) return array();
        // load known words and documents
        $page_idx = $this->_getIndex('page', '');
        $docs = array();
        foreach (array_keys($wids) as $wlen) {
            $wids[$wlen] = array_unique($wids[$wlen]);
            $index = $this->_getIndex('i', $wlen);
            foreach($wids[$wlen] as $ixid) {
                if ($ixid < count($index))
                    $docs["$wlen*$ixid"] = $this->_parseTuples($page_idx, $index[$ixid]);
            }
        }
        // merge found pages into final result array
        $final = array();
        foreach ($result as $word => $res) {
            $final[$word] = array();
            foreach ($res as $wid) {
                // word ids without a row in the 'i' index have no documents;
                // skip them instead of iterating over a missing entry
                if (!isset($docs[$wid])) continue;
                foreach ($docs[$wid] as $hitkey => $hitcnt) {
                    // make sure the document still exists
                    if (!page_exists($hitkey, '', false)) continue;
                    if (!isset($final[$word][$hitkey]))
                        $final[$word][$hitkey] = $hitcnt;
                    else
                        $final[$word][$hitkey] += $hitcnt;
                }
            }
        }
        return $final;
    }

    /**
     * Find pages containing a metadata key.
     *
     * The metadata values are compared as case-sensitive strings. Pass a
     * callback function that returns true or false to use a different
     * comparison function
     *
     * @param string    $key    name of the metadata key to look for
     * @param string    $value  search term to look for
     * @param callback  $func   comparison function
     * @return array            list with page names
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function lookupKey($key, $value, $func=null) {
        // TODO: not implemented yet; the body is empty and returns nothing
    }

    /**
     * Find the index ID of each search term.
     *
     * The query terms should only contain valid characters, with a '*' at
     * either the beginning or end of the word (or both).
     * The $result parameter can be used to merge the index locations with
     * the appropriate query term.
     *
     * @param array     $words  The query terms.
     * @param arrayref  $result Set to word => array("length*id" ...)
     * @return array            Set to length => array(id ...)
* @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _getIndexWords($words, &$result) {
        $tokens = array();
        $tokenlength = array();
        $tokenwild = array();
        foreach ($words as $word) {
            $result[$word] = array();
            $wildStart = false;     // term began with '*'
            $wildEnd = false;       // term ended with '*'
            $xword = $word;
            $wlen = wordlen($word);

            // check for wildcards
            if (substr($xword, 0, 1) == '*') {
                $xword = substr($xword, 1);
                $wildStart = true;
                $wlen -= 1;
            }
            if (substr($xword, -1, 1) == '*') {
                $xword = substr($xword, 0, -1);
                $wildEnd = true;
                $wlen -= 1;
            }
            if ($wlen < IDX_MINWORDLENGTH && !$wildStart && !$wildEnd && !is_numeric($xword))
                continue;
            if (!isset($tokens[$xword]))
                $tokenlength[$wlen][] = $xword;
            if ($wildStart || $wildEnd) {
                // anchor only the side WITHOUT a wildcard: "foo*" must match
                // words starting with foo (/^foo/), "*foo" words ending in
                // foo (/foo$/) and "*foo*" any word containing foo (/foo/)
                $re = preg_quote($xword, '/');
                if (!$wildStart) $re = '^'.$re;
                if (!$wildEnd) $re = $re.'$';
                $tokens[$xword][] = array($word, '/'.$re.'/');
                if (!isset($tokenwild[$xword]))
                    $tokenwild[$xword] = $wlen;
            } else {
                $tokens[$xword][] = array($word, null);
            }
        }
        // shortest base words first, so the wildcard loop below can stop early
        asort($tokenwild);
        // $tokens = array( base word => array( [ query term , regexp ] ... ) ... )
        // $tokenlength = array( base word length => base word ... )
        // $tokenwild = array( base word => base word length ... )
        $length_filter = empty($tokenwild) ? $tokenlength : min(array_keys($tokenlength));
        $indexes_known = $this->_indexLengths($length_filter);
        if (!empty($tokenwild)) sort($indexes_known);
        // get word IDs
        $wids = array();
        foreach ($indexes_known as $ixlen) {
            $word_idx = $this->_getIndex('w', $ixlen);
            // handle exact search
            if (isset($tokenlength[$ixlen])) {
                foreach ($tokenlength[$ixlen] as $xword) {
                    $wid = array_search($xword, $word_idx);
                    if ($wid !== false) {
                        $wids[$ixlen][] = $wid;
                        foreach ($tokens[$xword] as $w)
                            $result[$w[0]][] = "$ixlen*$wid";
                    }
                }
            }
            // handle wildcard search
            foreach ($tokenwild as $xword => $wlen) {
                // only indexes of words longer than the base word are searched
                if ($wlen >= $ixlen) break;
                foreach ($tokens[$xword] as $w) {
                    if (is_null($w[1])) continue;
                    foreach(array_keys(preg_grep($w[1], $word_idx)) as $wid) {
                        $wids[$ixlen][] = $wid;
                        $result[$w[0]][] = "$ixlen*$wid";
                    }
                }
            }
        }
        return $wids;
    }

    /**
     * Return a list of all
* pages
     *
     * @param string    $key    list only pages containing the metadata key (optional)
     * @return array            list of page names
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function getPages($key=null) {
        $page_idx = $this->_getIndex('page', '');
        if ($key === null) {
            return $page_idx;
        }
        // TODO: filtering by metadata key is not implemented yet; with a
        // key given the function falls through and returns nothing
    }

    /**
     * Return a list of words sorted by number of times used
     *
     * @param int       $min    bottom frequency threshold
     * @param int       $max    upper frequency limit. No limit if $max<$min
     * @param string    $key    metadata key to list. Uses the fulltext index if not given
     * @return array            list of words as the keys and frequency as values
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function histogram($min=1, $max=0, $key=null) {
        // TODO: not implemented yet; the body is empty and returns nothing
    }

    /**
     * Lock the indexer.
*
     * The lock is a directory created in $conf['lockdir']. If it already
     * exists the call retries a limited number of times; a lock directory
     * older than five minutes is considered stale and removed.
     *
     * @return bool|string  true on success, "stale lock removed" if a stale
     *                      lock had to be cleared first, false if the lock
     *                      could not be obtained
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _lock() {
        global $conf;
        $status = true;
        $run = 0;
        $lock = $conf['lockdir'].'/_indexer.lock';
        while (!@mkdir($lock, $conf['dmode'])) {
            usleep(50);
            if (is_dir($lock) && time() - @filemtime($lock) > 60*5) {
                // looks like a stale lock, remove it
                if (!@rmdir($lock)) {
                    // cannot be cleared: bail out instead of spinning forever
                    return false;
                }
                $status = "stale lock removed";
            } elseif ($run++ == 1000) {
                // still locked (or not creatable) after many retries: give up
                return false;
            }
        }
        if (!empty($conf['dperm']))
            chmod($lock, $conf['dperm']);
        return $status;
    }

    /**
     * Release the indexer lock.
     *
     * @return bool always true; a failing rmdir is silently ignored
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _unlock() {
        global $conf;
        @rmdir($conf['lockdir'].'/_indexer.lock');
        return true;
    }

    /**
     * Retrieve the entire index.
*
     * Reads $conf['indexdir']/<idx><suffix>.idx into an array with one
     * element per line, line endings removed.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @return array            list of lines; empty if the file is missing
     *                          or unreadable
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _getIndex($idx, $suffix) {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
        if (!@file_exists($fn)) return array();
        // FILE_IGNORE_NEW_LINES is a flag of file(), not of file_exists()
        $lines = @file($fn, FILE_IGNORE_NEW_LINES);
        return ($lines === false) ? array() : $lines;
    }

    /**
     * Replace the contents of the index with an array.
     *
     * The lines are written to a temporary file which is then renamed over
     * the index file.
     *
     * @param string    $idx    name of the index
     * @param string    $suffix subpart identifier
     * @param arrayref  $lines  list of lines without line endings
     * @return boolean          false if the temporary file cannot be opened
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _saveIndex($idx, $suffix, &$lines) {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix;
        $fh = @fopen($fn.'.tmp', 'w');
        if (!$fh) return false;
        fwrite($fh, join("\n", $lines));
        // terminate the last line too, so that records appended later
        // line by line cannot get glued onto it
        if (!empty($lines))
            fwrite($fh, "\n");
        fclose($fh);
        if (isset($conf['fperm']))
            chmod($fn.'.tmp', $conf['fperm']);
        io_rename($fn.'.tmp', $fn.'.idx');
        if ($suffix !== '')
            $this->_cacheIndexDir($idx, $suffix, empty($lines));
        return true;
    }

    /**
     * Retrieve a line from the index.
*
     * @param string $idx    name of the index
     * @param string $suffix suffix appended to the index name in the file name
     * @param int    $id     zero-based number of the line to read
     * @return string the line with trailing whitespace removed, or '' when the
     *                file cannot be read or has no such line
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _getIndexKey($idx, $suffix, $id) {
        global $conf;
        $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
        if (!@file_exists($fn)) return '';
        $fh = @fopen($fn, 'r');
        if (!$fh) return '';
        $ln = -1;
        // after the loop $line is the wanted line, or false if EOF came first
        while (($line = fgets($fh)) !== false) {
            if (++$ln == $id) break;
        }
        fclose($fh);
        return rtrim((string)$line);
    }

    /**
     * Write a line into the index.
     *
     * The index is copied line by line into a temporary file, with line $id
     * replaced, and the temporary file is then renamed over the index.
     *
     * @param string $idx    name of the index
     * @param string $suffix suffix appended to the index name in the file name
     * @param int    $id     zero-based number of the line to write
     * @param string $line   the new content; a newline is appended if missing
     * @return bool false if the temporary file could not be opened, true otherwise
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _saveIndexKey($idx, $suffix, $id, $line) {
        global $conf;
        if (substr($line, -1) != "\n")
            $line .= "\n";
        $fn = $conf['indexdir'].'/'.$idx.$suffix;
        $fh = @fopen($fn.'.tmp', 'w');
        if (!$fh) return false;
        $ln = -1;
        $ih = @fopen($fn.'.idx', 'r');
        if ($ih) {
            while (($curline = fgets($ih)) !== false) {
                // an unterminated final line must not swallow what is written after it
                if (substr($curline, -1) != "\n")
                    $curline .= "\n";
                fwrite($fh, (++$ln == $id) ? $line : $curline);
            }
            fclose($ih);
            if ($id > $ln) {
                // pad with empty records so the line really ends up at position $id
                while ($id > ++$ln)
                    fwrite($fh, "\n");
                fwrite($fh, $line);
            }
        } else {
            // no index yet: same padding rule, starting from an empty file
            while ($id > ++$ln)
                fwrite($fh, "\n");
            fwrite($fh, $line);
        }
        fclose($fh);
        if (isset($conf['fperm']))
            chmod($fn.'.tmp', $conf['fperm']);
        io_rename($fn.'.tmp', $fn.'.idx');
        if ($suffix !== '')
            $this->_cacheIndexDir($idx, $suffix);
        return true;
    }

    /**
     * Retrieve or insert a value in the index.
     *
     * @param string $idx    name of the index
     * @param string $suffix suffix appended to the index name in the file name
     * @param string $value  the line content to look up or append
     * @return int|bool the line number of the value, false if the index could
     *                  not be written
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _addIndexKey($idx, $suffix, $value) {
        $index = $this->_getIndex($idx, $suffix);
        // strict comparison: loosely, numeric strings such as "10" and "1e1" are equal
        $id = array_search((string)$value, $index, true);
        if ($id === false) {
            $id = count($index);
            $index[$id] = $value;
            if (!$this->_saveIndex($idx, $suffix, $index)) {
                trigger_error("Failed to write $idx index", E_USER_ERROR);
                return false;
            }
        }
        return $id;
    }

    /**
     * Keep the cached list of existing suffixes of an index up to date.
     *
     * The list is stored in 'lengths.idx' for the 'i' index and in
     * '<idx>lengths.idx' for any other index.
     *
     * @param string $idx    name of the index
     * @param string $suffix the suffix to add to or remove from the list
     * @param bool   $delete true to remove the suffix, false to add it
     */
    private function _cacheIndexDir($idx, $suffix, $delete=false) {
        global $conf;
        if ($idx == 'i')
            $cachename = $conf['indexdir'].'/lengths';
        else
            $cachename = $conf['indexdir'].'/'.$idx.'lengths';
        $lengths = @file($cachename.'.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
        if ($lengths === false) $lengths = array();
        $old = array_search((string)$suffix, $lengths, true);
        if ($delete) {
            // nothing to do if the suffix was never listed
            if ($old === false) return;
            unset($lengths[$old]);
        } else {
            // nothing to do if the suffix is already listed
            if ($old !== false) return;
            $lengths[] = $suffix;
            sort($lengths);
        }
        $fh = @fopen($cachename.'.tmp', 'w');
        if (!$fh) {
            trigger_error("Failed to write index cache", E_USER_ERROR);
            return;
        }
        @fwrite($fh, implode("\n", $lengths));
        @fclose($fh);
        if (isset($conf['fperm']))
            chmod($cachename.'.tmp', $conf['fperm']);
        io_rename($cachename.'.tmp', $cachename.'.idx');
    }

    /**
     * Get the list of lengths indexed in the wiki.
     *
     * Read the index directory or a cache file and returns
     * a sorted array of lengths of the words used in the wiki.
*
     * @return array list of word lengths as integers; empty if the index
     *               directory cannot be opened
     * @author YoBoY <yoboy.leguesh@gmail.com>
     */
    private function _listIndexLengths() {
        global $conf;
        $cachename = $conf['indexdir'].'/lengths';
        clearstatcache();
        if (@file_exists($cachename.'.idx')) {
            $lengths = @file($cachename.'.idx', FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false)
                return array_map('intval', $lengths);
        }

        // no usable cache: collect the numeric suffixes of the i<N>.idx files
        $dir = @opendir($conf['indexdir']);
        if ($dir === false)
            return array();
        $lengths = array();
        while (($f = readdir($dir)) !== false) {
            if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
                $i = substr($f, 1, -4);
                if (is_numeric($i))
                    $lengths[] = (int)$i;
            }
        }
        closedir($dir);
        sort($lengths);
        // save this in a file
        $fh = @fopen($cachename.'.tmp', 'w');
        if (!$fh) {
            trigger_error("Failed to write index cache", E_USER_ERROR);
            // the scan itself succeeded, so still hand back a usable list
            return $lengths;
        }
        @fwrite($fh, implode("\n", $lengths));
        @fclose($fh);
        if (isset($conf['fperm']))
            chmod($cachename.'.tmp', $conf['fperm']);
        io_rename($cachename.'.tmp', $cachename.'.idx');

        return $lengths;
    }

    /**
     * Get the word lengths that have been indexed.
     *
     * Reads the index directory and returns an array of lengths
     * that there are indices for.
     *
     * @param array|int $filter an array whose keys are the lengths to test for,
     *                          or a minimum length
     * @return array the lengths that have an index
     * @author YoBoY <yoboy.leguesh@gmail.com>
     */
    private function _indexLengths($filter) {
        global $conf;
        $idx = array();
        if (is_array($filter)) {
            // testing if index files exist only
            $path = $conf['indexdir']."/i";
            foreach (array_keys($filter) as $key) {
                if (@file_exists($path.$key.'.idx'))
                    $idx[] = $key;
            }
        } else {
            $lengths = idx_listIndexLengths();
            foreach ($lengths as $length) {
                // keep all the values equal or superior
                if ((int)$length >= (int)$filter)
                    $idx[] = $length;
            }
        }
        return $idx;
    }

    /**
     * Insert or replace a tuple in a line.
*
     * A line is a ':' separated list of 'id*count' tuples. Any existing tuple
     * for $id is removed; if $count is non-zero a fresh tuple is put in front.
     *
     * @param string $line  the current line
     * @param string $id    the id the tuple belongs to
     * @param int    $count the new count; a false-like value only removes the tuple
     * @return string the updated line
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    private function _updateTuple($line, $id, $count) {
        $newLine = (string)$line;
        if ($newLine !== '')
            $newLine = (string)preg_replace('/(^|:)'.preg_quote((string)$id,'/').'\*\d*/', '', $newLine);
        // removing the first tuple leaves a leading separator behind
        $newLine = trim($newLine, ':');
        if ($count) {
            if (strlen($newLine) > 0)
                return "$id*$count:".$newLine;
            else
                return "$id*$count";
        }
        return $newLine;
    }

    /**
     * Split a line into an array of tuples.
     *
     * @param array  $keys maps the ids used in the tuples to the keys of the result
     * @param string $line ':' separated list of 'id*count' tuples
     * @return array the counts, indexed by the key each id maps to; tuples that
     *               are malformed, have a zero count or an unknown id are skipped
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    private function _parseTuples(&$keys, $line) {
        $result = array();
        if ($line == '') return $result;
        $parts = explode(':', $line);
        foreach ($parts as $tuple) {
            if ($tuple == '') continue;
            $pair = explode('*', $tuple);
            // a tuple without '*' carries no count at all
            if (count($pair) < 2) continue;
            list($key, $cnt) = $pair;
            if (!$cnt) continue;
            // ids without an entry in $keys cannot be resolved
            if (!isset($keys[$key])) continue;
            $name = $keys[$key];
            // compare explicitly so that a key of "0" is not mistaken for a blank entry
            if ($name === false || $name === '') continue;
            $result[$name] = $cnt;
        }
        return $result;
    }
}

/**
 * Create an instance of the indexer.
*
 * The instance is created on the first call and reused afterwards.
 *
 * @return object a Doku_Indexer
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function & idx_get_indexer() {
    static $instance = null;
    if ($instance === null) {
        $instance = new Doku_Indexer();
    }
    return $instance;
}

/**
 * Returns words that will be ignored.
 *
 * The list is read once from the stopwords.txt of the configured language
 * and kept for later calls; without such a file the list is empty.
 *
 * @return array list of stop words
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function & idx_get_stopwords() {
    static $words = null;
    if ($words !== null) {
        return $words;
    }
    global $conf;
    $path = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    $words = @file_exists($path)
        ? file($path, FILE_IGNORE_NEW_LINES)
        : array();
    return $words;
}

/**
 * Adds/updates the search index for the given page
 *
 * Locking is handled internally.
*
 * The INDEXER_PAGE_ADD event is triggered with the page name and the text
 * to index; unless the default action is prevented, the raw wiki text of
 * the page is appended to that text.
 *
 * @param string $page name of the page to index
 * @return boolean the function completed successfully
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_addPage($page) {
    $data = array($page, '');
    $event = new Doku_Event('INDEXER_PAGE_ADD', $data);
    if ($event->advise_before()) {
        // keep whatever the handlers put into the body and add the page text
        $collected = $data[1];
        $data[1] = $collected . ' ' . rawWiki($page);
    }
    $event->advise_after();
    unset($event);

    // handlers may have changed both the page name and the text
    $page = $data[0];
    $body = $data[1];

    return idx_get_indexer()->addPageWords($page, $body);
}

/**
 * Find tokens in the fulltext index
 *
 * Takes an array of words and will return a list of matching
 * pages for each one.
 *
 * Important: No ACL checking is done here!
* All results are
 * returned, regardless of permissions
 *
 * @param array $words list of words to search for
 * @return array list of pages found, associated with the search terms
 */
function idx_lookup($words) {
    return idx_get_indexer()->lookup($words);
}

/**
 * Split a string into tokens
 *
 * Hands both arguments on to the tokenizer of the shared indexer.
 *
 * @param string $string the text to split
 * @param bool   $wc     passed through unchanged to the indexer
 * @return mixed whatever the indexer's tokenizer returns
 */
function idx_tokenizer($string, $wc=false) {
    return idx_get_indexer()->tokenizer($string, $wc);
}

/* For compatibility */

/**
 * Read the list of words in an index (if it exists).
 *
 * @param string $idx    name of the index
 * @param string $suffix suffix appended to the index name in the file name
 * @return array the lines of the index file, each still ending with its
 *               newline; an empty array if the file does not exist
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_getIndex($idx, $suffix) {
    global $conf;
    $path = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
    return @file_exists($path) ? file($path) : array();
}

/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
*
 * The cache file lengths.idx is only used when $conf['readdircache'] is
 * non-zero, and only while it is younger than that many seconds.
 *
 * @return array list of word lengths as integers; empty if the index
 *               directory cannot be opened
 * @author YoBoY <yoboy.leguesh@gmail.com>
 */
function idx_listIndexLengths() {
    global $conf;
    $cachefile = $conf['indexdir'].'/lengths.idx';
    // testing what we have to do, create a cache file or not.
    $docache = ($conf['readdircache'] != 0);

    if ($docache) {
        clearstatcache();
        if (@file_exists($cachefile)
            && (time() < @filemtime($cachefile) + $conf['readdircache'])) {
            $lengths = @file($cachefile, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
            if ($lengths !== false)
                return array_map('intval', $lengths);
        }
    }

    // no fresh cache: collect the numeric suffixes of the i<N>.idx files
    $dir = @opendir($conf['indexdir']);
    if ($dir === false)
        return array();
    $idx = array();
    while (($f = readdir($dir)) !== false) {
        if (substr($f, 0, 1) == 'i' && substr($f, -4) == '.idx') {
            $i = substr($f, 1, -4);
            if (is_numeric($i))
                $idx[] = (int)$i;
        }
    }
    closedir($dir);
    sort($idx);
    // save this in a file
    if ($docache) {
        $handle = @fopen($cachefile, 'w');
        // caching is best effort; never hand a failed handle to fwrite()
        if ($handle) {
            @fwrite($handle, implode("\n", $idx));
            @fclose($handle);
        }
    }
    return $idx;
}

/**
 * Get the word lengths that have been indexed.
 *
 * Reads the index directory and returns an array of lengths
 * that there are indices for.
 *
 * @param array|int $filter an array whose keys are the lengths to test for,
 *                          or a minimum length
 * @return array the lengths that have an index
 * @author YoBoY <yoboy.leguesh@gmail.com>
 */
function idx_indexLengths($filter) {
    global $conf;
    $idx = array();
    if (is_array($filter)) {
        // testing if index files exist only
        $path = $conf['indexdir']."/i";
        foreach (array_keys($filter) as $key) {
            if (@file_exists($path.$key.'.idx'))
                $idx[] = $key;
        }
    } else {
        $lengths = idx_listIndexLengths();
        foreach ($lengths as $length) {
            // keep all the values equal or superior
            if ((int)$length >= (int)$filter)
                $idx[] = $length;
        }
    }
    return $idx;
}

/**
 * Clean a name of a key for use as a file name.
*
 * Romanizes non-latin characters, then strips away anything that's
 * not a letter, number, or underscore.
 *
 * @param string $name the key name to clean
 * @return string the cleaned name in lower case
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
function idx_cleanName($name) {
    $clean = utf8_romanize(trim((string)$name));
    // applied in order: separators collapse to '_', then everything else is dropped
    $rules = array(
        '#[ \./\\:-]+#'   => '_',
        '/[^A-Za-z0-9_]/' => '',
    );
    foreach ($rules as $pattern => $replacement) {
        $clean = preg_replace($pattern, $replacement, $clean);
    }
    return strtolower($clean);
}

//Setup VIM: ex: et ts=4 :