<?php

namespace dokuwiki\Search;

use dokuwiki\Extension\Event;

/**
 * Class that encapsulates operations on the indexer database.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
class Indexer {
    /**
     * @var array $pidCache Cache for getPID()
     */
    protected $pidCache = array();

    /**
     * Adds the contents of a page to the fulltext index
     *
     * The added text replaces previous words for the same page.
     * An empty value erases the page.
     *
     * The index lock is taken for the whole operation and released on every
     * return path.
     *
     * @param string    $page   a page name
     * @param string    $text   the body of the page
     * @return string|boolean   true on success, false on failure,
     *                          the string "locked" when the lock could not be obtained
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function addPageWords($page, $text) {
        if (!$this->lock())
            return "locked";

        // load known documents
        $pid = $this->getPIDNoLock($page);
        if ($pid === false) {
            $this->unlock();
            return false;
        }

        // collects "wordlength*wordid" entries for the reverse (page => words) index
        $pagewords = array();
        // get word usage in page
        $words = $this->getPageWords($text);
        if ($words === false) {
            $this->unlock();
            return false;
        }

        if (!empty($words)) {
            foreach (array_keys($words) as $wlen) {
                $index = $this->getIndex('i', $wlen);
                foreach ($words[$wlen] as $wid => $freq) {
                    // words that were just added have no line in the index yet
                    $idx = ($wid < count($index)) ? $index[$wid] : '';
                    $index[$wid] = $this->updateTuple($idx, $pid, $freq);
                    $pagewords[] = "$wlen*$wid";
                }
                if (!$this->saveIndex('i', $wlen, $index)) {
                    $this->unlock();
                    return false;
                }
            }
        }

        // Remove obsolete index entries
        $pageword_idx = $this->getIndexKey('pageword', '', $pid);
        if ($pageword_idx !== '') {
            $oldwords = explode(':', $pageword_idx);
            // words that were on the page before but are not anymore
            $delwords = array_diff($oldwords, $pagewords);
            // group the obsolete word ids by word length, one index file is used per length
            $upwords = array();
            foreach ($delwords as $word) {
                if ($word != '') {
                    list($wlen, $wid) = explode('*', $word);
                    $wid = (int)$wid;
                    $upwords[$wlen][] = $wid;
                }
            }
            foreach ($upwords as $wlen => $widx) {
                $index = $this->getIndex('i', $wlen);
                foreach ($widx as $wid) {
                    // a count of 0 is passed for words no longer used on this page
                    $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
                }
                $this->saveIndex('i', $wlen, $index);
            }
        }
        // Save the reverse index
        $pageword_idx = join(':', $pagewords);
        if (!$this->saveIndexKey('pageword', '', $pid, $pageword_idx)) {
            $this->unlock();
            return false;
        }

        $this->unlock();
        return true;
    }

    /**
     * Split the words in a page and add them to the index.
     *
     * Words that are not yet part of the word index of their length are
     * appended to it and the modified word index is saved back.
     *
     * @param string    $text   content of the page
     * @return array|false      list of word IDs and number of times used as
     *                          array(wordlen => array(wordid => frequency)),
     *                          false when a word index could not be saved
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Christopher Smith <chris@jalakai.co.uk>
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function getPageWords($text) {

        $tokens = $this->tokenizer($text);
        $tokens = array_count_values($tokens);  // count the frequency of each token

        $words = array();
        foreach ($tokens as $w => $c) {
            $l = wordlen($w);
            if (isset($words[$l])) {
                $words[$l][$w] = $c + (isset($words[$l][$w]) ? $words[$l][$w] : 0);
            } else {
                $words[$l] = array($w => $c);
            }
        }

        // arrive here with $words = array(wordlen => array(word => frequency))
        $index = array(); //resulting index
        foreach (array_keys($words) as $wlen) {
            $word_idx = $this->getIndex('w', $wlen);
            // tracked per word length so that only word indexes which really
            // received new words are written back
            $word_idx_modified = false;
            foreach ($words[$wlen] as $word => $freq) {
                // numeric tokens become integer array keys, compare them as strings again
                $word = (string)$word;
                $wid = array_search($word, $word_idx, true);
                if ($wid === false) {
                    // unknown word: append it, its id is its position in the word index
                    $wid = count($word_idx);
                    $word_idx[] = $word;
                    $word_idx_modified = true;
                }
                if (!isset($index[$wlen]))
                    $index[$wlen] = array();
                $index[$wlen][$wid] = $freq;
            }
            // save back the word index
            if ($word_idx_modified && !$this->saveIndex('w', $wlen, $word_idx))
                return false;
        }

        return $index;
    }

    /**
     * Add/update keys to/of the metadata index.
     *
     * Adding new keys does not remove other keys for the page.
     * An empty value will erase the key.
     * The $key parameter can be an array to add multiple keys. $value will
     * not be used if $key is an array.
     *
     * @param string    $page   a page name
     * @param mixed     $key    a key string or array of key=>value pairs
     * @param mixed     $value  the value or list of values
     * @return boolean|string   true on success, false when the page id could not be determined,
     *                          the string "locked" when the lock could not be obtained
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Michael Hamann <michael@content-space.de>
     */
    public function addMetaKeys($page, $key, $value=null) {
        if (!is_array($key)) {
            $key = array($key => $value);
        } elseif (!is_null($value)) {
            // $key is array, but $value is not null
            trigger_error("array passed to addMetaKeys but value is not null", E_USER_WARNING);
        }

        if (!$this->lock())
            return "locked";

        // load known documents
        $pid = $this->getPIDNoLock($page);
        if ($pid === false) {
            $this->unlock();
            return false;
        }

        // Special handling for titles so the index file is simpler
        if (array_key_exists('title', $key)) {
            $value = $key['title'];
            if (is_array($value)) {
                $value = $value[0];
            }
            $this->saveIndexKey('title', '', $pid, $value);
            unset($key['title']);
        }

        foreach ($key as $name => $values) {
            $metaname = idx_cleanName($name);
            $this->addIndexKey('metadata', '', $metaname);
            $metaidx = $this->getIndex($metaname.'_i', '');
            $metawords = $this->getIndex($metaname.'_w', '');
            $addwords = false;

            if (!is_array($values)) $values = array($values);

            $val_idx = $this->getIndexKey($metaname.'_p', '', $pid);
            if ($val_idx !== '') {
                $val_idx = explode(':', $val_idx);
                // -1 means remove, 0 keep, 1 add
                // every value currently stored for the page starts out as "remove"
                $val_idx = array_combine($val_idx, array_fill(0, count($val_idx), -1));
            } else {
                $val_idx = array();
            }

            foreach ($values as $val) {
                $val = (string)$val;
                if ($val !== "") {
                    $id = array_search($val, $metawords, true);
                    if ($id === false) {
                        // didn't find $val, so we'll add it to the end of metawords and create a placeholder in metaidx
                        $id = count($metawords);
                        $metawords[$id] = $val;
                        $metaidx[$id] = '';
                        $addwords = true;
                    }
                    // test if value is already in the index
                    if (isset($val_idx[$id]) && $val_idx[$id] <= 0) {
                        $val_idx[$id] = 0;
                    } else { // else add it
                        $val_idx[$id] = 1;
                    }
                }
            }

            if ($addwords) {
                $this->saveIndex($metaname.'_w', '', $metawords);
            }
            $vals_changed = false;
            foreach ($val_idx as $id => $action) {
                if ($action == -1) {
                    $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 0);
                    $vals_changed = true;
                    unset($val_idx[$id]);
                } elseif ($action == 1) {
                    $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 1);
                    $vals_changed = true;
                }
            }

            if ($vals_changed) {
                $this->saveIndex($metaname.'_i', '', $metaidx);
                // the remaining keys are the value ids that are kept or newly added
                $val_idx = implode(':', array_keys($val_idx));
                $this->saveIndexKey($metaname.'_p', '', $pid, $val_idx);
            }

            unset($metaidx);
            unset($metawords);
        }

        $this->unlock();
        return true;
    }

    /**
     * Rename a page in the search index without changing the indexed content. This function doesn't check if the
     * old or new name exists in the filesystem. It returns an error if the old page isn't in the page list of the
     * indexer and it deletes all previously indexed content of the new page.
     *
     * @param string $oldpage The old page name
     * @param string $newpage The new page name
     * @return string|bool If the page was successfully renamed, can be a message in the case of an error
     */
    public function renamePage($oldpage, $newpage) {
        if (!$this->lock()) return 'locked';

        $pages = $this->getPages();

        $id = array_search($oldpage, $pages, true);
        if ($id === false) {
            $this->unlock();
            return 'page is not in index';
        }

        $new_id = array_search($newpage, $pages, true);
        if ($new_id !== false) {
            // make sure the page is not in the index anymore
            if ($this->deletePageNoLock($newpage) !== true) {
                // release the lock on this error path as well, like every other return does
                $this->unlock();
                return false;
            }

            // replace the entry of the old target page by a placeholder name
            $pages[$new_id] = 'deleted:'.time().rand(0, 9999);
        }

        $pages[$id] = $newpage;

        // update index
        if (!$this->saveIndex('page', '', $pages)) {
            $this->unlock();
            return false;
        }

        // reset the pid cache
        $this->pidCache = array();

        $this->unlock();
        return true;
    }

/**
 * Renames a meta value in the index. This doesn't change the meta value in the pages, it assumes that all pages
 * will be updated.
 *
 * @param string $key       The metadata key of which a value shall be changed
 * @param string $oldvalue  The old value that shall be renamed
 * @param string $newvalue  The new value to which the old value shall be renamed, if exists values will be merged
 * @return bool|string      If renaming the value has been successful, false or error message on error.
 */
public function renameMetaValue($key, $oldvalue, $newvalue) {
    if (!$this->lock()) return 'locked';

    // change the relation references index
    // NOTE(review): the other call sites in this class pass the suffixed name as first
    // argument (e.g. $key.'_w', ''); confirm that both forms address the same index
    $metavalues = $this->getIndex($key, '_w');
    $oldid = array_search($oldvalue, $metavalues, true);
    if ($oldid !== false) {
        $newid = array_search($newvalue, $metavalues, true);
        if ($newid !== false) {
            // free memory
            unset($metavalues);

            // okay, now we have two entries for the same value. we need to merge them.
            $indexline = $this->getIndexKey($key.'_i', '', $oldid);
            if ($indexline != '') {
                $newindexline = $this->getIndexKey($key.'_i', '', $newid);
                $pagekeys     = $this->getIndex($key.'_p', '');
                $parts = explode(':', $indexline);
                foreach ($parts as $part) {
                    // each part is "id*count"
                    list($id, $count) = explode('*', $part);
                    $newindexline = $this->updateTuple($newindexline, $id, $count);

                    $keyline = explode(':', $pagekeys[$id]);
                    // remove old meta value
                    $keyline = array_diff($keyline, array($oldid));
                    // add new meta value when not already present
                    // (loose comparison on purpose: $keyline holds strings, $newid is an array key)
                    if (!in_array($newid, $keyline)) {
                        array_push($keyline, $newid);
                    }
                    $pagekeys[$id] = implode(':', $keyline);
                }
                $this->saveIndex($key.'_p', '', $pagekeys);
                unset($pagekeys);
                $this->saveIndexKey($key.'_i', '', $oldid, '');
                $this->saveIndexKey($key.'_i', '', $newid, $newindexline);
            }
        } else {
            // the new value is not known yet, so the old entry can simply be relabeled
            $metavalues[$oldid] = $newvalue;
            if (!$this->saveIndex($key.'_w', '', $metavalues)) {
                $this->unlock();
                return false;
            }
        }
    }

    $this->unlock();
    return true;
}

/**
 * Remove a page from the index
 *
 * Erases entries in all known indexes.
 *
 * @param string    $page   a page name
 * @return string|boolean   the result of deletePageNoLock(),
 *                          the string "locked" when the lock could not be obtained
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
public function deletePage($page) {
    if (!$this->lock())
        return "locked";

    $result = $this->deletePageNoLock($page);

    $this->unlock();

    return $result;
}

/**
 * Remove a page from the index without locking the index, only use this function if the index is already locked
 *
 * Erases entries in all known indexes.
 *
 * @param string    $page   a page name
 * @return boolean          the function completed successfully
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
protected function deletePageNoLock($page) {
    // load known documents
    $pid = $this->getPIDNoLock($page);
    if ($pid === false) {
        return false;
    }

    // Remove obsolete index entries
    $pageword_idx = $this->getIndexKey('pageword', '', $pid);
    if ($pageword_idx !== '') {
        $delwords = explode(':', $pageword_idx);
        // group the word ids by word length, one index file is used per length
        $upwords = array();
        foreach ($delwords as $word) {
            if ($word != '') {
                list($wlen, $wid) = explode('*', $word);
                $wid = (int)$wid;
                $upwords[$wlen][] = $wid;
            }
        }
        foreach ($upwords as $wlen => $widx) {
            $index = $this->getIndex('i', $wlen);
            foreach ($widx as $wid) {
                $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
            }
            $this->saveIndex('i', $wlen, $index);
        }
    }
    // Save the reverse index
    if (!$this->saveIndexKey('pageword', '', $pid, "")) {
        return false;
    }

    $this->saveIndexKey('title', '', $pid, "");
    $keyidx = $this->getIndex('metadata', '');
    foreach ($keyidx as $metaname) {
        $val_idx = explode(':', $this->getIndexKey($metaname.'_p', '', $pid));
        $meta_idx = $this->getIndex($metaname.'_i', '');
        foreach ($val_idx as $id) {
            // exploding an empty line yields a single empty string
            if ($id === '') continue;
            $meta_idx[$id] = $this->updateTuple($meta_idx[$id], $pid, 0);
        }
        $this->saveIndex($metaname.'_i', '', $meta_idx);
        $this->saveIndexKey($metaname.'_p', '', $pid, '');
    }

    return true;
}

/**
 * Clear the whole index
 *
 * Deletes the index files in $conf['indexdir'] and empties the PID cache.
 *
 * @return bool If the index has been cleared successfully, false when the lock could not be obtained
 */
public function clear() {
    global $conf;

    if (!$this->lock()) return false;

    @unlink($conf['indexdir'].'/page.idx');
    @unlink($conf['indexdir'].'/title.idx');
    @unlink($conf['indexdir'].'/pageword.idx');
    @unlink($conf['indexdir'].'/metadata.idx');
    $dir = @opendir($conf['indexdir']);
    if ($dir !== false) {
        while (($f = readdir($dir)) !== false) {
            if (substr($f, -4) == '.idx' &&
                (substr($f, 0, 1) == 'i' || substr($f, 0, 1) == 'w'
                || substr($f, -6) == '_w.idx' || substr($f, -6) == '_i.idx' || substr($f, -6) == '_p.idx'))
                @unlink($conf['indexdir']."/$f");
        }
        // release the directory handle again
        closedir($dir);
    }
    @unlink($conf['indexdir'].'/lengths.idx');

    // clear the pid cache
    $this->pidCache = array();

    $this->unlock();
    return true;
}

/**
 * Split the text into words for fulltext search
 *
 * TODO: does this also need &$stopwords ?
 *
 * @triggers INDEXER_TEXT_PREPARE
 * This event allows plugins to modify the text before it gets tokenized.
 * Plugins intercepting this event should also intercept INDEX_VERSION_GET
 *
 * @param string    $text   plain text
 * @param boolean   $wc     are wildcards allowed?
 * @return array            list of words in the text
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author Andreas Gohr <andi@splitbrain.org>
 */
public function tokenizer($text, $wc=false) {
    // when wildcards are not allowed, '\*' is appended to the third argument of stripspecials() below
    $wc = ($wc) ? '' : '\*';
    $stopwords =& idx_get_stopwords();

    // prepare the text to be tokenized
    $evt = new Event('INDEXER_TEXT_PREPARE', $text);
    if ($evt->advise_before(true)) {
        // text made of ASCII letters, digits and spaces only is passed on unchanged
        if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
            $text = \dokuwiki\Utf8\Asian::separateAsianWords($text);
        }
    }
    $evt->advise_after();
    unset($evt);

    $text = strtr($text,
        array(
            "\r" => ' ',
            "\n" => ' ',
            "\t" => ' ',
            "\xC2\xAD" => '', //soft-hyphen
        )
    );
    if (preg_match('/[^0-9A-Za-z ]/u', $text))
        $text = \dokuwiki\Utf8\Clean::stripspecials($text, ' ', '\._\-:'.$wc);

    $wordlist = explode(' ', $text);
    foreach ($wordlist as $i => $word) {
        // the UTF-8 helper is only used for words containing something besides ASCII letters and digits
        $wordlist[$i] = (preg_match('/[^0-9A-Za-z]/u', $word)) ?
            \dokuwiki\Utf8\PhpString::strtolower($word) : strtolower($word);
    }

    foreach ($wordlist as $i => $word) {
        // drop too short words (numbers are always kept) and stopwords
        if ((!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH)
            || array_search($word, $stopwords, true) !== false)
            unset($wordlist[$i]);
    }
    return array_values($wordlist);
}

/**
 * Get the numeric PID of a page
 *
 * @param string $page The page to get the PID for
 * @return bool|int The page id on success, false on error
 */
public function getPID($page) {
    // return PID without locking when it is in the cache
    if (isset($this->pidCache[$page])) return $this->pidCache[$page];

    if (!$this->lock())
        return false;

    // load known documents
    $pid = $this->getPIDNoLock($page);
    if ($pid === false) {
        $this->unlock();
        return false;
    }

    $this->unlock();
    return $pid;
}

/**
 * Get the numeric PID of a page without locking the index.
 * Only use this function when the index is already locked.
 *
 * Successful lookups are remembered in a small cache; failed lookups are
 * not cached so that a later call can try again.
 *
 * @param string $page The page to get the PID for
 * @return bool|int The page id on success, false on error
 */
protected function getPIDNoLock($page) {
    // avoid expensive addIndexKey operation for the most recently requested pages by using a cache
    if (isset($this->pidCache[$page])) return $this->pidCache[$page];
    $pid = $this->addIndexKey('page', '', $page);
    // never cache a failure: isset() is true for a cached false, so the error would stick
    if ($pid === false) return false;
    // limit cache to 10 entries by discarding the oldest element as in DokuWiki usually only the most recently
    // added item will be requested again
    if (count($this->pidCache) >= 10) {
        // array_shift() must not be used here: it renumbers integer keys, and numeric page
        // names are stored as integer keys, so they would end up pointing to the wrong PID
        reset($this->pidCache);
        unset($this->pidCache[key($this->pidCache)]);
    }
    $this->pidCache[$page] = $pid;
    return $pid;
}

/**
 * Get the page id of a numeric PID
 *
 * @param int $pid The PID to get the page id for
 * @return string The page id
 */
public function getPageFromPID($pid) {
    return $this->getIndexKey('page', '', $pid);
}

/**
 * Find pages in the fulltext index containing the words,
 *
 * The search words must be pre-tokenized, meaning only letters and
 * numbers with an optional wildcard
 *
 * The returned array will have the original tokens as key. The values
 * in the returned list is an array with the page names as keys and the
 * number of times that token appears on the page as value.
 *
 * @param array  $tokens list of words to search for
 * @return array         list of page names with usage counts
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author Andreas Gohr <andi@splitbrain.org>
 */
public function lookup(&$tokens) {
    $result = array();
    // $result is filled by getIndexWords() and maps each token to a list of "wordlength*wordid" keys
    $wids = $this->getIndexWords($tokens, $result);
    if (empty($wids)) return array();
    // load known words and documents
    $page_idx = $this->getIndex('page', '');
    $docs = array();
    foreach (array_keys($wids) as $wlen) {
        $wids[$wlen] = array_unique($wids[$wlen]);
        $index = $this->getIndex('i', $wlen);
        foreach ($wids[$wlen] as $ixid) {
            if ($ixid < count($index))
                $docs["$wlen*$ixid"] = $this->parseTuples($page_idx, $index[$ixid]);
        }
    }
    // merge found pages into final result array
    $final = array();
    foreach ($result as $word => $res) {
        $final[$word] = array();
        foreach ($res as $wid) {
            // handle the case when ($ixid < count($index)) has been false
            // and thus $docs[$wid] hasn't been set.
            if (!isset($docs[$wid])) continue;
            $hits = &$docs[$wid];
            foreach ($hits as $hitkey => $hitcnt) {
                // make sure the document still exists
                if (!page_exists($hitkey, '', false)) continue;
                if (!isset($final[$word][$hitkey]))
                    $final[$word][$hitkey] = $hitcnt;
                else
                    $final[$word][$hitkey] += $hitcnt;
            }
        }
    }
    return $final;
}

/**
 * Find pages containing a metadata key.
 *
 * The metadata values are compared as case-sensitive strings. Pass a
 * callback function that returns true or false to use a different
 * comparison function. The function will be called with the $value being
 * searched for as the first argument, and the word in the index as the
 * second argument. The function preg_match can be used directly if the
 * values are regexes.
*
 * A search term may start and/or end with '*' to match as a suffix, prefix or
 * substring when no callback is given.
 *
 * @param string   $key   name of the metadata key to look for
 * @param string   $value search term to look for, must be a string or array of strings
 * @param callback $func  comparison function
 * @return array lists with page names, keys are query values if $value is array
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author Michael Hamann <michael@content-space.de>
 */
public function lookupKey($key, &$value, $func=null) {
    if (!is_array($value))
        $value_array = array($value);
    else
        $value_array =& $value;

    // the matching ids for the provided value(s)
    $value_ids = array();

    $metaname = idx_cleanName($key);

    // get all words in order to search the matching ids
    if ($key == 'title') {
        $words = $this->getIndex('title', '');
    } else {
        $words = $this->getIndex($metaname.'_w', '');
    }

    if (!is_null($func)) {
        foreach ($value_array as $val) {
            foreach ($words as $i => $word) {
                if (call_user_func_array($func, array($val, $word)))
                    $value_ids[$i][] = $val;
            }
        }
    } else {
        foreach ($value_array as $val) {
            $xval = $val;
            $caret = '^';
            $dollar = '$';
            // check for wildcards
            if (substr($xval, 0, 1) == '*') {
                $xval = substr($xval, 1);
                $caret = '';
            }
            if (substr($xval, -1, 1) == '*') {
                $xval = substr($xval, 0, -1);
                $dollar = '';
            }
            if (!$caret || !$dollar) {
                $re = $caret.preg_quote($xval, '/').$dollar;
                foreach (array_keys(preg_grep('/'.$re.'/', $words)) as $i)
                    $value_ids[$i][] = $val;
            } else {
                // collect every matching line, not only the first one: in the title
                // index each line belongs to one page, so several pages sharing the
                // same title are several matches
                foreach (array_keys($words, $val, true) as $i)
                    $value_ids[$i][] = $val;
            }
        }
    }

    unset($words); // free the used memory

    // initialize the result so it won't be null
    $result = array();
    foreach ($value_array as $val) {
        $result[$val] = array();
    }

    $page_idx = $this->getIndex('page', '');

    // Special handling for titles
    if ($key == 'title') {
        // here the line number in the title index is used as the page id
        foreach ($value_ids as $pid => $val_list) {
            // the title index may have more lines than the page index
            if (!isset($page_idx[$pid])) continue;
            $page = $page_idx[$pid];
            foreach ($val_list as $val) {
                $result[$val][] = $page;
            }
        }
    } else {
        // load all lines and pages so the used lines can be taken and matched with the pages
        $lines = $this->getIndex($metaname.'_i', '');

        foreach ($value_ids as $value_id => $val_list) {
            // a word without a line in the _i index is used on no page
            if (!isset($lines[$value_id])) continue;
            // parse the tuples of the form page_id*1:page2_id*1 and so on, return value
            // is an array with page_id => 1, page2_id => 1 etc. so take the keys only
            $pages = array_keys($this->parseTuples($page_idx, $lines[$value_id]));
            foreach ($val_list as $val) {
                $result[$val] = array_merge($result[$val], $pages);
            }
        }
    }
    if (!is_array($value)) $result = $result[$value];
    return $result;
}

/**
 * Find the index ID of each search term.
 *
 * The query terms should only contain valid characters, with a '*' at
 * either the beginning or end of the word (or both).
 * The $result parameter can be used to merge the index locations with
 * the appropriate query term.
 *
 * @param array $words  The query terms.
 * @param array $result Set to word => array("length*id" ...)
 * @return array         Set to length => array(id ...)
*
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
protected function getIndexWords(&$words, &$result) {
    $tokens = array();      // base word => array( [ query term , regexp ] ... )
    $tokenlength = array(); // base word length => array( base word ... )
    $tokenwild = array();   // base word => base word length (wildcard terms only)

    foreach ($words as $word) {
        $result[$word] = array();

        $xword = $word;
        $wlen = wordlen($word);
        $caret = '^';
        $dollar = '$';

        // check for wildcards: strip them and drop the regexp anchor on that side
        if (substr($xword, 0, 1) == '*') {
            $xword = substr($xword, 1);
            $wlen -= 1;
            $caret = '';
        }
        if (substr($xword, -1, 1) == '*') {
            $xword = substr($xword, 0, -1);
            $wlen -= 1;
            $dollar = '';
        }
        $wildcard = (!$caret || !$dollar);

        // exact terms below the minimum word length are ignored unless they are numeric
        if (!$wildcard && $wlen < IDX_MINWORDLENGTH && !is_numeric($xword)) {
            continue;
        }

        // every base word is registered once under its length, wildcard or not
        if (!isset($tokens[$xword])) {
            $tokenlength[$wlen][] = $xword;
        }

        if (!$wildcard) {
            $tokens[$xword][] = array($word, null);
            continue;
        }

        $re = $caret.preg_quote($xword, '/').$dollar;
        $tokens[$xword][] = array($word, '/'.$re.'/');
        if (!isset($tokenwild[$xword])) {
            $tokenwild[$xword] = $wlen;
        }
    }

    // shortest wildcard base words first, so the loop below can stop early
    asort($tokenwild);

    if (empty($tokenwild)) {
        // exact terms only: just the lengths that were asked for
        $indexes_known = $this->indexLengths($tokenlength);
    } else {
        // with wildcards every length from the shortest base word upwards is a candidate
        $indexes_known = $this->indexLengths(min(array_keys($tokenlength)));
        sort($indexes_known);
    }

    // get word IDs
    $wids = array();
    foreach ($indexes_known as $ixlen) {
        $word_idx = $this->getIndex('w', $ixlen);

        // handle exact search
        $sameLength = isset($tokenlength[$ixlen]) ? $tokenlength[$ixlen] : array();
        foreach ($sameLength as $xword) {
            $wid = array_search($xword, $word_idx, true);
            if ($wid === false) continue;

            $wids[$ixlen][] = $wid;
            foreach ($tokens[$xword] as $w) {
                $result[$w[0]][] = "$ixlen*$wid";
            }
        }

        // handle wildcard search, only in indexes of words longer than the base word
        foreach ($tokenwild as $xword => $wlen) {
            if ($wlen >= $ixlen) break;

            foreach ($tokens[$xword] as $w) {
                // exact query terms sharing this base word carry no regexp
                if (is_null($w[1])) continue;

                $matches = array_keys(preg_grep($w[1], $word_idx));
                foreach ($matches as $wid) {
                    $wids[$ixlen][] = $wid;
                    $result[$w[0]][] = "$ixlen*$wid";
                }
            }
        }
    }
    return $wids;
}
/**
 * Return a list of all pages
 * Warning: pages may not exist!
 *
 * Without a key the complete page index is returned. With a key only the
 * pages listed in the index of that metadata key are returned.
 *
 * @param string    $key    list only pages containing the metadata key (optional)
 * @return array            list of page names
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
public function getPages($key=null) {
    $page_idx = $this->getIndex('page', '');
    if (is_null($key)) return $page_idx;

    $metaname = idx_cleanName($key);

    // Special handling for titles
    if ($key == 'title') {
        $title_idx = $this->getIndex('title', '');
        // pages beyond the end of the title index have no title
        array_splice($page_idx, count($title_idx));
        foreach ($title_idx as $i => $title)
            if ($title === "") unset($page_idx[$i]);
        return array_values($page_idx);
    }

    $pages = array();
    $lines = $this->getIndex($metaname.'_i', '');
    foreach ($lines as $line) {
        // the array union keeps the page names (the keys) untouched. array_merge()
        // would renumber integer keys, which is what purely numeric page names
        // become when used as array keys, and return 0, 1, 2... instead of them
        $pages += $this->parseTuples($page_idx, $line);
    }
    // numeric page names come back from array_keys() as integers, return strings throughout
    return array_map('strval', array_keys($pages));
}

/**
 * Return a list of words sorted by number of times used
 *
 * @param int       $min    bottom frequency threshold
 * @param int       $max    upper frequency limit.
*                          No limit if $max<$min
 * @param int       $minlen minimum length of words to count
 * @param string    $key    metadata key to list. Uses the fulltext index if not given
 * @return array            list of words as the keys and frequency as values
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
public function histogram($min=1, $max=0, $minlen=3, $key=null) {
    if ($min < 1)
        $min = 1;
    if ($max < $min)
        $max = 0; // 0 disables the upper limit in the checks below

    $result = array();

    if ($key == 'title') {
        // titles are counted directly: how many pages carry the same title
        $index = $this->getIndex('title', '');
        $index = array_count_values($index);
        foreach ($index as $val => $cnt) {
            if ($cnt >= $min && (!$max || $cnt <= $max) && strlen((string)$val) >= $minlen)
                $result[$val] = $cnt;
        }
    }
    elseif (!is_null($key)) {
        $metaname = idx_cleanName($key);
        $index = $this->getIndex($metaname.'_i', '');
        $val_idx = array();
        foreach ($index as $wid => $line) {
            $freq = $this->countTuples($line);
            if ($freq >= $min && (!$max || $freq <= $max))
                $val_idx[$wid] = $freq;
        }
        // the word list is only loaded when at least one value passed the filter
        if (!empty($val_idx)) {
            $words = $this->getIndex($metaname.'_w', '');
            foreach ($val_idx as $wid => $freq) {
                // the word list may be shorter than the frequency index, skip such
                // lines instead of reporting them under an empty word
                if (!isset($words[$wid])) continue;
                if (strlen($words[$wid]) >= $minlen)
                    $result[$words[$wid]] = $freq;
            }
        }
    }
    else {
        $lengths = $this->listIndexLengths();
        foreach ($lengths as $length) {
            if ($length < $minlen) continue;
            $index = $this->getIndex('i', $length);
            // the word list of this length is loaded lazily on the first hit
            $words = null;
            foreach ($index as $wid => $line) {
                $freq = $this->countTuples($line);
                if ($freq >= $min && (!$max || $freq <= $max)) {
                    if ($words === null)
                        $words = $this->getIndex('w', $length);
                    // same guard as above for a word list shorter than the index
                    if (!isset($words[$wid])) continue;
                    $result[$words[$wid]] = $freq;
                }
            }
        }
    }

    arsort($result);
    return $result;
}

/**
 * Lock the indexer.
*
 * The lock is a directory below the configured lock directory; creating it
 * succeeds only for one process at a time.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @return bool|string true or the string "stale lock removed" when the lock was
 *                     acquired, false when it could not be acquired
 */
protected function lock() {
    global $conf;

    $lockDir = $conf['lockdir'].'/_indexer.lock';
    $outcome = true;
    $attempts = 0;

    while (!@mkdir($lockDir, $conf['dmode'])) {
        usleep(50);

        $stale = is_dir($lockDir) && (time() - @filemtime($lockDir) > 60*5);
        if ($stale) {
            // looks like a stale lock (older than five minutes) - remove it and retry
            if (!@rmdir($lockDir)) {
                return false;
            }
            $outcome = "stale lock removed";
            continue;
        }

        // give up once the retry budget is used up.
        // NOTE(review): the pauses add up to roughly 1000 * 50 microseconds = 50ms,
        // the previous comment here claimed 5 seconds - confirm which one is intended
        if ($attempts++ == 1000) {
            return false;
        }
    }

    if (!empty($conf['dperm'])) {
        chmod($lockDir, $conf['dperm']);
    }
    return $outcome;
}

/**
 * Release the indexer lock.
*
 * Removes the lock directory; a failure to remove it is ignored.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @return bool always true
 */
protected function unlock() {
    global $conf;
    @rmdir($conf['lockdir'].'/_indexer.lock');
    return true;
}

/**
 * Retrieve the entire index.
 *
 * The $suffix argument is for an index that is split into
 * multiple parts. Different index files should use different
 * base names.
 *
 * @param string    $idx    name of the index
 * @param string    $suffix subpart identifier
 * @return array            list of lines without the trailing newline, empty when
 *                          the index file does not exist or cannot be read
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
protected function getIndex($idx, $suffix) {
    global $conf;
    $fn = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
    if (!file_exists($fn)) return array();

    // file() returns false when the file cannot be read (e.g. it vanished after the
    // check above or is not readable). All callers count or iterate the result, so an
    // unreadable index is reported like a missing one.
    $lines = file($fn, FILE_IGNORE_NEW_LINES);
    if ($lines === false) return array();
    return $lines;
}

/**
 * Replace the contents of the index with an array.
*
 * The lines are written to a temporary file first, which is then renamed
 * over the index file.
 *
 * @param string    $idx    name of the index
 * @param string    $suffix subpart identifier
 * @param array     $lines  list of lines without LF
 * @return bool             If saving succeeded
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
protected function saveIndex($idx, $suffix, &$lines) {
    global $conf;
    $fn = $conf['indexdir'].'/'.$idx.$suffix;
    $fh = @fopen($fn.'.tmp', 'w');
    if (!$fh) return false;

    // every line is terminated by a LF, an empty index becomes an empty file
    $data = join("\n", $lines);
    if (!empty($lines))
        $data .= "\n";

    $written = fwrite($fh, $data);
    $closed = fclose($fh);
    if ($written !== strlen($data) || !$closed) {
        // an incomplete temporary file (e.g. disk full) must never replace the index
        @unlink($fn.'.tmp');
        return false;
    }

    if (isset($conf['fperm']))
        chmod($fn.'.tmp', $conf['fperm']);
    // NOTE(review): io_rename() is assumed to return false on failure - confirm.
    // Any other return value (including none) is treated as success as before.
    return io_rename($fn.'.tmp', $fn.'.idx') !== false;
}

/**
 * Retrieve a line from the index.
*
 * Lines are counted from 0. An empty string is returned when the index file
 * is missing, cannot be opened or has no line with the requested number.
 *
 * @param string    $idx    name of the index
 * @param string    $suffix subpart identifier
 * @param int       $id     the line number
 * @return string           a line with trailing whitespace removed
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
protected function getIndexKey($idx, $suffix, $id) {
    global $conf;

    $path = $conf['indexdir'].'/'.$idx.$suffix.'.idx';
    if (!file_exists($path)) {
        return '';
    }
    $handle = @fopen($path, 'r');
    if (!$handle) {
        return '';
    }

    // read line by line until the wanted line number or the end of the file is
    // reached; at the end of the file $line is false and becomes '' below
    $current = -1;
    do {
        $line = fgets($handle);
    } while ($line !== false && ++$current != $id);

    fclose($handle);
    return rtrim((string)$line);
}

/**
 * Write a line into the index.
*
 * The index is copied line by line into a temporary file with line $id
 * replaced, padded with empty lines when $id is beyond the current end, and
 * the temporary file is then renamed over the index file.
 *
 * @param string    $idx    name of the index
 * @param string    $suffix subpart identifier
 * @param int       $id     the line number
 * @param string    $line   line to write
 * @return bool             If saving succeeded
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
protected function saveIndexKey($idx, $suffix, $id, $line) {
    global $conf;
    if (substr($line, -1) != "\n")
        $line .= "\n";
    $fn = $conf['indexdir'].'/'.$idx.$suffix;
    $fh = @fopen($fn.'.tmp', 'w');
    if (!$fh) return false;

    // becomes false as soon as a single write fails
    $ok = true;

    $ih = @fopen($fn.'.idx', 'r');
    if ($ih) {
        // copy the existing index, replacing the requested line on the way
        $ln = -1;
        while (($curline = fgets($ih)) !== false) {
            if (fwrite($fh, (++$ln == $id) ? $line : $curline) === false) $ok = false;
        }
        // the requested line is beyond the end: pad with empty lines and append
        if ($id > $ln) {
            while ($id > ++$ln) {
                if (fwrite($fh, "\n") === false) $ok = false;
            }
            if (fwrite($fh, $line) === false) $ok = false;
        }
        fclose($ih);
    } else {
        // no readable index yet: create one with empty lines up to the requested line
        $ln = -1;
        while ($id > ++$ln) {
            if (fwrite($fh, "\n") === false) $ok = false;
        }
        if (fwrite($fh, $line) === false) $ok = false;
    }

    if (!fclose($fh)) $ok = false;
    if (!$ok) {
        // an incomplete temporary file (e.g. disk full) must never replace the index
        @unlink($fn.'.tmp');
        return false;
    }

    if (isset($conf['fperm']))
        chmod($fn.'.tmp', $conf['fperm']);
    // NOTE(review): io_rename() is assumed to return false on failure - confirm.
    // Any other return value (including none) is treated as success as before.
    return io_rename($fn.'.tmp', $fn.'.idx') !== false;
}

/**
 * Retrieve or insert a value in the index.
 *
 * A value that is not in the index yet is appended as a new last line and
 * the complete index is written back.
 *
 * @param string    $idx    name of the index
 * @param string    $suffix subpart identifier
 * @param string    $value  line to find in the index
 * @return int|bool          line number of the value in the index or false if writing the index failed
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
protected function addIndexKey($idx, $suffix, $value) {
    $index = $this->getIndex($idx, $suffix);
    $id = array_search($value, $index, true);
    if ($id === false) {
        $id = count($index);
        $index[$id] = $value;
        if (!$this->saveIndex($idx, $suffix, $index)) {
            trigger_error("Failed to write $idx index", E_USER_ERROR);
            return false;
        }
    }
    return $id;
}

/**
 * Get the list of lengths indexed in the wiki.
 *
 * Read the index directory or a cache file and returns
 * a sorted array of lengths of the words used in the wiki.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @return array
 */
protected function listIndexLengths() {
    return idx_listIndexLengths();
}

/**
 * Get the word lengths that have been indexed.
 *
 * Reads the index directory and returns an array of lengths
 * that there are indices for.
*
 * An array filter is read as length => anything: the lengths (its keys) that
 * have an index file are returned. Any other filter is read as a minimum
 * length: all indexed lengths equal or above it are returned.
 *
 * @author YoBoY <yoboy.leguesh@gmail.com>
 *
 * @param array|int $filter
 * @return array
 */
protected function indexLengths($filter) {
    global $conf;
    $found = array();

    if (!is_array($filter)) {
        // keep all the lengths equal or superior to the requested minimum
        $minimum = (int)$filter;
        foreach (idx_listIndexLengths() as $length) {
            if ((int)$length >= $minimum) {
                $found[] = $length;
            }
        }
        return $found;
    }

    // testing if index files exist only
    $prefix = $conf['indexdir']."/i";
    foreach (array_keys($filter) as $length) {
        if (file_exists($prefix.$length.'.idx')) {
            $found[] = $length;
        }
    }
    return $found;
}

/**
 * Insert or replace a tuple in a line.
*
 * A line is a colon separated list of id*count tuples. Any tuple of the given
 * id is removed first; unless the count is empty the new tuple is then put in
 * front of the remaining ones.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $line
 * @param string|int $id
 * @param int    $count
 * @return string
 */
protected function updateTuple($line, $id, $count) {
    $rest = $line;
    if ($rest != '') {
        // the id has to start the line or follow a colon, so that id 3 does not match 13*...
        $pattern = '/(^|:)'.preg_quote($id,'/').'\*\d*/';
        $rest = preg_replace($pattern, '', $rest);
    }
    // removing the first tuple leaves the separator of the next one behind
    $rest = trim($rest, ':');

    if (!$count) {
        return $rest;
    }

    $tuple = "$id*$count";
    return $rest ? $tuple.':'.$rest : $tuple;
}

/**
 * Split a line into an array of tuples.
*
 * A line is a colon separated list of id*count tuples. Every id is translated
 * through $keys; tuples with an empty count, without a count or with an id
 * that is not in $keys are left out.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param array $keys  list to translate the ids with, id => name
 * @param string $line
 * @return array       name => count
 */
protected function parseTuples(&$keys, $line) {
    $result = array();
    if ($line == '') return $result;
    $parts = explode(':', $line);
    foreach ($parts as $tuple) {
        if ($tuple === '') continue;
        $fields = explode('*', $tuple);
        // skip tuples without a count or with a zero count; checking the field first
        // avoids the undefined offset warning a tuple without '*' used to raise
        if (!isset($fields[1]) || !$fields[1]) continue;
        list($key, $cnt) = $fields;
        // ids unknown to $keys are skipped, again without raising a warning
        if (!isset($keys[$key])) continue;
        $name = $keys[$key];
        if ($name === false) continue;
        $result[$name] = $cnt;
    }
    return $result;
}

/**
 * Sum the counts in a list of tuples.
*
 * A line is a colon separated list of id*count tuples; the ids are ignored.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 *
 * @param string $line
 * @return int
 */
protected function countTuples($line) {
    $freq = 0;
    $parts = explode(':', (string)$line);
    foreach ($parts as $tuple) {
        if ($tuple === '') continue;
        $fields = explode('*', $tuple);
        // a tuple without '*' has no count and adds nothing; checking the field
        // avoids the undefined offset warning it used to raise
        if (isset($fields[1])) {
            $freq += (int)$fields[1];
        }
    }
    return $freq;
}
}