<?php

namespace dokuwiki\Search;

use dokuwiki\Utf8\Asian;
use dokuwiki\Utf8\Clean;
use dokuwiki\Utf8\PhpString;
use dokuwiki\Extension\Event;

/**
 * Class that encapsulates operations on the indexer database.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 */
class Indexer
{
    /**
     * @var array $pidCache Cache for getPID(), maps page name => numeric PID
     */
    protected $pidCache = [];

    /**
     * Adds the contents of a page to the fulltext index
     *
     * The added text replaces previous words for the same page.
     * An empty value erases the page.
     *
     * The index lock is held for the whole operation and released on every
     * return path.
     *
     * @param string $page a page name
     * @param string $text the body of the page
     * @return string|boolean "locked" when the lock could not be obtained,
     *                        false on error, true when the function completed successfully
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function addPageWords($page, $text)
    {
        if (!$this->lock()) {
            return "locked";
        }

        // load known documents
        $pid = $this->getPIDNoLock($page);
        if ($pid === false) {
            $this->unlock();
            return false;
        }

        // list of "wordlength*wordid" entries that make up the new reverse index of the page
        $pagewords = [];
        // get word usage in page
        $words = $this->getPageWords($text);
        if ($words === false) {
            $this->unlock();
            return false;
        }

        if (!empty($words)) {
            foreach (array_keys($words) as $wlen) {
                $index = $this->getIndex('i', $wlen);
                foreach ($words[$wlen] as $wid => $freq) {
                    // words that were just added to the word list have no line in the index yet
                    $idx = ($wid < count($index)) ? $index[$wid] : '';
                    $index[$wid] = $this->updateTuple($idx, $pid, $freq);
                    $pagewords[] = "$wlen*$wid";
                }
                if (!$this->saveIndex('i', $wlen, $index)) {
                    $this->unlock();
                    return false;
                }
            }
        }

        // Remove obsolete index entries
        $pageword_idx = $this->getIndexKey('pageword', '', $pid);
        if ($pageword_idx !== '') {
            $oldwords = explode(':', $pageword_idx);
            // words that were on the page before but are not part of the new text
            $delwords = array_diff($oldwords, $pagewords);
            // group the obsolete word ids by word length so each index file is loaded only once
            $upwords = [];
            foreach ($delwords as $word) {
                if ($word != '') {
                    [$wlen, $wid] = explode('*', $word);
                    $wid = (int)$wid;
                    $upwords[$wlen][] = $wid;
                }
            }
            foreach ($upwords as $wlen => $widx) {
                $index = $this->getIndex('i', $wlen);
                foreach ($widx as $wid) {
                    // a count of 0 is passed to drop this page from the word's tuple list
                    $index[$wid] = $this->updateTuple($index[$wid], $pid, 0);
                }
                $this->saveIndex('i', $wlen, $index);
            }
        }
        // Save the reverse index
        $pageword_idx = implode(':', $pagewords);
        if (!$this->saveIndexKey('pageword', '', $pid, $pageword_idx)) {
            $this->unlock();
            return false;
        }

        $this->unlock();
        return true;
    }

    /**
     * Split the words in a page and add them to the index.
     *
     * Tokens that are not yet part of the word list of their length are
     * appended to it, and the modified word list is saved back.
     *
     * @param string $text content of the page
     * @return array|false list of word IDs and number of times used in the form
     *                     array(wordlen => array(wordid => frequency)),
     *                     false when a word index could not be saved
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Christopher Smith <chris@jalakai.co.uk>
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function getPageWords($text)
    {

        $tokens = $this->tokenizer($text);
        $tokens = array_count_values($tokens);  // count the frequency of each token

        $words = [];
        foreach ($tokens as $w => $c) {
            $l = wordlen($w);
            if (isset($words[$l])) {
                $words[$l][$w] = $c + ($words[$l][$w] ?? 0);
            } else {
                $words[$l] = [$w => $c];
            }
        }

        // arrive here with $words = array(wordlen => array(word => frequency))
        $index = []; //resulting index
        foreach (array_keys($words) as $wlen) {
            $word_idx = $this->getIndex('w', $wlen);
            $word_idx_modified = false;
            foreach ($words[$wlen] as $word => $freq) {
                // numeric words were turned into integer array keys, the strict search needs strings
                $word = (string)$word;
                $wid = array_search($word, $word_idx, true);
                if ($wid === false) {
                    // unknown word: append it, its id is its position in the word list
                    $wid = count($word_idx);
                    $word_idx[] = $word;
                    $word_idx_modified = true;
                }
                if (!isset($index[$wlen])) {
                    $index[$wlen] = [];
                }
                $index[$wlen][$wid] = $freq;
            }
            // save back the word index
            if ($word_idx_modified && !$this->saveIndex('w', $wlen, $word_idx)) {
                return false;
            }
        }

        return $index;
    }

    /**
     * Add/update keys to/of the metadata index.
     *
     * Adding new keys does not remove other keys for the page.
     * An empty value will erase the key.
     * The $key parameter can be an array to add multiple keys. $value will
     * not be used if $key is an array.
     *
     * @param string $page a page name
     * @param mixed $key a key string or array of key=>value pairs
     * @param mixed $value the value or list of values
     * @return boolean|string "locked" when the lock could not be obtained,
     *                        false on error, true when the function completed successfully
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Michael Hamann <michael@content-space.de>
     */
    public function addMetaKeys($page, $key, $value = null)
    {
        if (!is_array($key)) {
            $key = [$key => $value];
        } elseif (!is_null($value)) {
            // $key is array, but $value is not null
            trigger_error("array passed to addMetaKeys but value is not null", E_USER_WARNING);
        }

        if (!$this->lock()) {
            return "locked";
        }

        // load known documents
        $pid = $this->getPIDNoLock($page);
        if ($pid === false) {
            $this->unlock();
            return false;
        }

        // Special handling for titles so the index file is simpler
        if (isset($key['title'])) {
            $value = $key['title'];
            if (is_array($value)) {
                $value = $value[0];
            }
            $this->saveIndexKey('title', '', $pid, $value);
            unset($key['title']);
        }

        foreach ($key as $name => $values) {
            $metaname = idx_cleanName($name);
            $this->addIndexKey('metadata', '', $metaname);
            $metaidx = $this->getIndex($metaname.'_i', '');
            $metawords = $this->getIndex($metaname.'_w', '');
            $addwords = false;

            if (!is_array($values)) {
                $values = [$values];
            }

            // value ids currently stored for this page, mapped to the action to perform on them
            $val_idx = $this->getIndexKey($metaname.'_p', '', $pid);
            if ($val_idx !== '') {
                $val_idx = explode(':', $val_idx);
                // -1 means remove, 0 keep, 1 add
                $val_idx = array_combine($val_idx, array_fill(0, count($val_idx), -1));
            } else {
                $val_idx = [];
            }

            foreach ($values as $val) {
                $val = (string)$val;
                if ($val !== "") {
                    $id = array_search($val, $metawords, true);
                    if ($id === false) {
                        // didn't find $val, so we'll add it to the end of metawords and create a placeholder in metaidx
                        $id = count($metawords);
                        $metawords[$id] = $val;
                        $metaidx[$id] = '';
                        $addwords = true;
                    }
                    // test if value is already in the index
                    if (isset($val_idx[$id]) && $val_idx[$id] <= 0) {
                        $val_idx[$id] = 0;
                    } else { // else add it
                        $val_idx[$id] = 1;
                    }
                }
            }

            if ($addwords) {
                $this->saveIndex($metaname.'_w', '', $metawords);
            }
            $vals_changed = false;
            foreach ($val_idx as $id => $action) {
                if ($action == -1) {
                    // value is no longer set for the page: drop the page from the value's tuple list
                    $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 0);
                    $vals_changed = true;
                    unset($val_idx[$id]);
                } elseif ($action == 1) {
                    $metaidx[$id] = $this->updateTuple($metaidx[$id], $pid, 1);
                    $vals_changed = true;
                }
            }

            if ($vals_changed) {
                $this->saveIndex($metaname.'_i', '', $metaidx);
                $val_idx = implode(':', array_keys($val_idx));
                $this->saveIndexKey($metaname.'_p', '', $pid, $val_idx);
            }

            // free memory before the next key is processed
            unset($metaidx);
            unset($metawords);
        }

        $this->unlock();
        return true;
    }

    /**
     * Rename a page in the search index without changing the indexed content. This function doesn't check if the
     * old or new name exists in the filesystem. It returns an error if the old page isn't in the page list of the
     * indexer and it deletes all previously indexed content of the new page.
     *
     * The index lock is released on every return path, including the one taken
     * when the content of the new page cannot be deleted.
     *
     * @param string $oldpage The old page name
     * @param string $newpage The new page name
     * @return string|bool If the page was successfully renamed, can be a message in the case of an error
     */
    public function renamePage($oldpage, $newpage)
    {
        if (!$this->lock()) {
            return 'locked';
        }

        $pages = $this->getPages();

        $id = array_search($oldpage, $pages, true);
        if ($id === false) {
            $this->unlock();
            return 'page is not in index';
        }

        $new_id = array_search($newpage, $pages, true);
        if ($new_id !== false) {
            // make sure the page is not in the index anymore
            if (!$this->deletePageNoLock($newpage)) {
                // the lock must not stay behind when bailing out here
                $this->unlock();
                return false;
            }

            // overwrite the old slot of the new page with a placeholder name
            $pages[$new_id] = 'deleted:'.time().random_int(0, 9999);
        }

        $pages[$id] = $newpage;

        // update index
        if (!$this->saveIndex('page', '', $pages)) {
            $this->unlock();
            return false;
        }

        // reset the pid cache
        $this->pidCache = [];

        $this->unlock();
        return true;
    }

311*6225b270SMichael Große /** 312*6225b270SMichael Große * Renames a meta value in the index. This doesn't change the meta value in the pages, it assumes that all pages 313*6225b270SMichael Große * will be updated. 314*6225b270SMichael Große * 315*6225b270SMichael Große * @param string $key The metadata key of which a value shall be changed 316*6225b270SMichael Große * @param string $oldvalue The old value that shall be renamed 317*6225b270SMichael Große * @param string $newvalue The new value to which the old value shall be renamed, if exists values will be merged 318*6225b270SMichael Große * @return bool|string If renaming the value has been successful, false or error message on error. 319*6225b270SMichael Große */ 320d868eb89SAndreas Gohr public function renameMetaValue($key, $oldvalue, $newvalue) 321d868eb89SAndreas Gohr { 322*6225b270SMichael Große if (!$this->lock()) return 'locked'; 323*6225b270SMichael Große 324*6225b270SMichael Große // change the relation references index 325*6225b270SMichael Große $metavalues = $this->getIndex($key, '_w'); 326*6225b270SMichael Große $oldid = array_search($oldvalue, $metavalues, true); 327*6225b270SMichael Große if ($oldid !== false) { 328*6225b270SMichael Große $newid = array_search($newvalue, $metavalues, true); 329*6225b270SMichael Große if ($newid !== false) { 330*6225b270SMichael Große // free memory 331*6225b270SMichael Große unset ($metavalues); 332*6225b270SMichael Große 333*6225b270SMichael Große // okay, now we have two entries for the same value. we need to merge them. 
334*6225b270SMichael Große $indexline = $this->getIndexKey($key.'_i', '', $oldid); 335*6225b270SMichael Große if ($indexline != '') { 336*6225b270SMichael Große $newindexline = $this->getIndexKey($key.'_i', '', $newid); 337*6225b270SMichael Große $pagekeys = $this->getIndex($key.'_p', ''); 338*6225b270SMichael Große $parts = explode(':', $indexline); 339*6225b270SMichael Große foreach ($parts as $part) { 34024870174SAndreas Gohr [$id, $count] = explode('*', $part); 341*6225b270SMichael Große $newindexline = $this->updateTuple($newindexline, $id, $count); 342*6225b270SMichael Große 343*6225b270SMichael Große $keyline = explode(':', $pagekeys[$id]); 344*6225b270SMichael Große // remove old meta value 34524870174SAndreas Gohr $keyline = array_diff($keyline, [$oldid]); 346*6225b270SMichael Große // add new meta value when not already present 347*6225b270SMichael Große if (!in_array($newid, $keyline)) { 34824870174SAndreas Gohr $keyline[] = $newid; 349*6225b270SMichael Große } 350*6225b270SMichael Große $pagekeys[$id] = implode(':', $keyline); 351*6225b270SMichael Große } 352*6225b270SMichael Große $this->saveIndex($key.'_p', '', $pagekeys); 353*6225b270SMichael Große unset($pagekeys); 354*6225b270SMichael Große $this->saveIndexKey($key.'_i', '', $oldid, ''); 355*6225b270SMichael Große $this->saveIndexKey($key.'_i', '', $newid, $newindexline); 356*6225b270SMichael Große } 357*6225b270SMichael Große } else { 358*6225b270SMichael Große $metavalues[$oldid] = $newvalue; 359*6225b270SMichael Große if (!$this->saveIndex($key.'_w', '', $metavalues)) { 360*6225b270SMichael Große $this->unlock(); 361*6225b270SMichael Große return false; 362*6225b270SMichael Große } 363*6225b270SMichael Große } 364*6225b270SMichael Große } 365*6225b270SMichael Große 366*6225b270SMichael Große $this->unlock(); 367*6225b270SMichael Große return true; 368*6225b270SMichael Große } 369*6225b270SMichael Große 370*6225b270SMichael Große /** 371*6225b270SMichael Große * Remove a page from the index 
372*6225b270SMichael Große * 373*6225b270SMichael Große * Erases entries in all known indexes. 374*6225b270SMichael Große * 375*6225b270SMichael Große * @param string $page a page name 376*6225b270SMichael Große * @return string|boolean the function completed successfully 377*6225b270SMichael Große * 378*6225b270SMichael Große * @author Tom N Harris <tnharris@whoopdedo.org> 379*6225b270SMichael Große */ 380d868eb89SAndreas Gohr public function deletePage($page) 381d868eb89SAndreas Gohr { 382*6225b270SMichael Große if (!$this->lock()) 383*6225b270SMichael Große return "locked"; 384*6225b270SMichael Große 385*6225b270SMichael Große $result = $this->deletePageNoLock($page); 386*6225b270SMichael Große 387*6225b270SMichael Große $this->unlock(); 388*6225b270SMichael Große 389*6225b270SMichael Große return $result; 390*6225b270SMichael Große } 391*6225b270SMichael Große 392*6225b270SMichael Große /** 393*6225b270SMichael Große * Remove a page from the index without locking the index, only use this function if the index is already locked 394*6225b270SMichael Große * 395*6225b270SMichael Große * Erases entries in all known indexes. 
396*6225b270SMichael Große * 397*6225b270SMichael Große * @param string $page a page name 398*6225b270SMichael Große * @return boolean the function completed successfully 399*6225b270SMichael Große * 400*6225b270SMichael Große * @author Tom N Harris <tnharris@whoopdedo.org> 401*6225b270SMichael Große */ 402d868eb89SAndreas Gohr protected function deletePageNoLock($page) 403d868eb89SAndreas Gohr { 404*6225b270SMichael Große // load known documents 405*6225b270SMichael Große $pid = $this->getPIDNoLock($page); 406*6225b270SMichael Große if ($pid === false) { 407*6225b270SMichael Große return false; 408*6225b270SMichael Große } 409*6225b270SMichael Große 410*6225b270SMichael Große // Remove obsolete index entries 411*6225b270SMichael Große $pageword_idx = $this->getIndexKey('pageword', '', $pid); 412*6225b270SMichael Große if ($pageword_idx !== '') { 413*6225b270SMichael Große $delwords = explode(':', $pageword_idx); 41424870174SAndreas Gohr $upwords = []; 415*6225b270SMichael Große foreach ($delwords as $word) { 416*6225b270SMichael Große if ($word != '') { 41724870174SAndreas Gohr [$wlen, $wid] = explode('*', $word); 418*6225b270SMichael Große $wid = (int)$wid; 419*6225b270SMichael Große $upwords[$wlen][] = $wid; 420*6225b270SMichael Große } 421*6225b270SMichael Große } 422*6225b270SMichael Große foreach ($upwords as $wlen => $widx) { 423*6225b270SMichael Große $index = $this->getIndex('i', $wlen); 424*6225b270SMichael Große foreach ($widx as $wid) { 425*6225b270SMichael Große $index[$wid] = $this->updateTuple($index[$wid], $pid, 0); 426*6225b270SMichael Große } 427*6225b270SMichael Große $this->saveIndex('i', $wlen, $index); 428*6225b270SMichael Große } 429*6225b270SMichael Große } 430*6225b270SMichael Große // Save the reverse index 431*6225b270SMichael Große if (!$this->saveIndexKey('pageword', '', $pid, "")) { 432*6225b270SMichael Große return false; 433*6225b270SMichael Große } 434*6225b270SMichael Große 435*6225b270SMichael Große $this->saveIndexKey('title', 
'', $pid, ""); 436*6225b270SMichael Große $keyidx = $this->getIndex('metadata', ''); 437*6225b270SMichael Große foreach ($keyidx as $metaname) { 438*6225b270SMichael Große $val_idx = explode(':', $this->getIndexKey($metaname.'_p', '', $pid)); 439*6225b270SMichael Große $meta_idx = $this->getIndex($metaname.'_i', ''); 440*6225b270SMichael Große foreach ($val_idx as $id) { 441*6225b270SMichael Große if ($id === '') continue; 442*6225b270SMichael Große $meta_idx[$id] = $this->updateTuple($meta_idx[$id], $pid, 0); 443*6225b270SMichael Große } 444*6225b270SMichael Große $this->saveIndex($metaname.'_i', '', $meta_idx); 445*6225b270SMichael Große $this->saveIndexKey($metaname.'_p', '', $pid, ''); 446*6225b270SMichael Große } 447*6225b270SMichael Große 448*6225b270SMichael Große return true; 449*6225b270SMichael Große } 450*6225b270SMichael Große 451*6225b270SMichael Große /** 452*6225b270SMichael Große * Clear the whole index 453*6225b270SMichael Große * 454*6225b270SMichael Große * @return bool If the index has been cleared successfully 455*6225b270SMichael Große */ 456d868eb89SAndreas Gohr public function clear() 457d868eb89SAndreas Gohr { 458*6225b270SMichael Große global $conf; 459*6225b270SMichael Große 460*6225b270SMichael Große if (!$this->lock()) return false; 461*6225b270SMichael Große 462*6225b270SMichael Große @unlink($conf['indexdir'].'/page.idx'); 463*6225b270SMichael Große @unlink($conf['indexdir'].'/title.idx'); 464*6225b270SMichael Große @unlink($conf['indexdir'].'/pageword.idx'); 465*6225b270SMichael Große @unlink($conf['indexdir'].'/metadata.idx'); 466*6225b270SMichael Große $dir = @opendir($conf['indexdir']); 467*6225b270SMichael Große if($dir!==false){ 468*6225b270SMichael Große while(($f = readdir($dir)) !== false){ 469*6225b270SMichael Große if(substr($f, -4)=='.idx' && 470*6225b270SMichael Große (substr($f, 0, 1)=='i' || substr($f, 0, 1)=='w' 471*6225b270SMichael Große || substr($f, -6)=='_w.idx' || substr($f, -6)=='_i.idx' || substr($f, 
-6)=='_p.idx')) 472*6225b270SMichael Große @unlink($conf['indexdir']."/$f"); 473*6225b270SMichael Große } 474*6225b270SMichael Große } 475*6225b270SMichael Große @unlink($conf['indexdir'].'/lengths.idx'); 476*6225b270SMichael Große 477*6225b270SMichael Große // clear the pid cache 47824870174SAndreas Gohr $this->pidCache = []; 479*6225b270SMichael Große 480*6225b270SMichael Große $this->unlock(); 481*6225b270SMichael Große return true; 482*6225b270SMichael Große } 483*6225b270SMichael Große 484*6225b270SMichael Große /** 485*6225b270SMichael Große * Split the text into words for fulltext search 486*6225b270SMichael Große * 487*6225b270SMichael Große * TODO: does this also need &$stopwords ? 488*6225b270SMichael Große * 489*6225b270SMichael Große * @triggers INDEXER_TEXT_PREPARE 490*6225b270SMichael Große * This event allows plugins to modify the text before it gets tokenized. 491*6225b270SMichael Große * Plugins intercepting this event should also intercept INDEX_VERSION_GET 492*6225b270SMichael Große * 493*6225b270SMichael Große * @param string $text plain text 494*6225b270SMichael Große * @param boolean $wc are wildcards allowed? 495*6225b270SMichael Große * @return array list of words in the text 496*6225b270SMichael Große * 497*6225b270SMichael Große * @author Tom N Harris <tnharris@whoopdedo.org> 498*6225b270SMichael Große * @author Andreas Gohr <andi@splitbrain.org> 499*6225b270SMichael Große */ 500d868eb89SAndreas Gohr public function tokenizer($text, $wc = false) 501d868eb89SAndreas Gohr { 502*6225b270SMichael Große $wc = ($wc) ? 
'' : '\*'; 503*6225b270SMichael Große $stopwords =& idx_get_stopwords(); 504*6225b270SMichael Große 505*6225b270SMichael Große // prepare the text to be tokenized 506*6225b270SMichael Große $evt = new Event('INDEXER_TEXT_PREPARE', $text); 507*6225b270SMichael Große if ($evt->advise_before(true)) { 508*6225b270SMichael Große if (preg_match('/[^0-9A-Za-z ]/u', $text)) { 50924870174SAndreas Gohr $text = Asian::separateAsianWords($text); 510*6225b270SMichael Große } 511*6225b270SMichael Große } 512*6225b270SMichael Große $evt->advise_after(); 513*6225b270SMichael Große unset($evt); 514*6225b270SMichael Große 515*6225b270SMichael Große $text = strtr($text, 51624870174SAndreas Gohr ["\r" => ' ', "\n" => ' ', "\t" => ' ', "\xC2\xAD" => ''] 517*6225b270SMichael Große ); 518*6225b270SMichael Große if (preg_match('/[^0-9A-Za-z ]/u', $text)) 51924870174SAndreas Gohr $text = Clean::stripspecials($text, ' ', '\._\-:'.$wc); 520*6225b270SMichael Große 521*6225b270SMichael Große $wordlist = explode(' ', $text); 522*6225b270SMichael Große foreach ($wordlist as $i => $word) { 523*6225b270SMichael Große $wordlist[$i] = (preg_match('/[^0-9A-Za-z]/u', $word)) ? 
52424870174SAndreas Gohr PhpString::strtolower($word) : strtolower($word); 525*6225b270SMichael Große } 526*6225b270SMichael Große 527*6225b270SMichael Große foreach ($wordlist as $i => $word) { 528*6225b270SMichael Große if ((!is_numeric($word) && strlen($word) < IDX_MINWORDLENGTH) 52924870174SAndreas Gohr || in_array($word, $stopwords, true)) 530*6225b270SMichael Große unset($wordlist[$i]); 531*6225b270SMichael Große } 532*6225b270SMichael Große return array_values($wordlist); 533*6225b270SMichael Große } 534*6225b270SMichael Große 535*6225b270SMichael Große /** 536*6225b270SMichael Große * Get the numeric PID of a page 537*6225b270SMichael Große * 538*6225b270SMichael Große * @param string $page The page to get the PID for 539*6225b270SMichael Große * @return bool|int The page id on success, false on error 540*6225b270SMichael Große */ 541d868eb89SAndreas Gohr public function getPID($page) 542d868eb89SAndreas Gohr { 543*6225b270SMichael Große // return PID without locking when it is in the cache 544*6225b270SMichael Große if (isset($this->pidCache[$page])) return $this->pidCache[$page]; 545*6225b270SMichael Große 546*6225b270SMichael Große if (!$this->lock()) 547*6225b270SMichael Große return false; 548*6225b270SMichael Große 549*6225b270SMichael Große // load known documents 550*6225b270SMichael Große $pid = $this->getPIDNoLock($page); 551*6225b270SMichael Große if ($pid === false) { 552*6225b270SMichael Große $this->unlock(); 553*6225b270SMichael Große return false; 554*6225b270SMichael Große } 555*6225b270SMichael Große 556*6225b270SMichael Große $this->unlock(); 557*6225b270SMichael Große return $pid; 558*6225b270SMichael Große } 559*6225b270SMichael Große 560*6225b270SMichael Große /** 561*6225b270SMichael Große * Get the numeric PID of a page without locking the index. 562*6225b270SMichael Große * Only use this function when the index is already locked. 
*
     * @param string $page The page to get the PID for
     * @return bool|int The page id on success, false on error
     */
    protected function getPIDNoLock($page)
    {
        // avoid expensive addIndexKey operation for the most recently requested pages by using a cache
        if (isset($this->pidCache[$page])) {
            return $this->pidCache[$page];
        }

        $pid = $this->addIndexKey('page', '', $page);
        // never cache a failure: a cached false would be handed out again on
        // every later call for this page instead of retrying the index write
        if ($pid === false) {
            return false;
        }

        // keep the cache small by discarding the oldest element, as in DokuWiki usually only
        // the most recently added item will be requested again.
        // unset() is used instead of array_shift() because array_shift() renumbers
        // integer-like keys (page names such as "2024"), which would make the
        // cache return the PID of a different page.
        if (count($this->pidCache) > 10) {
            unset($this->pidCache[array_key_first($this->pidCache)]);
        }
        $this->pidCache[$page] = $pid;
        return $pid;
    }

    /**
     * Get the page id of a numeric PID
     *
     * @param int $pid The PID to get the page id for
     * @return string The page id
     */
    public function getPageFromPID($pid)
    {
        return $this->getIndexKey('page', '', $pid);
    }

    /**
     * Find pages in the fulltext index containing the words,
     *
     * The search words must be pre-tokenized, meaning only letters and
     * numbers with an optional wildcard
     *
     * The returned array will have the original tokens as key. The values
     * in the returned list is an array with the page names as keys and the
     * number of times that token appears on the page as value.
     *
     * @param array $tokens list of words to search for
     * @return array list of page names with usage counts
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function lookup(&$tokens)
    {
        $result = [];
        $wids = $this->getIndexWords($tokens, $result);
        if (empty($wids)) {
            return [];
        }

        // load known words and documents
        $page_idx = $this->getIndex('page', '');
        $docs = [];
        foreach (array_keys($wids) as $wlen) {
            $wids[$wlen] = array_unique($wids[$wlen]);
            $index = $this->getIndex('i', $wlen);
            foreach ($wids[$wlen] as $ixid) {
                if ($ixid < count($index)) {
                    $docs["$wlen*$ixid"] = $this->parseTuples($page_idx, $index[$ixid]);
                }
            }
        }

        // merge found pages into final result array
        $final = [];
        foreach ($result as $word => $res) {
            $final[$word] = [];
            foreach ($res as $wid) {
                // handle the case when ($ixid < count($index)) has been false
                // and thus $docs[$wid] hasn't been set.
                if (!isset($docs[$wid])) {
                    continue;
                }
                foreach ($docs[$wid] as $hitkey => $hitcnt) {
                    // make sure the document still exists
                    if (!page_exists($hitkey, '', false)) {
                        continue;
                    }
                    if (!isset($final[$word][$hitkey])) {
                        $final[$word][$hitkey] = $hitcnt;
                    } else {
                        $final[$word][$hitkey] += $hitcnt;
                    }
                }
            }
        }
        return $final;
    }

    /**
     * Find pages containing a metadata key.
     *
     * The metadata values are compared as case-sensitive strings. Pass a
     * callback function that returns true or false to use a different
     * comparison function. The function will be called with the $value being
     * searched for as the first argument, and the word in the index as the
     * second argument. The function preg_match can be used directly if the
     * values are regexes.
     *
     * Without a callback, a leading and/or trailing '*' in a value is treated
     * as a wildcard.
     *
     * @param string $key name of the metadata key to look for
     * @param string $value search term to look for, must be a string or array of strings
     * @param callback $func comparison function
     * @return array lists with page names, keys are query values if $value is array
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Michael Hamann <michael@content-space.de>
     */
    public function lookupKey($key, &$value, $func = null)
    {
        if (!is_array($value)) {
            $value_array = [$value];
        } else {
            $value_array =& $value;
        }

        // the matching ids for the provided value(s)
        $value_ids = [];

        $metaname = idx_cleanName($key);

        // get all words in order to search the matching ids
        if ($key == 'title') {
            $words = $this->getIndex('title', '');
        } else {
            $words = $this->getIndex($metaname . '_w', '');
        }

        if (!is_null($func)) {
            foreach ($value_array as $val) {
                foreach ($words as $i => $word) {
                    if (call_user_func_array($func, [$val, $word])) {
                        $value_ids[$i][] = $val;
                    }
                }
            }
        } else {
            foreach ($value_array as $val) {
                $xval = $val;
                $caret = '^';
                $dollar = '$';
                // check for wildcards
                if (substr($xval, 0, 1) == '*') {
                    $xval = substr($xval, 1);
                    $caret = '';
                }
                if (substr($xval, -1, 1) == '*') {
                    $xval = substr($xval, 0, -1);
                    $dollar = '';
                }
                if (!$caret || !$dollar) {
                    $re = $caret . preg_quote($xval, '/') . $dollar;
                    foreach (array_keys(preg_grep('/' . $re . '/', $words)) as $i) {
                        $value_ids[$i][] = $val;
                    }
                } elseif (($i = array_search($val, $words, true)) !== false) {
                    $value_ids[$i][] = $val;
                }
            }
        }

        unset($words); // free the used memory

        // initialize the result so it won't be null
        $result = [];
        foreach ($value_array as $val) {
            $result[$val] = [];
        }

        $page_idx = $this->getIndex('page', '');

        // Special handling for titles
        if ($key == 'title') {
            // the title index is keyed by page id, so a matching line number is a PID
            foreach ($value_ids as $pid => $val_list) {
                // skip title lines that have no counterpart in the page index
                if (!isset($page_idx[$pid])) {
                    continue;
                }
                $page = $page_idx[$pid];
                foreach ($val_list as $val) {
                    $result[$val][] = $page;
                }
            }
        } else {
            // load all lines and pages so the used lines can be taken and matched with the pages
            $lines = $this->getIndex($metaname . '_i', '');

            foreach ($value_ids as $value_id => $val_list) {
                // parse the tuples of the form page_id*1:page2_id*1 and so on, return value
                // is an array with page_id => 1, page2_id => 1 etc. so take the keys only.
                // A missing line (the _w and _i files are out of sync) is treated as empty.
                $pages = array_keys($this->parseTuples($page_idx, $lines[$value_id] ?? ''));
                foreach ($val_list as $val) {
                    $result[$val] = [...$result[$val], ...$pages];
                }
            }
        }
        if (!is_array($value)) {
            $result = $result[$value];
        }
        return $result;
    }

    /**
     * Find the index ID of each search term.
     *
     * The query terms should only contain valid characters, with a '*' at
     * either the beginning or end of the word (or both).
     * The $result parameter can be used to merge the index locations with
     * the appropriate query term.
     *
     * @param array $words The query terms.
     * @param array $result Set to word => array("length*id" ...)
     * @return array Set to length => array(id ...)
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function getIndexWords(&$words, &$result)
    {
        $tokens = [];
        $tokenlength = [];
        $tokenwild = [];
        foreach ($words as $word) {
            $result[$word] = [];
            $caret = '^';
            $dollar = '$';
            $xword = $word;
            $wlen = wordlen($word);

            // check for wildcards
            if (substr($xword, 0, 1) == '*') {
                $xword = substr($xword, 1);
                $caret = '';
                --$wlen;
            }
            if (substr($xword, -1, 1) == '*') {
                $xword = substr($xword, 0, -1);
                $dollar = '';
                --$wlen;
            }
            // exact terms that are too short are ignored unless they are numeric
            if ($wlen < IDX_MINWORDLENGTH && $caret && $dollar && !is_numeric($xword)) {
                continue;
            }
            if (!isset($tokens[$xword])) {
                $tokenlength[$wlen][] = $xword;
            }
            if (!$caret || !$dollar) {
                $re = $caret . preg_quote($xword, '/') . $dollar;
                $tokens[$xword][] = [$word, '/' . $re . '/'];
                if (!isset($tokenwild[$xword])) {
                    $tokenwild[$xword] = $wlen;
                }
            } else {
                $tokens[$xword][] = [$word, null];
            }
        }
        asort($tokenwild);
        // $tokens = array( base word => array( [ query term , regexp ] ... ) ... )
        // $tokenlength = array( base word length => base word ... )
        // $tokenwild = array( base word => base word length ... )
        $length_filter = $tokenwild === [] ? $tokenlength : min(array_keys($tokenlength));
        $indexes_known = $this->indexLengths($length_filter);
        if ($tokenwild !== []) {
            sort($indexes_known);
        }

        // get word IDs
        $wids = [];
        foreach ($indexes_known as $ixlen) {
            $word_idx = $this->getIndex('w', $ixlen);
            // handle exact search
            if (isset($tokenlength[$ixlen])) {
                foreach ($tokenlength[$ixlen] as $xword) {
                    $wid = array_search($xword, $word_idx, true);
                    if ($wid !== false) {
                        $wids[$ixlen][] = $wid;
                        foreach ($tokens[$xword] as $w) {
                            $result[$w[0]][] = "$ixlen*$wid";
                        }
                    }
                }
            }
            // handle wildcard search
            // $tokenwild is sorted by length, so the loop can stop at the first
            // base word that is not shorter than the words of this index
            foreach ($tokenwild as $xword => $wlen) {
                if ($wlen >= $ixlen) {
                    break;
                }
                foreach ($tokens[$xword] as $w) {
                    if (is_null($w[1])) {
                        continue;
                    }
                    foreach (array_keys(preg_grep($w[1], $word_idx)) as $wid) {
                        $wids[$ixlen][] = $wid;
                        $result[$w[0]][] = "$ixlen*$wid";
                    }
                }
            }
        }
        return $wids;
    }

    /**
     * Return a list of all pages
     * Warning: pages may not exist!
     *
     * @param string $key list only pages containing the metadata key (optional)
     * @return array list of page names
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function getPages($key = null)
    {
        $page_idx = $this->getIndex('page', '');
        if (is_null($key)) {
            return $page_idx;
        }

        $metaname = idx_cleanName($key);

        // Special handling for titles
        if ($key == 'title') {
            $title_idx = $this->getIndex('title', '');
            // drop pages beyond the end of the title index, then those with an empty title
            array_splice($page_idx, count($title_idx));
            foreach ($title_idx as $i => $title) {
                if ($title === "") {
                    unset($page_idx[$i]);
                }
            }
            return array_values($page_idx);
        }

        $pages = [];
        $lines = $this->getIndex($metaname . '_i', '');
        foreach ($lines as $line) {
            // only the keys (page names) matter here. The union operator is used
            // instead of array_merge() because array_merge() renumbers integer-like
            // keys, which would corrupt numeric page names, and copies the whole
            // accumulated array on every iteration.
            $pages += $this->parseTuples($page_idx, $line);
        }
        // PHP turns integer-like array keys into ints; hand page names back as strings
        return array_map('strval', array_keys($pages));
    }

    /**
     * Return a list of words sorted by number of times used
     *
     * @param int $min bottom frequency threshold
     * @param int $max upper frequency limit. No limit if $max<$min
     * @param int $minlen minimum length of words to count
     * @param string $key metadata key to list. Uses the fulltext index if not given
     * @return array list of words as the keys and frequency as values
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function histogram($min = 1, $max = 0, $minlen = 3, $key = null)
    {
        if ($min < 1) {
            $min = 1;
        }
        if ($max < $min) {
            $max = 0;
        }

        $result = [];

        if ($key == 'title') {
            $index = $this->getIndex('title', '');
            $index = array_count_values($index);
            foreach ($index as $val => $cnt) {
                if ($cnt >= $min && (!$max || $cnt <= $max) && strlen((string)$val) >= $minlen) {
                    $result[$val] = $cnt;
                }
            }
        } elseif (!is_null($key)) {
            $metaname = idx_cleanName($key);
            $index = $this->getIndex($metaname . '_i', '');
            $val_idx = [];
            foreach ($index as $wid => $line) {
                $freq = $this->countTuples($line);
                if ($freq >= $min && (!$max || $freq <= $max)) {
                    $val_idx[$wid] = $freq;
                }
            }
            if (!empty($val_idx)) {
                $words = $this->getIndex($metaname . '_w', '');
                foreach ($val_idx as $wid => $freq) {
                    // skip ids without a word (the _i and _w files are out of sync)
                    if (!isset($words[$wid])) {
                        continue;
                    }
                    if (strlen($words[$wid]) >= $minlen) {
                        $result[$words[$wid]] = $freq;
                    }
                }
            }
        } else {
            $lengths = idx_listIndexLengths();
            foreach ($lengths as $length) {
                if ($length < $minlen) {
                    continue;
                }
                $index = $this->getIndex('i', $length);
                // the word list is only loaded once a word of this length qualifies
                $words = null;
                foreach ($index as $wid => $line) {
                    $freq = $this->countTuples($line);
                    if ($freq >= $min && (!$max || $freq <= $max)) {
                        if ($words === null) {
                            $words = $this->getIndex('w', $length);
                        }
                        // skip ids without a word (the i and w files are out of sync)
                        if (!isset($words[$wid])) {
                            continue;
                        }
                        $result[$words[$wid]] = $freq;
                    }
                }
            }
        }

        arsort($result);
        return $result;
    }

    /**
     * Lock the indexer.
     *
     * The lock is a directory in the configured lock directory. A lock
     * directory older than five minutes is considered stale and removed.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @return bool|string true when the lock was acquired, "stale lock removed" when it
     *                     was acquired after removing a stale lock, false on failure
     */
    protected function lock()
    {
        global $conf;
        $status = true;
        $run = 0;
        $lock = $conf['lockdir'] . '/_indexer.lock';
        while (!@mkdir($lock)) {
            usleep(50);
            if (is_dir($lock) && time() - @filemtime($lock) > 60 * 5) {
                // looks like a stale lock - remove it
                if (!@rmdir($lock)) {
                    // removing the stale lock failed
                    return false;
                }
                $status = "stale lock removed";
            } elseif ($run++ == 1000) {
                // give up after 1000 retries; with the 50 microsecond sleep above this is
                // about 50ms of sleeping plus the time the filesystem calls take
                return false;
            }
        }
        if ($conf['dperm']) {
            chmod($lock, $conf['dperm']);
        }
        return $status;
    }

    /**
     * Release the indexer lock.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @return bool always true, a failing removal is not reported
     */
    protected function unlock()
    {
        global $conf;
        @rmdir($conf['lockdir'] . '/_indexer.lock');
        return true;
    }

    /**
     * Retrieve the entire index.
     *
     * The $suffix argument is for an index that is split into
     * multiple parts. Different index files should use different
     * base names.
     *
     * @param string $idx name of the index
     * @param string $suffix subpart identifier
     * @return array list of lines without CR or LF, empty if the index is missing or unreadable
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function getIndex($idx, $suffix)
    {
        global $conf;
        $fn = $conf['indexdir'] . '/' . $idx . $suffix . '.idx';
        if (!file_exists($fn)) {
            return [];
        }
        // file() returns false when the file cannot be read; callers iterate
        // and count() the result, so always hand back an array
        $lines = file($fn, FILE_IGNORE_NEW_LINES);
        return $lines === false ? [] : $lines;
    }

    /**
     * Replace the contents of the index with an array.
     *
     * The data is written to a temporary file first which is then renamed
     * over the index file.
     *
     * @param string $idx name of the index
     * @param string $suffix subpart identifier
     * @param array $lines list of lines without LF
     * @return bool If saving succeeded
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function saveIndex($idx, $suffix, &$lines)
    {
        global $conf;
        $fn = $conf['indexdir'] . '/' . $idx . $suffix;
        $fh = @fopen($fn . '.tmp', 'w');
        if (!$fh) {
            return false;
        }
        fwrite($fh, implode("\n", $lines));
        // terminate the last line, but keep an empty index an empty file
        if (!empty($lines)) {
            fwrite($fh, "\n");
        }
        fclose($fh);
        if ($conf['fperm']) {
            chmod($fn . '.tmp', $conf['fperm']);
        }
        // NOTE(review): the result of io_rename() is not checked here - confirm whether
        // a failed rename should make this method return false
        io_rename($fn . '.tmp', $fn . '.idx');
        return true;
    }

    /**
     * Retrieve a line from the index.
     *
     * @param string $idx name of the index
     * @param string $suffix subpart identifier
     * @param int $id the line number
     * @return string a line with trailing whitespace removed, empty string if
     *                the index or the line does not exist
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function getIndexKey($idx, $suffix, $id)
    {
        global $conf;
        $fn = $conf['indexdir'] . '/' . $idx . $suffix . '.idx';
        if (!file_exists($fn)) {
            return '';
        }
        $fh = @fopen($fn, 'r');
        if (!$fh) {
            return '';
        }
        $ln = -1;
        while (($line = fgets($fh)) !== false) {
            if (++$ln == $id) {
                break;
            }
        }
        fclose($fh);
        // $line is false when the file ended before line $id; the cast turns that into ''
        return rtrim((string)$line);
    }

    /**
     * Write a line into the index.
     *
     * The index is copied line by line into a temporary file with line $id
     * replaced. When the index has fewer lines than $id, empty lines are added
     * up to that position.
     *
     * @param string $idx name of the index
     * @param string $suffix subpart identifier
     * @param int $id the line number
     * @param string $line line to write
     * @return bool If saving succeeded
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function saveIndexKey($idx, $suffix, $id, $line)
    {
        global $conf;
        if (substr($line, -1) != "\n") {
            $line .= "\n";
        }
        $fn = $conf['indexdir'] . '/' . $idx . $suffix;
        $fh = @fopen($fn . '.tmp', 'w');
        if (!$fh) {
            return false;
        }
        $ih = @fopen($fn . '.idx', 'r');
        if ($ih) {
            $ln = -1;
            while (($curline = fgets($ih)) !== false) {
                fwrite($fh, (++$ln == $id) ? $line : $curline);
            }
            // the index was shorter than $id: pad with empty lines, then append
            if ($id > $ln) {
                while ($id > ++$ln) {
                    fwrite($fh, "\n");
                }
                fwrite($fh, $line);
            }
            fclose($ih);
        } else {
            // no index yet: pad with empty lines up to $id, then write the line
            $ln = -1;
            while ($id > ++$ln) {
                fwrite($fh, "\n");
            }
            fwrite($fh, $line);
        }
        fclose($fh);
        if ($conf['fperm']) {
            chmod($fn . '.tmp', $conf['fperm']);
        }
        io_rename($fn . '.tmp', $fn . '.idx');
        return true;
    }

    /**
     * Retrieve or insert a value in the index.
     *
     * @param string $idx name of the index
     * @param string $suffix subpart identifier
     * @param string $value line to find in the index
     * @return int|bool line number of the value in the index or false if writing the index failed
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    protected function addIndexKey($idx, $suffix, $value)
    {
        $index = $this->getIndex($idx, $suffix);
        $id = array_search($value, $index, true);
        if ($id === false) {
            // unknown value: append it, its line number becomes the id
            $id = count($index);
            $index[$id] = $value;
            if (!$this->saveIndex($idx, $suffix, $index)) {
                trigger_error("Failed to write $idx index", E_USER_ERROR);
                return false;
            }
        }
        return $id;
    }

    /**
     * Get the list of lengths indexed in the wiki.
     *
     * Read the index directory or a cache file and returns
     * a sorted array of lengths of the words used in the wiki.
     *
     * @author YoBoY <yoboy.leguesh@gmail.com>
     *
     * @return array
     */
    protected function listIndexLengths()
    {
        return idx_listIndexLengths();
    }

    /**
     * Get the word lengths that have been indexed.
     *
     * Reads the index directory and returns an array of lengths
     * that there are indices for.
     *
     * @author YoBoY <yoboy.leguesh@gmail.com>
     *
     * @param array|int $filter an array whose keys are the lengths to test for,
     *                          or a minimum length
     * @return array
     */
    protected function indexLengths($filter)
    {
        global $conf;
        $idx = [];
        if (is_array($filter)) {
            // testing if index files exist only
            $path = $conf['indexdir'] . "/i";
            foreach (array_keys($filter) as $key) {
                if (file_exists($path . $key . '.idx')) {
                    $idx[] = $key;
                }
            }
        } else {
            $lengths = idx_listIndexLengths();
            foreach ($lengths as $length) {
                // keep all the values equal or superior
                if ((int)$length >= (int)$filter) {
                    $idx[] = $length;
                }
            }
        }
        return $idx;
    }

    /**
     * Insert or replace a tuple in a line.
     *
     * Any existing tuple for $id is removed from the line. Unless $count is
     * empty, the new tuple is then put in front of the remaining ones.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param string $line
     * @param string|int $id
     * @param int $count
     * @return string
     */
    protected function updateTuple($line, $id, $count)
    {
        if ($line != '') {
            $line = preg_replace('/(^|:)' . preg_quote((string)$id, '/') . '\*\d*/', '', $line);
        }
        $line = trim($line, ':');
        if ($count) {
            if ($line) {
                return "$id*$count:" . $line;
            } else {
                return "$id*$count";
            }
        }
        return $line;
    }

    /**
     * Split a line into an array of tuples.
     *
     * The line has the form "id*count:id*count:...". Ids found in $keys are
     * replaced by the value stored there, other ids are kept as they are.
     * Tuples without a count are skipped.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     *
     * @param array $keys
     * @param string $line
     * @return array
     */
    protected function parseTuples(&$keys, $line)
    {
        $result = [];
        if ($line == '') {
            return $result;
        }
        $parts = explode(':', $line);
        foreach ($parts as $tuple) {
            if ($tuple === '') {
                continue;
            }
            // pad so that a malformed tuple without '*' is skipped below
            // instead of raising an undefined offset warning
            [$key, $cnt] = array_pad(explode('*', $tuple), 2, '');
            if (!$cnt) {
                continue;
            }
            if (isset($keys[$key])) {
                $key = $keys[$key];
                if ($key === false || is_null($key)) {
                    continue;
                }
            }
            $result[$key] = $cnt;
        }
        return $result;
    }

    /**
     * Sum the counts in a list of tuples.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param string $line
     * @return int
     */
    protected function countTuples($line)
    {
        $freq = 0;
        $parts = explode(':', $line);
        foreach ($parts as $tuple) {
            if ($tuple === '') {
                continue;
            }
            // pad so that a malformed tuple without '*' counts as zero
            // instead of raising an undefined offset warning
            [, $cnt] = array_pad(explode('*', $tuple), 2, '');
            $freq += (int)$cnt;
        }
        return $freq;
    }
}