<?php
namespace dokuwiki\Search;

use dokuwiki\Extension\Event;


/**
 * Class DokuWiki Fulltext Search
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */
class FulltextSearch
{
    /**
     * Fulltext Search constructor. prevent direct object creation
     */
    protected function __construct() {}

    /**
     * The fulltext search
     *
     * Returns a list of matching documents for the given query. The search
     * itself is performed by callback_pageSearch(), which is registered as the
     * default action of the SEARCH_QUERY_FULLPAGE event.
     *
     * @triggers SEARCH_QUERY_FULLPAGE
     *
     * @param string     $query
     * @param array      $highlight  passed by reference into the event data; the default
     *                               action stores the terms to highlight in it
     * @param string     $sort       'mtime' sorts by modification time, any other value
     *                               sorts by hit count; null is treated as 'hits'
     * @param int|string $after      only show results with mtime after this date,
     *                               accepts timestamp or strtotime arguments
     * @param int|string $before     only show results with mtime before this date,
     *                               accepts timestamp or strtotime arguments
     *
     * @return array
     */
    public static function pageSearch($query, &$highlight, $sort = null, $after = null, $before = null)
    {
        if ($sort === null) {
            $sort = 'hits';
        }
        $data = [
            'query' => $query,
            'sort' => $sort,
            'after' => $after,
            'before' => $before
        ];
        // bind by reference so that changes made by event handlers reach the caller
        $data['highlight'] =& $highlight;
        $action = static::class.'::callback_pageSearch';
        return Event::createAndTrigger('SEARCH_QUERY_FULLPAGE', $data, $action);
    }

    /**
     * Returns a list of matching documents for the given query
     *
     * Evaluates the parsed query as a postfix expression on a stack of page
     * sets and then removes hidden, unreadable and nonexistent pages.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
     *
     * @param array $data  event data; 'query', 'sort', 'after' and 'before' are read,
     *                     'highlight' is overwritten with the parsed highlight terms
     * @return array matching documents, keyed by page id
     */
    public static function callback_pageSearch(&$data)
    {
        $Indexer = idx_get_indexer();

        // parse the given query
        $q = static::queryParser($Indexer, $data['query']);
        $data['highlight'] = $q['highlight'];

        if (empty($q['parsed_ary'])) return array();

        // lookup all words found in the query
        $lookup = $Indexer->lookup($q['words']);

        // get all pages in this dokuwiki site (!: includes nonexistent pages)
        $pages_all = array();
        foreach ($Indexer->getPages() as $id) {
            $pages_all[$id] = 0; // base: 0 hit
        }

        // process the query
        $stack = array();
        foreach ($q['parsed_ary'] as $token) {
            switch (substr($token, 0, 3)) {
                case 'W+:':
                case 'W-:':
                case 'W_:': // word
                    $word    = substr($token, 3);
                    // a word without a lookup entry simply matches no page
                    $stack[] = isset($lookup[$word]) ? (array) $lookup[$word] : array();
                    break;
                case 'P+:':
                case 'P-:': // phrase
                    $phrase = substr($token, 3);
                    // since phrases are always parsed as ((W1)(W2)...(P)),
                    // the end($stack) always points the pages that contain
                    // all words in this phrase
                    $pages  = end($stack);
                    $pages_matched = array();
                    foreach (array_keys($pages) as $id) {
                        $evdata = array(
                            'id' => $id,
                            'phrase' => $phrase,
                            'text' => rawWiki($id)
                        );
                        $evt = new Event('FULLTEXT_PHRASE_MATCH', $evdata);
                        // default action: case-insensitive substring match, unless a
                        // handler already reported a match
                        if ($evt->advise_before() && $evt->result !== true) {
                            $text = \dokuwiki\Utf8\PhpString::strtolower($evdata['text']);
                            if (strpos($text, $phrase) !== false) {
                                $evt->result = true;
                            }
                        }
                        $evt->advise_after();
                        if ($evt->result === true) {
                            $pages_matched[$id] = 0; // phrase: always 0 hit
                        }
                    }
                    $stack[] = $pages_matched;
                    break;
                case 'N+:':
                case 'N-:': // namespace
                    $ns = cleanID(substr($token, 3)) . ':';
                    $pages_matched = array();
                    foreach (array_keys($pages_all) as $id) {
                        if (strpos($id, $ns) === 0) {
                            $pages_matched[$id] = 0; // namespace: always 0 hit
                        }
                    }
                    $stack[] = $pages_matched;
                    break;
                case 'AND': // and operation
                    list($pages1, $pages2) = array_splice($stack, -2);
                    $stack[] = static::resultCombine(array($pages1, $pages2));
                    break;
                case 'OR':  // or operation
                    list($pages1, $pages2) = array_splice($stack, -2);
                    $stack[] = static::resultUnite(array($pages1, $pages2));
                    break;
                case 'NOT': // not operation (unary)
                    $pages   = array_pop($stack);
                    $stack[] = static::resultComplement(array($pages_all, $pages));
                    break;
            }
        }
        $docs = array_pop($stack);

        if (empty($docs)) return array();

        // check: settings, acls, existence
        foreach (array_keys($docs) as $id) {
            if (isHiddenPage($id)
                || auth_quickaclcheck($id) < AUTH_READ
                || !page_exists($id, '', false)
            ) {
                unset($docs[$id]);
            }
        }

        $docs = static::filterResultsByTime($docs, $data['after'], $data['before']);

        if ($data['sort'] === 'mtime') {
            uksort($docs, static::class.'::pagemtimesorter');
        } else {
            // sort docs by count
            arsort($docs);
        }

        return $docs;
    }

    /**
     * Quicksearch for pagenames
     *
     * By default it only matches the pagename and ignores the
     * namespace. This can be changed with the second parameter.
     * The third parameter allows to search in titles as well.
     *
     * The function always returns titles as well
     *
     * @triggers SEARCH_QUERY_PAGELOOKUP
     * @author   Andreas Gohr <andi@splitbrain.org>
     * @author   Adrian Lang <lang@cosmocode.de>
     *
     * @param string     $id       page id
     * @param bool       $in_ns    match against namespace as well?
     * @param bool       $in_title search in title?
* @param int|string $after    only show results with mtime after this date,
     *                             accepts timestamp or strtotime arguments
     * @param int|string $before   only show results with mtime before this date,
     *                             accepts timestamp or strtotime arguments
     *
     * @return string[]
     */
    public static function pageLookup($id, $in_ns=false, $in_title=false, $after = null, $before = null)
    {
        $data = [
            'id' => $id,
            'in_ns' => $in_ns,
            'in_title' => $in_title,
            'after' => $after,
            'before' => $before
        ];
        $data['has_titles'] = true; // for plugin backward compatibility check
        $action = static::class.'::callback_pageLookup';
        return Event::createAndTrigger('SEARCH_QUERY_PAGELOOKUP', $data, $action);
    }

    /**
     * Returns list of pages as array(pageid => First Heading)
     *
     * Default action of the SEARCH_QUERY_PAGELOOKUP event. Matches the cleaned
     * search string against page names (optionally including the namespace)
     * and, when requested, against the indexed titles. Hidden, nonexistent and
     * unreadable pages are removed from the result.
     *
     * @param array &$data event data; reads 'id', 'in_ns', 'in_title', 'after' and 'before'
     * @return string[]
     */
    public static function callback_pageLookup(&$data)
    {
        // split out original parameters
        $id = $data['id'];
        $in_ns    = $data['in_ns'];
        $in_title = $data['in_title'];

        $Indexer = idx_get_indexer();

        // a namespace given in the query restricts the result to that namespace;
        // the remaining terms become the string to look for
        $ns = null;
        $parsedQuery = static::queryParser($Indexer, $id);
        if (count($parsedQuery['ns']) > 0) {
            $ns = cleanID($parsedQuery['ns'][0]) . ':';
            $id = implode(' ', $parsedQuery['highlight']);
        }

        $cleaned = cleanID($id);
        $page_idx = $Indexer->getPages();

        $pages = array();
        if ($id !== '' && $cleaned !== '') {
            foreach ($page_idx as $p_id) {
                if ((strpos($in_ns ? $p_id : noNSorNS($p_id), $cleaned) !== false)) {
                    if (!isset($pages[$p_id])) {
                        $pages[$p_id] = p_get_first_heading($p_id, METADATA_DONT_RENDER);
                    }
                }
            }
            if ($in_title) {
                $func = static::class.'::pageLookupTitleCompare';
                foreach ($Indexer->lookupKey('title', $id, $func) as $p_id) {
                    if (!isset($pages[$p_id])) {
                        $pages[$p_id] = p_get_first_heading($p_id, METADATA_DONT_RENDER);
                    }
                }
            }
        }

        // keep only pages below the requested namespace
        if ($ns !== null) {
            foreach (array_keys($pages) as $p_id) {
                if (strpos($p_id, $ns) !== 0) {
                    unset($pages[$p_id]);
                }
            }
        }

        // discard hidden pages
        // discard nonexistent pages
        // check ACL permissions
        foreach (array_keys($pages) as $idx) {
            if (!isVisiblePage($idx) || !page_exists($idx) || auth_quickaclcheck($idx) < AUTH_READ) {
                unset($pages[$idx]);
            }
        }

        $pages = static::filterResultsByTime($pages, $data['after'], $data['before']);

        uksort($pages, static::class.'::pagesorter');
        return $pages;
    }

    /**
     * Removes results whose page file was modified outside the given time range
     *
     * A limit that is empty (null, 0, '') is not applied.
     *
     * @param array      $results search results in the form pageid => value
     * @param int|string $after   only returns results with mtime after this date,
     *                            accepts timestamp or strtotime arguments
     * @param int|string $before  only returns results with mtime before this date,
     *                            accepts timestamp or strtotime arguments
     *
     * @return array
     */
    protected static function filterResultsByTime(array $results, $after, $before)
    {
        if (!$after && !$before) {
            return $results;
        }

        // normalize both limits to unix timestamps; an unset limit must not be
        // handed to strtotime() (passing null is deprecated as of PHP 8.1)
        if (!is_int($after)) {
            $after = ($after === null) ? false : strtotime($after);
        }
        if (!is_int($before)) {
            $before = ($before === null) ? false : strtotime($before);
        }

        foreach (array_keys($results) as $id) {
            $mTime = filemtime(wikiFN($id));
            if ($after && $after > $mTime) {
                unset($results[$id]);
                continue;
            }
            if ($before && $before < $mTime) {
                unset($results[$id]);
            }
        }

        return $results;
    }

    /**
     * Tiny helper function for comparing the searched title with the title
     * from the search index. This function is a wrapper around stripos with
     * adapted argument order and return value.
     *
     * @param string $search searched title
     * @param string $title  title from index
     * @return bool true when $title contains $search (case-insensitive)
     */
    public static function pageLookupTitleCompare($search, $title)
    {
        return stripos($title, $search) !== false;
    }

    /**
     * Sort pages based on their namespace level first, then on their string
     * values. This makes higher hierarchy pages rank higher than lower hierarchy
     * pages.
*
     * @param string $a
     * @param string $b
     * @return int Returns < 0 if $a is less than $b; > 0 if $a is greater than $b,
     *             and 0 if they are equal.
     */
    protected static function pagesorter($a, $b)
    {
        // number of colon separated parts = namespace depth of the page id
        $ac = count(explode(':',$a));
        $bc = count(explode(':',$b));
        if ($ac < $bc) {
            return -1;
        } elseif ($ac > $bc) {
            return 1;
        }
        // same depth: fall back to plain string comparison
        return strcmp ($a,$b);
    }

    /**
     * Sort pages by their mtime, from newest to oldest
     *
     * The mtime is read from the page file returned by wikiFN().
     *
     * @param string $a
     * @param string $b
     *
     * @return int Returns < 0 if $a is newer than $b, > 0 if $b is newer than $a
     *             and 0 if they are of the same age
     */
    protected static function pagemtimesorter($a, $b)
    {
        $mtimeA = filemtime(wikiFN($a));
        $mtimeB = filemtime(wikiFN($b));
        return $mtimeB - $mtimeA;
    }

    /**
     * Creates a snippet extract
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @triggers FULLTEXT_SNIPPET_CREATE
     *
     * @param string $id page id
* @param array $highlight
     * @return mixed the 'snippet' entry of the event data
     */
    public static function snippet($id, $highlight)
    {
        $text = rawWiki($id);
        $text = str_replace("\xC2\xAD",'',$text); // remove soft-hyphens
        $evdata = array(
            'id'        => $id,
            'text'      => &$text,
            'highlight' => &$highlight,
            'snippet'   => '',
        );

        $evt = new Event('FULLTEXT_SNIPPET_CREATE', $evdata);
        if ($evt->advise_before()) {
            $match = array();
            $snippets = array();
            $utf8_offset = $offset = $end = 0;
            $len = \dokuwiki\Utf8\PhpString::strlen($text);

            // build a regexp from the phrases to highlight
            $re1 = '(' .
                join(
                    '|',
                    array_map(
                        static::class.'::snippet_re_preprocess',
                        array_map(
                            'preg_quote_cb',
                            array_filter((array) $highlight)
                        )
                    )
                ) .
                ')';
            // two resp. three different terms close to each other
            $re2 = "$re1.{0,75}(?!\\1)$re1";
            $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1";

            for ($cnt=4; $cnt--;) {
                // prefer the richest match: three terms, then two, then a single one
                if (!preg_match('/'.$re3.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)
                    && !preg_match('/'.$re2.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)
                    && !preg_match('/'.$re1.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)
                ) {
                    break;
                }

                list($str, $idx) = $match[0];

                // convert $idx (a byte offset) into a utf8 character offset
                $utf8_idx = \dokuwiki\Utf8\PhpString::strlen(substr($text, 0, $idx));
                $utf8_len = \dokuwiki\Utf8\PhpString::strlen($str);

                // establish context, 100 characters surrounding the match string
                // first look to see if we can go 100 either side,
                // then drop to 50 adding any excess if the other side can't go to 50,
                $pre = min($utf8_idx - $utf8_offset, 100);
                $post = min($len - $utf8_idx - $utf8_len, 100);

                if ($pre > 50 && $post > 50) {
                    $pre = $post = 50;
                } elseif ($pre > 50) {
                    $pre = min($pre, 100 - $post);
                } elseif ($post > 50) {
                    $post = min($post, 100 - $pre);
                } elseif ($offset == 0) {
                    // both are less than 50, means the context is the whole string
                    // make it so and break out of this loop - there is no need for the
                    // complex snippet calculations
                    $snippets = array($text);
                    break;
                }

                // establish context start and end points, try to append to previous
                // context if possible
                $start = $utf8_idx - $pre;
                $append = ($start < $end) ? $end : false;  // still the end of the previous context snippet
                $end = $utf8_idx + $utf8_len + $post;      // now set it to the end of this context

                if ($append) {
                    $snippets[count($snippets)-1] .= \dokuwiki\Utf8\PhpString::substr($text,$append,$end-$append);
                } else {
                    $snippets[] = \dokuwiki\Utf8\PhpString::substr($text,$start,$end-$start);
                }

                // set $offset for next match attempt
                // continue matching after the current match
                // if the current match is not the longest possible match starting at the current offset
                // this prevents further matching of this snippet but for possible matches of length
                // smaller than match length + context (at least 50 characters) this match is part of the context
                $utf8_offset = $utf8_idx + $utf8_len;
                $offset = $idx + strlen(\dokuwiki\Utf8\PhpString::substr($text,$utf8_idx,$utf8_len));
                $offset = \dokuwiki\Utf8\Clean::correctIdx($text,$offset);
            }

            // mark the hits with a control character first, so that the markers
            // pass through hsc() untouched, then turn the markers into HTML
            $m = "\1";
            $snippets = preg_replace('/'.$re1.'/iu', $m.'$1'.$m, $snippets);
            $snippet = preg_replace(
                '/' . $m . '([^' . $m . ']*?)' . $m . '/iu',
                '<strong class="search_hit">$1</strong>',
                hsc(join('... ', $snippets))
            );

            $evdata['snippet'] = $snippet;
        }
        $evt->advise_after();
        unset($evt);

        return $evdata['snippet'];
    }

    /**
     * Wraps a search term in regex boundary checks.
     *
     * A leading or trailing escaped asterisk (as left by quoting a wildcard
     * term) is removed and no boundary check is added on that side.
     *
     * @param string $term
     * @return string
     */
    public static function snippet_re_preprocess($term)
    {
        // do not process asian terms where word boundaries are not explicit
        if (\dokuwiki\Utf8\Asian::isAsianWords($term)) return $term;

        if (UTF8_PROPERTYSUPPORT) {
            // unicode word boundaries
            // see http://stackoverflow.com/a/2449017/172068
            $BL = '(?<!\pL)';
            $BR = '(?!\pL)';
        } else {
            // not as correct as above, but at least won't break
            $BL = '\b';
            $BR = '\b';
        }

        if (substr($term, 0, 2) == '\\*') {
            $term = substr($term, 2);
        } else {
            $term = $BL.$term;
        }

        if (substr($term, -2, 2) == '\\*') {
            $term = substr($term, 0, -2);
        } else {
            $term = $term.$BR;
        }

        // nothing but boundary checks left: the term itself was empty
        if ($term == $BL || $term == $BR || $term == $BL.$BR) {
            $term = '';
        }
        return $term;
    }

    /**
     * Combine found documents and sum up their scores
     *
     * This function is used to combine searched words with a logical
     * AND. Only documents available in all arrays are returned.
     *
     * based upon PEAR's PHP_Compat function for array_intersect_key()
     *
     * @param array $args An array of page arrays
     * @return array
     */
    protected static function resultCombine($args)
    {
        $array_count = count($args);
        if ($array_count == 1) {
            return $args[0];
        }

        $result = array();
        if ($array_count > 1) {
            foreach ($args[0] as $key => $value) {
                $result[$key] = $value;
                for ($i = 1; $i !== $array_count; $i++) {
                    if (!isset($args[$i][$key])) {
                        // missing in one of the sets: not part of the intersection
                        unset($result[$key]);
                        break;
                    }
                    $result[$key] += $args[$i][$key];
                }
            }
        }
        return $result;
    }

    /**
     * Unites found documents and sum up their scores
     * based upon resultCombine() method
     *
     * @param array $args An array of page arrays
     * @return array
     *
     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
     */
    protected static function resultUnite($args)
    {
        $array_count = count($args);
        if ($array_count === 1) {
            return $args[0];
        }

        $result = $args[0];
        for ($i = 1; $i !== $array_count; $i++) {
            foreach (array_keys($args[$i]) as $id) {
                // a page found only in a later set starts from a score of 0
                $score = isset($result[$id]) ? $result[$id] : 0;
                $result[$id] = $score + $args[$i][$id];
            }
        }
        return $result;
    }

    /**
     * Computes the difference of documents using page id for comparison
     * nearly identical to PHP5's array_diff_key()
     *
     * @param array $args An array of page arrays
     * @return array
     *
     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
Sahara */ 580*173bfbcaSSatoshi Sahara protected static function resultComplement($args) 581*173bfbcaSSatoshi Sahara { 582*173bfbcaSSatoshi Sahara $array_count = count($args); 583*173bfbcaSSatoshi Sahara if ($array_count === 1) { 584*173bfbcaSSatoshi Sahara return $args[0]; 585*173bfbcaSSatoshi Sahara } 586*173bfbcaSSatoshi Sahara 587*173bfbcaSSatoshi Sahara $result = $args[0]; 588*173bfbcaSSatoshi Sahara foreach (array_keys($result) as $id) { 589*173bfbcaSSatoshi Sahara for ($i = 1; $i !== $array_count; $i++) { 590*173bfbcaSSatoshi Sahara if (isset($args[$i][$id])) unset($result[$id]); 591*173bfbcaSSatoshi Sahara } 592*173bfbcaSSatoshi Sahara } 593*173bfbcaSSatoshi Sahara return $result; 594*173bfbcaSSatoshi Sahara } 595*173bfbcaSSatoshi Sahara 596*173bfbcaSSatoshi Sahara /** 597*173bfbcaSSatoshi Sahara * Parses a search query and builds an array of search formulas 598*173bfbcaSSatoshi Sahara * 599*173bfbcaSSatoshi Sahara * @author Andreas Gohr <andi@splitbrain.org> 600*173bfbcaSSatoshi Sahara * @author Kazutaka Miyasaka <kazmiya@gmail.com> 601*173bfbcaSSatoshi Sahara * 602*173bfbcaSSatoshi Sahara * @param Doku_Indexer $Indexer 603*173bfbcaSSatoshi Sahara * @param string $query search query 604*173bfbcaSSatoshi Sahara * @return array of search formulas 605*173bfbcaSSatoshi Sahara */ 606*173bfbcaSSatoshi Sahara public static function queryParser($Indexer, $query) 607*173bfbcaSSatoshi Sahara { 608*173bfbcaSSatoshi Sahara /** 609*173bfbcaSSatoshi Sahara * parse a search query and transform it into intermediate representation 610*173bfbcaSSatoshi Sahara * 611*173bfbcaSSatoshi Sahara * in a search query, you can use the following expressions: 612*173bfbcaSSatoshi Sahara * 613*173bfbcaSSatoshi Sahara * words: 614*173bfbcaSSatoshi Sahara * include 615*173bfbcaSSatoshi Sahara * -exclude 616*173bfbcaSSatoshi Sahara * phrases: 617*173bfbcaSSatoshi Sahara * "phrase to be included" 618*173bfbcaSSatoshi Sahara * -"phrase you want to exclude" 619*173bfbcaSSatoshi Sahara * 
namespaces: 620*173bfbcaSSatoshi Sahara * @include:namespace (or ns:include:namespace) 621*173bfbcaSSatoshi Sahara * ^exclude:namespace (or -ns:exclude:namespace) 622*173bfbcaSSatoshi Sahara * groups: 623*173bfbcaSSatoshi Sahara * () 624*173bfbcaSSatoshi Sahara * -() 625*173bfbcaSSatoshi Sahara * operators: 626*173bfbcaSSatoshi Sahara * and ('and' is the default operator: you can always omit this) 627*173bfbcaSSatoshi Sahara * or (or pipe symbol '|', lower precedence than 'and') 628*173bfbcaSSatoshi Sahara * 629*173bfbcaSSatoshi Sahara * e.g. a query [ aa "bb cc" @dd:ee ] means "search pages which contain 630*173bfbcaSSatoshi Sahara * a word 'aa', a phrase 'bb cc' and are within a namespace 'dd:ee'". 631*173bfbcaSSatoshi Sahara * this query is equivalent to [ -(-aa or -"bb cc" or -ns:dd:ee) ] 632*173bfbcaSSatoshi Sahara * as long as you don't mind hit counts. 633*173bfbcaSSatoshi Sahara * 634*173bfbcaSSatoshi Sahara * intermediate representation consists of the following parts: 635*173bfbcaSSatoshi Sahara * 636*173bfbcaSSatoshi Sahara * ( ) - group 637*173bfbcaSSatoshi Sahara * AND - logical and 638*173bfbcaSSatoshi Sahara * OR - logical or 639*173bfbcaSSatoshi Sahara * NOT - logical not 640*173bfbcaSSatoshi Sahara * W+:, W-:, W_: - word (underscore: no need to highlight) 641*173bfbcaSSatoshi Sahara * P+:, P-: - phrase (minus sign: logically in NOT group) 642*173bfbcaSSatoshi Sahara * N+:, N-: - namespace 643*173bfbcaSSatoshi Sahara */ 644*173bfbcaSSatoshi Sahara $parsed_query = ''; 645*173bfbcaSSatoshi Sahara $parens_level = 0; 646*173bfbcaSSatoshi Sahara $terms = preg_split('/(-?".*?")/u', \dokuwiki\Utf8\PhpString::strtolower($query), 647*173bfbcaSSatoshi Sahara -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY 648*173bfbcaSSatoshi Sahara ); 649*173bfbcaSSatoshi Sahara 650*173bfbcaSSatoshi Sahara foreach ($terms as $term) { 651*173bfbcaSSatoshi Sahara $parsed = ''; 652*173bfbcaSSatoshi Sahara if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) { 
653*173bfbcaSSatoshi Sahara // phrase-include and phrase-exclude 654*173bfbcaSSatoshi Sahara $not = $matches[1] ? 'NOT' : ''; 655*173bfbcaSSatoshi Sahara $parsed = $not . static::termParser($Indexer, $matches[2], false, true); 656*173bfbcaSSatoshi Sahara } else { 657*173bfbcaSSatoshi Sahara // fix incomplete phrase 658*173bfbcaSSatoshi Sahara $term = str_replace('"', ' ', $term); 659*173bfbcaSSatoshi Sahara 660*173bfbcaSSatoshi Sahara // fix parentheses 661*173bfbcaSSatoshi Sahara $term = str_replace(')' , ' ) ', $term); 662*173bfbcaSSatoshi Sahara $term = str_replace('(' , ' ( ', $term); 663*173bfbcaSSatoshi Sahara $term = str_replace('- (', ' -(', $term); 664*173bfbcaSSatoshi Sahara 665*173bfbcaSSatoshi Sahara // treat pipe symbols as 'OR' operators 666*173bfbcaSSatoshi Sahara $term = str_replace('|', ' or ', $term); 667*173bfbcaSSatoshi Sahara 668*173bfbcaSSatoshi Sahara // treat ideographic spaces (U+3000) as search term separators 669*173bfbcaSSatoshi Sahara // FIXME: some more separators? 
670*173bfbcaSSatoshi Sahara $term = preg_replace('/[ \x{3000}]+/u', ' ', $term); 671*173bfbcaSSatoshi Sahara $term = trim($term); 672*173bfbcaSSatoshi Sahara if ($term === '') continue; 673*173bfbcaSSatoshi Sahara 674*173bfbcaSSatoshi Sahara $tokens = explode(' ', $term); 675*173bfbcaSSatoshi Sahara foreach ($tokens as $token) { 676*173bfbcaSSatoshi Sahara if ($token === '(') { 677*173bfbcaSSatoshi Sahara // parenthesis-include-open 678*173bfbcaSSatoshi Sahara $parsed .= '('; 679*173bfbcaSSatoshi Sahara ++$parens_level; 680*173bfbcaSSatoshi Sahara } elseif ($token === '-(') { 681*173bfbcaSSatoshi Sahara // parenthesis-exclude-open 682*173bfbcaSSatoshi Sahara $parsed .= 'NOT('; 683*173bfbcaSSatoshi Sahara ++$parens_level; 684*173bfbcaSSatoshi Sahara } elseif ($token === ')') { 685*173bfbcaSSatoshi Sahara // parenthesis-any-close 686*173bfbcaSSatoshi Sahara if ($parens_level === 0) continue; 687*173bfbcaSSatoshi Sahara $parsed .= ')'; 688*173bfbcaSSatoshi Sahara $parens_level--; 689*173bfbcaSSatoshi Sahara } elseif ($token === 'and') { 690*173bfbcaSSatoshi Sahara // logical-and (do nothing) 691*173bfbcaSSatoshi Sahara } elseif ($token === 'or') { 692*173bfbcaSSatoshi Sahara // logical-or 693*173bfbcaSSatoshi Sahara $parsed .= 'OR'; 694*173bfbcaSSatoshi Sahara } elseif (preg_match('/^(?:\^|-ns:)(.+)$/u', $token, $matches)) { 695*173bfbcaSSatoshi Sahara // namespace-exclude 696*173bfbcaSSatoshi Sahara $parsed .= 'NOT(N+:'.$matches[1].')'; 697*173bfbcaSSatoshi Sahara } elseif (preg_match('/^(?:@|ns:)(.+)$/u', $token, $matches)) { 698*173bfbcaSSatoshi Sahara // namespace-include 699*173bfbcaSSatoshi Sahara $parsed .= '(N+:'.$matches[1].')'; 700*173bfbcaSSatoshi Sahara } elseif (preg_match('/^-(.+)$/', $token, $matches)) { 701*173bfbcaSSatoshi Sahara // word-exclude 702*173bfbcaSSatoshi Sahara $parsed .= 'NOT('.static::termParser($Indexer, $matches[1]).')'; 703*173bfbcaSSatoshi Sahara } else { 704*173bfbcaSSatoshi Sahara // word-include 705*173bfbcaSSatoshi Sahara $parsed 
.= static::termParser($Indexer, $token); 706*173bfbcaSSatoshi Sahara } 707*173bfbcaSSatoshi Sahara } 708*173bfbcaSSatoshi Sahara } 709*173bfbcaSSatoshi Sahara $parsed_query .= $parsed; 710*173bfbcaSSatoshi Sahara } 711*173bfbcaSSatoshi Sahara 712*173bfbcaSSatoshi Sahara // cleanup (very sensitive) 713*173bfbcaSSatoshi Sahara $parsed_query .= str_repeat(')', $parens_level); 714*173bfbcaSSatoshi Sahara do { 715*173bfbcaSSatoshi Sahara $parsed_query_old = $parsed_query; 716*173bfbcaSSatoshi Sahara $parsed_query = preg_replace('/(NOT)?\(\)/u', '', $parsed_query); 717*173bfbcaSSatoshi Sahara } while ($parsed_query !== $parsed_query_old); 718*173bfbcaSSatoshi Sahara $parsed_query = preg_replace('/(NOT|OR)+\)/u', ')' , $parsed_query); 719*173bfbcaSSatoshi Sahara $parsed_query = preg_replace('/(OR)+/u' , 'OR' , $parsed_query); 720*173bfbcaSSatoshi Sahara $parsed_query = preg_replace('/\(OR/u' , '(' , $parsed_query); 721*173bfbcaSSatoshi Sahara $parsed_query = preg_replace('/^OR|OR$/u' , '' , $parsed_query); 722*173bfbcaSSatoshi Sahara $parsed_query = preg_replace('/\)(NOT)?\(/u' , ')AND$1(', $parsed_query); 723*173bfbcaSSatoshi Sahara 724*173bfbcaSSatoshi Sahara // adjustment: make highlightings right 725*173bfbcaSSatoshi Sahara $parens_level = 0; 726*173bfbcaSSatoshi Sahara $notgrp_levels = array(); 727*173bfbcaSSatoshi Sahara $parsed_query_new = ''; 728*173bfbcaSSatoshi Sahara $tokens = preg_split('/(NOT\(|[()])/u', $parsed_query, 729*173bfbcaSSatoshi Sahara -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY 730*173bfbcaSSatoshi Sahara ); 731*173bfbcaSSatoshi Sahara foreach ($tokens as $token) { 732*173bfbcaSSatoshi Sahara if ($token === 'NOT(') { 733*173bfbcaSSatoshi Sahara $notgrp_levels[] = ++$parens_level; 734*173bfbcaSSatoshi Sahara } elseif ($token === '(') { 735*173bfbcaSSatoshi Sahara ++$parens_level; 736*173bfbcaSSatoshi Sahara } elseif ($token === ')') { 737*173bfbcaSSatoshi Sahara if ($parens_level-- === end($notgrp_levels)) array_pop($notgrp_levels); 
738*173bfbcaSSatoshi Sahara } elseif (count($notgrp_levels) % 2 === 1) { 739*173bfbcaSSatoshi Sahara // turn highlight-flag off if terms are logically in "NOT" group 740*173bfbcaSSatoshi Sahara $token = preg_replace('/([WPN])\+\:/u', '$1-:', $token); 741*173bfbcaSSatoshi Sahara } 742*173bfbcaSSatoshi Sahara $parsed_query_new .= $token; 743*173bfbcaSSatoshi Sahara } 744*173bfbcaSSatoshi Sahara $parsed_query = $parsed_query_new; 745*173bfbcaSSatoshi Sahara 746*173bfbcaSSatoshi Sahara /** 747*173bfbcaSSatoshi Sahara * convert infix notation string into postfix (Reverse Polish notation) array 748*173bfbcaSSatoshi Sahara * by Shunting-yard algorithm 749*173bfbcaSSatoshi Sahara * 750*173bfbcaSSatoshi Sahara * see: http://en.wikipedia.org/wiki/Reverse_Polish_notation 751*173bfbcaSSatoshi Sahara * see: http://en.wikipedia.org/wiki/Shunting-yard_algorithm 752*173bfbcaSSatoshi Sahara */ 753*173bfbcaSSatoshi Sahara $parsed_ary = array(); 754*173bfbcaSSatoshi Sahara $ope_stack = array(); 755*173bfbcaSSatoshi Sahara $ope_precedence = array(')' => 1, 'OR' => 2, 'AND' => 3, 'NOT' => 4, '(' => 5); 756*173bfbcaSSatoshi Sahara $ope_regex = '/([()]|OR|AND|NOT)/u'; 757*173bfbcaSSatoshi Sahara 758*173bfbcaSSatoshi Sahara $tokens = preg_split($ope_regex, $parsed_query, 759*173bfbcaSSatoshi Sahara -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY 760*173bfbcaSSatoshi Sahara ); 761*173bfbcaSSatoshi Sahara foreach ($tokens as $token) { 762*173bfbcaSSatoshi Sahara if (preg_match($ope_regex, $token)) { 763*173bfbcaSSatoshi Sahara // operator 764*173bfbcaSSatoshi Sahara $last_ope = end($ope_stack); 765*173bfbcaSSatoshi Sahara while ($last_ope !== false 766*173bfbcaSSatoshi Sahara && $ope_precedence[$token] <= $ope_precedence[$last_ope] 767*173bfbcaSSatoshi Sahara && $last_ope != '(' 768*173bfbcaSSatoshi Sahara ) { 769*173bfbcaSSatoshi Sahara $parsed_ary[] = array_pop($ope_stack); 770*173bfbcaSSatoshi Sahara $last_ope = end($ope_stack); 771*173bfbcaSSatoshi Sahara } 772*173bfbcaSSatoshi 
Sahara if ($token == ')') { 773*173bfbcaSSatoshi Sahara array_pop($ope_stack); // this array_pop always deletes '(' 774*173bfbcaSSatoshi Sahara } else { 775*173bfbcaSSatoshi Sahara $ope_stack[] = $token; 776*173bfbcaSSatoshi Sahara } 777*173bfbcaSSatoshi Sahara } else { 778*173bfbcaSSatoshi Sahara // operand 779*173bfbcaSSatoshi Sahara $token_decoded = str_replace(['OP','CP'], ['(',')'], $token); 780*173bfbcaSSatoshi Sahara $parsed_ary[] = $token_decoded; 781*173bfbcaSSatoshi Sahara } 782*173bfbcaSSatoshi Sahara } 783*173bfbcaSSatoshi Sahara $parsed_ary = array_values(array_merge($parsed_ary, array_reverse($ope_stack))); 784*173bfbcaSSatoshi Sahara 785*173bfbcaSSatoshi Sahara // cleanup: each double "NOT" in RPN array actually does nothing 786*173bfbcaSSatoshi Sahara $parsed_ary_count = count($parsed_ary); 787*173bfbcaSSatoshi Sahara for ($i = 1; $i < $parsed_ary_count; ++$i) { 788*173bfbcaSSatoshi Sahara if ($parsed_ary[$i] === 'NOT' && $parsed_ary[$i - 1] === 'NOT') { 789*173bfbcaSSatoshi Sahara unset($parsed_ary[$i], $parsed_ary[$i - 1]); 790*173bfbcaSSatoshi Sahara } 791*173bfbcaSSatoshi Sahara } 792*173bfbcaSSatoshi Sahara $parsed_ary = array_values($parsed_ary); 793*173bfbcaSSatoshi Sahara 794*173bfbcaSSatoshi Sahara // build return value 795*173bfbcaSSatoshi Sahara $q = array(); 796*173bfbcaSSatoshi Sahara $q['query'] = $query; 797*173bfbcaSSatoshi Sahara $q['parsed_str'] = $parsed_query; 798*173bfbcaSSatoshi Sahara $q['parsed_ary'] = $parsed_ary; 799*173bfbcaSSatoshi Sahara 800*173bfbcaSSatoshi Sahara foreach ($q['parsed_ary'] as $token) { 801*173bfbcaSSatoshi Sahara if ($token[2] !== ':') continue; 802*173bfbcaSSatoshi Sahara $body = substr($token, 3); 803*173bfbcaSSatoshi Sahara 804*173bfbcaSSatoshi Sahara switch (substr($token, 0, 3)) { 805*173bfbcaSSatoshi Sahara case 'N+:': 806*173bfbcaSSatoshi Sahara $q['ns'][] = $body; // for backward compatibility 807*173bfbcaSSatoshi Sahara break; 808*173bfbcaSSatoshi Sahara case 'N-:': 809*173bfbcaSSatoshi Sahara 
$q['notns'][] = $body; // for backward compatibility 810*173bfbcaSSatoshi Sahara break; 811*173bfbcaSSatoshi Sahara case 'W_:': 812*173bfbcaSSatoshi Sahara $q['words'][] = $body; 813*173bfbcaSSatoshi Sahara break; 814*173bfbcaSSatoshi Sahara case 'W-:': 815*173bfbcaSSatoshi Sahara $q['words'][] = $body; 816*173bfbcaSSatoshi Sahara $q['not'][] = $body; // for backward compatibility 817*173bfbcaSSatoshi Sahara break; 818*173bfbcaSSatoshi Sahara case 'W+:': 819*173bfbcaSSatoshi Sahara $q['words'][] = $body; 820*173bfbcaSSatoshi Sahara $q['highlight'][] = $body; 821*173bfbcaSSatoshi Sahara $q['and'][] = $body; // for backward compatibility 822*173bfbcaSSatoshi Sahara break; 823*173bfbcaSSatoshi Sahara case 'P-:': 824*173bfbcaSSatoshi Sahara $q['phrases'][] = $body; 825*173bfbcaSSatoshi Sahara break; 826*173bfbcaSSatoshi Sahara case 'P+:': 827*173bfbcaSSatoshi Sahara $q['phrases'][] = $body; 828*173bfbcaSSatoshi Sahara $q['highlight'][] = $body; 829*173bfbcaSSatoshi Sahara break; 830*173bfbcaSSatoshi Sahara } 831*173bfbcaSSatoshi Sahara } 832*173bfbcaSSatoshi Sahara foreach (['words', 'phrases', 'highlight', 'ns', 'notns', 'and', 'not'] as $key) { 833*173bfbcaSSatoshi Sahara $q[$key] = empty($q[$key]) ? array() : array_values(array_unique($q[$key])); 834*173bfbcaSSatoshi Sahara } 835*173bfbcaSSatoshi Sahara 836*173bfbcaSSatoshi Sahara return $q; 837*173bfbcaSSatoshi Sahara } 838*173bfbcaSSatoshi Sahara 839*173bfbcaSSatoshi Sahara /** 840*173bfbcaSSatoshi Sahara * Transforms given search term into intermediate representation 841*173bfbcaSSatoshi Sahara * 842*173bfbcaSSatoshi Sahara * This function is used in ft_queryParser() and not for general purpose use. 
843*173bfbcaSSatoshi Sahara * 844*173bfbcaSSatoshi Sahara * @author Kazutaka Miyasaka <kazmiya@gmail.com> 845*173bfbcaSSatoshi Sahara * 846*173bfbcaSSatoshi Sahara * @param Doku_Indexer $Indexer 847*173bfbcaSSatoshi Sahara * @param string $term 848*173bfbcaSSatoshi Sahara * @param bool $consider_asian 849*173bfbcaSSatoshi Sahara * @param bool $phrase_mode 850*173bfbcaSSatoshi Sahara * @return string 851*173bfbcaSSatoshi Sahara */ 852*173bfbcaSSatoshi Sahara public static function termParser($Indexer, $term, $consider_asian = true, $phrase_mode = false) 853*173bfbcaSSatoshi Sahara { 854*173bfbcaSSatoshi Sahara $parsed = ''; 855*173bfbcaSSatoshi Sahara if ($consider_asian) { 856*173bfbcaSSatoshi Sahara // successive asian characters need to be searched as a phrase 857*173bfbcaSSatoshi Sahara $words = \dokuwiki\Utf8\Asian::splitAsianWords($term); 858*173bfbcaSSatoshi Sahara foreach ($words as $word) { 859*173bfbcaSSatoshi Sahara $phrase_mode = $phrase_mode ? true : \dokuwiki\Utf8\Asian::isAsianWords($word); 860*173bfbcaSSatoshi Sahara $parsed .= static::termParser($Indexer, $word, false, $phrase_mode); 861*173bfbcaSSatoshi Sahara } 862*173bfbcaSSatoshi Sahara } else { 863*173bfbcaSSatoshi Sahara $term_noparen = str_replace(['(',')'], ' ', $term); 864*173bfbcaSSatoshi Sahara $words = $Indexer->tokenizer($term_noparen, true); 865*173bfbcaSSatoshi Sahara 866*173bfbcaSSatoshi Sahara // W_: no need to highlight 867*173bfbcaSSatoshi Sahara if (empty($words)) { 868*173bfbcaSSatoshi Sahara $parsed = '()'; // important: do not remove 869*173bfbcaSSatoshi Sahara } elseif ($words[0] === $term) { 870*173bfbcaSSatoshi Sahara $parsed = '(W+:'.$words[0].')'; 871*173bfbcaSSatoshi Sahara } elseif ($phrase_mode) { 872*173bfbcaSSatoshi Sahara $term_encoded = str_replace(['(',')'], ['OP','CP'], $term); 873*173bfbcaSSatoshi Sahara $parsed = '((W_:'.implode(')(W_:', $words).')(P+:'.$term_encoded.'))'; 874*173bfbcaSSatoshi Sahara } else { 875*173bfbcaSSatoshi Sahara $parsed = 
'((W+:'.implode(')(W+:', $words).'))'; 876*173bfbcaSSatoshi Sahara } 877*173bfbcaSSatoshi Sahara } 878*173bfbcaSSatoshi Sahara return $parsed; 879*173bfbcaSSatoshi Sahara } 880*173bfbcaSSatoshi Sahara 881*173bfbcaSSatoshi Sahara /** 882*173bfbcaSSatoshi Sahara * Recreate a search query string based on parsed parts, 883*173bfbcaSSatoshi Sahara * doesn't support negated phrases and `OR` searches 884*173bfbcaSSatoshi Sahara * 885*173bfbcaSSatoshi Sahara * @param array $and 886*173bfbcaSSatoshi Sahara * @param array $not 887*173bfbcaSSatoshi Sahara * @param array $phrases 888*173bfbcaSSatoshi Sahara * @param array $ns 889*173bfbcaSSatoshi Sahara * @param array $notns 890*173bfbcaSSatoshi Sahara * 891*173bfbcaSSatoshi Sahara * @return string 892*173bfbcaSSatoshi Sahara */ 893*173bfbcaSSatoshi Sahara public static function queryUnparser_simple( 894*173bfbcaSSatoshi Sahara array $and, array $not, array $phrases, array $ns, array $notns 895*173bfbcaSSatoshi Sahara ) { 896*173bfbcaSSatoshi Sahara $query = implode(' ', $and); 897*173bfbcaSSatoshi Sahara 898*173bfbcaSSatoshi Sahara if (!empty($not)) { 899*173bfbcaSSatoshi Sahara $query .= ' -' . implode(' -', $not); 900*173bfbcaSSatoshi Sahara } 901*173bfbcaSSatoshi Sahara if (!empty($phrases)) { 902*173bfbcaSSatoshi Sahara $query .= ' "' . implode('" "', $phrases) . '"'; 903*173bfbcaSSatoshi Sahara } 904*173bfbcaSSatoshi Sahara if (!empty($ns)) { 905*173bfbcaSSatoshi Sahara $query .= ' @' . implode(' @', $ns); 906*173bfbcaSSatoshi Sahara } 907*173bfbcaSSatoshi Sahara if (!empty($notns)) { 908*173bfbcaSSatoshi Sahara $query .= ' ^' . implode(' ^', $notns); 909*173bfbcaSSatoshi Sahara } 910*173bfbcaSSatoshi Sahara return $query; 911*173bfbcaSSatoshi Sahara } 912*173bfbcaSSatoshi Sahara} 913