<?php
namespace dokuwiki\Search;

use dokuwiki\Extension\Event;
use dokuwiki\Search\Indexer;
use dokuwiki\Utf8;

/**
 * Class DokuWiki Fulltext Search
 *
 * Static-only collection of the fulltext search, page lookup and snippet
 * helpers. The constructor is protected, all entry points are static methods.
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author  Andreas Gohr <andi@splitbrain.org>
 */
class FulltextSearch
{
    /**
     * Fulltext Search constructor. prevent direct object creation
     */
    protected function __construct() {}

    /**
     * The fulltext search
     *
     * Returns a list of matching documents for the given query.
     *
     * The arguments are packed into an event data array and the
     * SEARCH_QUERY_FULLPAGE event is triggered with callback_pageSearch()
     * as its default action.
     *
     * @triggers SEARCH_QUERY_FULLPAGE
     *
     * @param string     $query
     * @param array      $highlight  passed by reference; the default action stores the
     *                               terms to highlight here
     * @param string     $sort       'mtime' sorts by modification time, anything else
     *                               sorts by hit count; null defaults to 'hits'
     * @param int|string $after      only show results with mtime after this date,
     *                               accepts timestamp or strtotime arguments
     * @param int|string $before     only show results with mtime before this date,
     *                               accepts timestamp or strtotime arguments
     *
     * @return array
     */
    public static function pageSearch($query, &$highlight, $sort = null, $after = null, $before = null)
    {
        if ($sort === null) {
            $sort = 'hits';
        }
        $data = [
            'query' => $query,
            'sort' => $sort,
            'after' => $after,
            'before' => $before
        ];
        // keep a reference so that the action can hand the highlight terms back to the caller
        $data['highlight'] =& $highlight;
        $action = static::class.'::callback_pageSearch';
        return Event::createAndTrigger('SEARCH_QUERY_FULLPAGE', $data, $action);
    }

    /**
     * Returns a list of matching documents for the given query
     *
     * Default action of the SEARCH_QUERY_FULLPAGE event. The parsed query is
     * evaluated token by token on a stack of page sets (page id => hit count).
     *
     * @triggers FULLTEXT_PHRASE_MATCH
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
     *
     * @param array $data  event data; reads 'query', 'sort', 'after', 'before'
     *                     and overwrites 'highlight'
     * @return array matching documents
     */
    public static function callback_pageSearch(&$data)
    {
        $Indexer = Indexer::getInstance();

        // parse the given query
        $q = static::queryParser($Indexer, $data['query']);
        $data['highlight'] = $q['highlight'];

        if (empty($q['parsed_ary'])) return array();

        // lookup all words found in the query
        $lookup = $Indexer->lookup($q['words']);

        // get all pages in this dokuwiki site (!: includes nonexistent pages)
        $pages_all = array();
        foreach ($Indexer->getPages() as $id) {
            $pages_all[$id] = 0; // base: 0 hit
        }

        // process the query
        $stack = array();
        foreach ($q['parsed_ary'] as $token) {
            switch (substr($token, 0, 3)) {
                case 'W+:':
                case 'W-:':
                case 'W_:': // word
                    $word = substr($token, 3);
                    // a word missing from the lookup result simply matches no page
                    $stack[] = isset($lookup[$word]) ? (array) $lookup[$word] : array();
                    break;
                case 'P+:':
                case 'P-:': // phrase
                    $phrase = substr($token, 3);
                    // since phrases are always parsed as ((W1)(W2)...(P)),
                    // the end($stack) always points the pages that contain
                    // all words in this phrase
                    $pages = end($stack);
                    $pages_matched = array();
                    foreach (array_keys($pages) as $id) {
                        $evdata = array(
                            'id' => $id,
                            'phrase' => $phrase,
                            'text' => rawWiki($id)
                        );
                        $evt = new Event('FULLTEXT_PHRASE_MATCH', $evdata);
                        // default matching: case-insensitive substring search,
                        // skipped when a handler already reported a match
                        if ($evt->advise_before() && $evt->result !== true) {
                            $text = Utf8\PhpString::strtolower($evdata['text']);
                            if (strpos($text, $phrase) !== false) {
                                $evt->result = true;
                            }
                        }
                        $evt->advise_after();
                        if ($evt->result === true) {
                            $pages_matched[$id] = 0; // phrase: always 0 hit
                        }
                    }
                    $stack[] = $pages_matched;
                    break;
                case 'N+:':
                case 'N-:': // namespace
                    $ns = cleanID(substr($token, 3)) . ':';
                    $pages_matched = array();
                    foreach (array_keys($pages_all) as $id) {
                        if (strpos($id, $ns) === 0) {
                            $pages_matched[$id] = 0; // namespace: always 0 hit
                        }
                    }
                    $stack[] = $pages_matched;
                    break;
                case 'AND': // and operation
                    list($pages1, $pages2) = array_splice($stack, -2);
                    $stack[] = static::resultCombine(array($pages1, $pages2));
                    break;
                case 'OR':  // or operation
                    list($pages1, $pages2) = array_splice($stack, -2);
                    $stack[] = static::resultUnite(array($pages1, $pages2));
                    break;
                case 'NOT': // not operation (unary)
                    $pages = array_pop($stack);
                    $stack[] = static::resultComplement(array($pages_all, $pages));
                    break;
            }
        }
        $docs = array_pop($stack);

        if (empty($docs)) return array();

        // check: settings, acls, existence
        foreach (array_keys($docs) as $id) {
            if (isHiddenPage($id)
                || auth_quickaclcheck($id) < AUTH_READ
                || !page_exists($id, '', false)
            ) {
                unset($docs[$id]);
            }
        }

        $docs = static::filterResultsByTime($docs, $data['after'], $data['before']);

        if ($data['sort'] === 'mtime') {
            uksort($docs, static::class.'::pagemtimesorter');
        } else {
            // sort docs by count
            arsort($docs);
        }

        return $docs;
    }

    /**
     * Quicksearch for pagenames
     *
     * By default it only matches the pagename and ignores the
     * namespace. This can be changed with the second parameter.
     * The third parameter allows to search in titles as well.
     *
     * The function always returns titles as well
     *
     * @triggers SEARCH_QUERY_PAGELOOKUP
     * @author   Andreas Gohr <andi@splitbrain.org>
     * @author   Adrian Lang <lang@cosmocode.de>
     *
     * @param string     $id       page id
     * @param bool       $in_ns    match against namespace as well?
     * @param bool       $in_title search in title?
     * @param int|string $after    only show results with mtime after this date,
     *                             accepts timestamp or strtotime arguments
     * @param int|string $before   only show results with mtime before this date,
     *                             accepts timestamp or strtotime arguments
     *
     * @return string[]
     */
    public static function pageLookup($id, $in_ns=false, $in_title=false, $after = null, $before = null)
    {
        $data = [
            'id' => $id,
            'in_ns' => $in_ns,
            'in_title' => $in_title,
            'after' => $after,
            'before' => $before
        ];
        $data['has_titles'] = true; // for plugin backward compatibility check
        $action = static::class.'::callback_pageLookup';
        return Event::createAndTrigger('SEARCH_QUERY_PAGELOOKUP', $data, $action);
    }

    /**
     * Returns list of pages as array(pageid => First Heading)
     *
     * Default action of the SEARCH_QUERY_PAGELOOKUP event.
     *
     * @param array &$data  event data; reads 'id', 'in_ns', 'in_title', 'after', 'before'
     * @return string[]
     */
    public static function callback_pageLookup(&$data)
    {
        $Indexer = Indexer::getInstance();

        // split out original parameters
        $id = $data['id'];
        $parsedQuery = static::queryParser($Indexer, $id);
        if (count($parsedQuery['ns']) > 0) {
            // a namespace was given in the query: restrict to the first one and
            // search for the remaining terms only
            $ns = cleanID($parsedQuery['ns'][0]) . ':';
            $id = implode(' ', $parsedQuery['highlight']);
        }

        $in_ns    = $data['in_ns'];
        $in_title = $data['in_title'];
        $cleaned  = cleanID($id);

        $pages = array();
        if ($id !== '' && $cleaned !== '') {
            // match against the page ids
            $page_idx = $Indexer->getPages();
            foreach ($page_idx as $p_id) {
                if ((strpos($in_ns ? $p_id : noNSorNS($p_id), $cleaned) !== false)) {
                    if (!isset($pages[$p_id])) {
                        $pages[$p_id] = p_get_first_heading($p_id, METADATA_DONT_RENDER);
                    }
                }
            }
            // match against the indexed titles
            if ($in_title) {
                $func = static::class.'::pageLookupTitleCompare';
                foreach ($Indexer->lookupKey('title', $id, $func) as $p_id) {
                    if (!isset($pages[$p_id])) {
                        $pages[$p_id] = p_get_first_heading($p_id, METADATA_DONT_RENDER);
                    }
                }
            }
        }

        // $ns is only set when the query contained a namespace restriction
        if (isset($ns)) {
            foreach (array_keys($pages) as $p_id) {
                if (strpos($p_id, $ns) !== 0) {
                    unset($pages[$p_id]);
                }
            }
        }

        // discard hidden pages
        // discard nonexistent pages
        // check ACL permissions
        foreach (array_keys($pages) as $idx) {
            if (!isVisiblePage($idx) || !page_exists($idx) || auth_quickaclcheck($idx) < AUTH_READ) {
                unset($pages[$idx]);
            }
        }

        $pages = static::filterResultsByTime($pages, $data['after'], $data['before']);

        uksort($pages, static::class.'::pagesorter');
        return $pages;
    }

    /**
     * Drops all results whose page file was modified outside the given time window
     *
     * A bound that is empty (null, '', 0) is not applied.
     *
     * @param array      $results search results in the form pageid => value
     * @param int|string $after   only returns results with mtime after this date,
     *                            accepts timestamp or strtotime arguments
     * @param int|string $before  only returns results with mtime before this date,
     *                            accepts timestamp or strtotime arguments
     *
     * @return array
     */
    protected static function filterResultsByTime(array $results, $after, $before)
    {
        if ($after || $before) {
            // only convert bounds that were actually given: calling strtotime()
            // on an unset bound is pointless and passing null to it is deprecated
            if ($after && !is_int($after)) {
                $after = strtotime($after);
            }
            if ($before && !is_int($before)) {
                $before = strtotime($before);
            }

            foreach ($results as $id => $value) {
                $mTime = filemtime(wikiFN($id));
                if ($after && $after > $mTime) {
                    unset($results[$id]);
                    continue;
                }
                if ($before && $before < $mTime) {
                    unset($results[$id]);
                }
            }
        }

        return $results;
    }

    /**
     * Tiny helper function for comparing the searched title with the title
     * from the search index. This function is a wrapper around stripos with
     * adapted argument order and return value.
     *
     * @param string $search searched title
     * @param string $title  title from index
     * @return bool
     */
    public static function pageLookupTitleCompare($search, $title)
    {
        return stripos($title, $search) !== false;
    }

    /**
     * Sort pages based on their namespace level first, then on their string
     * values. This makes higher hierarchy pages rank higher than lower hierarchy
     * pages.
     *
     * @param string $a
     * @param string $b
     * @return int Returns < 0 if $a is less than $b; > 0 if $a is greater than $b,
     *             and 0 if they are equal.
     */
    protected static function pagesorter($a, $b)
    {
        // number of colon separated parts = namespace depth + 1
        $ac = count(explode(':',$a));
        $bc = count(explode(':',$b));
        if ($ac < $bc) {
            return -1;
        } elseif ($ac > $bc) {
            return 1;
        }
        return strcmp ($a,$b);
    }

    /**
     * Sort pages by their mtime, from newest to oldest
     *
     * @param string $a
     * @param string $b
     *
     * @return int Returns < 0 if $a is newer than $b, > 0 if $b is newer than $a
     *             and 0 if they are of the same age
     */
    protected static function pagemtimesorter($a, $b)
    {
        $mtimeA = filemtime(wikiFN($a));
        $mtimeB = filemtime(wikiFN($b));
        return $mtimeB - $mtimeA;
    }

    /**
     * Creates a snippet extract
     *
     * Unless a FULLTEXT_SNIPPET_CREATE handler prevents the default, up to four
     * matches are searched in the raw page text, each is cut out together with
     * some surrounding context, the pieces are joined with '... ' and the
     * matched terms are wrapped in <strong class="search_hit">.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @triggers FULLTEXT_SNIPPET_CREATE
     *
     * @param string $id page id
     * @param array $highlight
     * @return mixed
     */
    public static function snippet($id, $highlight)
    {
        $text = rawWiki($id);
        $text = str_replace("\xC2\xAD",'',$text); // remove soft-hyphens
        $evdata = array(
            'id' => $id,
            'text' => &$text,
            'highlight' => &$highlight,
            'snippet' => '',
        );

        $evt = new Event('FULLTEXT_SNIPPET_CREATE', $evdata);
        if ($evt->advise_before()) {
            $match = array();
            $snippets = array();
            $utf8_offset = $offset = $end = 0;
            $len = Utf8\PhpString::strlen($text);

            // build a regexp from the phrases to highlight
            $re1 = '(' .
                join(
                    '|',
                    array_map(
                        static::class.'::snippet_re_preprocess',
                        array_map(
                            'preg_quote_cb',
                            array_filter((array) $highlight)
                        )
                    )
                ) .
                ')';
            // two resp. three different terms close to each other
            $re2 = "$re1.{0,75}(?!\\1)$re1";
            $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1";

            // at most four match attempts
            for ($cnt=4; $cnt--;) {
                // prefer three terms over two over one; the first pattern that
                // matches fills $match, the remaining ones are not evaluated
                $found = preg_match('/'.$re3.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)
                    || preg_match('/'.$re2.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)
                    || preg_match('/'.$re1.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset);
                if (!$found) {
                    break;
                }

                list($str, $idx) = $match[0];

                // convert $idx (a byte offset) into a utf8 character offset
                $utf8_idx = Utf8\PhpString::strlen(substr($text, 0, $idx));
                $utf8_len = Utf8\PhpString::strlen($str);

                // establish context, 100 characters surrounding the match string
                // first look to see if we can go 100 either side,
                // then drop to 50 adding any excess if the other side can't go to 50,
                $pre = min($utf8_idx - $utf8_offset, 100);
                $post = min($len - $utf8_idx - $utf8_len, 100);

                if ($pre > 50 && $post > 50) {
                    $pre = $post = 50;
                } elseif ($pre > 50) {
                    $pre = min($pre, 100 - $post);
                } elseif ($post > 50) {
                    $post = min($post, 100 - $pre);
                } elseif ($offset == 0) {
                    // both are less than 50, means the context is the whole string
                    // make it so and break out of this loop - there is no need for the
                    // complex snippet calculations
                    $snippets = array($text);
                    break;
                }

                // establish context start and end points, try to append to previous
                // context if possible
                $start = $utf8_idx - $pre;
                $append = ($start < $end) ? $end : false;  // still the end of the previous context snippet
                $end = $utf8_idx + $utf8_len + $post;      // now set it to the end of this context

                if ($append) {
                    $snippets[count($snippets)-1] .= Utf8\PhpString::substr($text,$append,$end-$append);
                } else {
                    $snippets[] = Utf8\PhpString::substr($text,$start,$end-$start);
                }

                // set $offset for next match attempt
                // continue matching after the current match
                // if the current match is not the longest possible match starting at the current offset
                // this prevents further matching of this snippet but for possible matches of length
                // smaller than match length + context (at least 50 characters) this match is part of the context
                $utf8_offset = $utf8_idx + $utf8_len;
                $offset = $idx + strlen(Utf8\PhpString::substr($text,$utf8_idx,$utf8_len));
                $offset = Utf8\Clean::correctIdx($text,$offset);
            }

            // mark the hits with a control character (0x01) first, so that the
            // markers survive hsc() and can be turned into HTML afterwards
            $m = "\1";
            $snippets = preg_replace('/'.$re1.'/iu', $m.'$1'.$m, $snippets);
            $snippet = preg_replace(
                '/' . $m . '([^' . $m . ']*?)' . $m . '/iu',
                '<strong class="search_hit">$1</strong>',
                hsc(join('... ', $snippets))
            );

            $evdata['snippet'] = $snippet;
        }
        $evt->advise_after();
        unset($evt);

        return $evdata['snippet'];
    }

    /**
     * Wraps a search term in regex boundary checks.
     *
     * A leading or trailing (already quoted) asterisk is removed and the
     * boundary check on that side is left out. A term consisting of boundary
     * checks only is reduced to an empty string.
     *
     * @param string $term
     * @return string
     */
    public static function snippet_re_preprocess($term)
    {
        // do not process asian terms where word boundaries are not explicit
        if (Utf8\Asian::isAsianWords($term)) return $term;

        if (UTF8_PROPERTYSUPPORT) {
            // unicode word boundaries
            // see http://stackoverflow.com/a/2449017/172068
            $BL = '(?<!\pL)';
            $BR = '(?!\pL)';
        } else {
            // not as correct as above, but at least won't break
            $BL = '\b';
            $BR = '\b';
        }

        if (substr($term, 0, 2) == '\\*') {
            $term = substr($term, 2);
        } else {
            $term = $BL.$term;
        }

        if (substr($term, -2, 2) == '\\*') {
            $term = substr($term, 0, -2);
        } else {
            $term = $term.$BR;
        }

        if ($term == $BL || $term == $BR || $term == $BL.$BR) {
            $term = '';
        }
        return $term;
    }

    /**
     * Combine found documents and sum up their scores
     *
     * This function is used to combine searched words with a logical
     * AND. Only documents available in all arrays are returned.
     *
     * based upon PEAR's PHP_Compat function for array_intersect_key()
     *
     * @param array $args An array of page arrays
     * @return array
     */
    protected static function resultCombine($args)
    {
        $array_count = count($args);
        if ($array_count == 1) {
            return $args[0];
        }

        $result = array();
        if ($array_count > 1) {
            foreach ($args[0] as $key => $value) {
                $result[$key] = $value;
                for ($i = 1; $i !== $array_count; $i++) {
                    if (!isset($args[$i][$key])) {
                        // missing in one of the arrays: not part of the intersection
                        unset($result[$key]);
                        break;
                    }
                    $result[$key] += $args[$i][$key];
                }
            }
        }
        return $result;
    }

    /**
     * Unites found documents and sum up their scores
     * based upon resultCombine() method
     *
     * @param array $args An array of page arrays
     * @return array
     *
     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
     */
    protected static function resultUnite($args)
    {
        $array_count = count($args);
        if ($array_count === 1) {
            return $args[0];
        }

        $result = $args[0];
        for ($i = 1; $i !== $array_count; $i++) {
            foreach ($args[$i] as $id => $hits) {
                // pages not seen so far start at 0 instead of reading an undefined key
                $result[$id] = (isset($result[$id]) ? $result[$id] : 0) + $hits;
            }
        }
        return $result;
    }

    /**
     * Computes the difference of documents using page id for comparison
     * nearly identical to PHP5's array_diff_key()
     *
     * @param array $args An array of page arrays
     * @return array
     *
     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
     */
    protected static function resultComplement($args)
    {
        $array_count = count($args);
        if ($array_count === 1) {
            return $args[0];
        }

        $result = $args[0];
        foreach (array_keys($result) as $id) {
            for ($i = 1; $i !== $array_count; $i++) {
                if (isset($args[$i][$id])) unset($result[$id]);
            }
        }
        return $result;
    }

    /**
     * Parses a search query and builds an array of search formulas
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
     *
     * @param Indexer $Indexer
     * @param string $query search query
     * @return array of search formulas
     */
    public static function queryParser($Indexer, $query)
    {
        /**
         * parse a search query and transform it into intermediate representation
         *
         * in a search query, you can use the following expressions:
         *
         *   words:
         *     include
         *     -exclude
         *   phrases:
         *     "phrase to be included"
         *     -"phrase you want to exclude"
         *   namespaces:
         *     @include:namespace (or ns:include:namespace)
         *     ^exclude:namespace (or -ns:exclude:namespace)
         *   groups:
         *     ()
         *     -()
         *   operators:
         *     and ('and' is the default operator: you can always omit this)
         *     or  (or pipe symbol '|', lower precedence than 'and')
         *
         * e.g. a query [ aa "bb cc" @dd:ee ] means "search pages which contain
         *      a word 'aa', a phrase 'bb cc' and are within a namespace 'dd:ee'".
         *      this query is equivalent to [ -(-aa or -"bb cc" or -ns:dd:ee) ]
         *      as long as you don't mind hit counts.
         *
         * intermediate representation consists of the following parts:
         *
         *   ( )           - group
         *   AND           - logical and
         *   OR            - logical or
         *   NOT           - logical not
         *   W+:, W-:, W_: - word      (underscore: no need to highlight)
         *   P+:, P-:      - phrase    (minus sign: logically in NOT group)
         *   N+:, N-:      - namespace
         */
        $parsed_query = '';
        $parens_level = 0;
        $terms = preg_split('/(-?".*?")/u', \dokuwiki\Utf8\PhpString::strtolower($query),
            -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );

        foreach ($terms as $term) {
            $parsed = '';
            if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) {
                // phrase-include and phrase-exclude
                $not = $matches[1] ? 'NOT' : '';
                $parsed = $not . static::termParser($Indexer, $matches[2], false, true);
            } else {
                // fix incomplete phrase
                $term = str_replace('"', ' ', $term);

                // fix parentheses
                $term = str_replace(')' , ' ) ', $term);
                $term = str_replace('(' , ' ( ', $term);
                $term = str_replace('- (', ' -(', $term);

                // treat pipe symbols as 'OR' operators
                $term = str_replace('|', ' or ', $term);

                // treat ideographic spaces (U+3000) as search term separators
                // FIXME: some more separators?
                $term = preg_replace('/[ \x{3000}]+/u', ' ', $term);
                $term = trim($term);
                if ($term === '') continue;

                $tokens = explode(' ', $term);
                foreach ($tokens as $token) {
                    if ($token === '(') {
                        // parenthesis-include-open
                        $parsed .= '(';
                        ++$parens_level;
                    } elseif ($token === '-(') {
                        // parenthesis-exclude-open
                        $parsed .= 'NOT(';
                        ++$parens_level;
                    } elseif ($token === ')') {
                        // parenthesis-any-close
                        if ($parens_level === 0) continue;
                        $parsed .= ')';
                        $parens_level--;
                    } elseif ($token === 'and') {
                        // logical-and (do nothing)
                    } elseif ($token === 'or') {
                        //
logical-or 693173bfbcaSSatoshi Sahara $parsed .= 'OR'; 694173bfbcaSSatoshi Sahara } elseif (preg_match('/^(?:\^|-ns:)(.+)$/u', $token, $matches)) { 695173bfbcaSSatoshi Sahara // namespace-exclude 696173bfbcaSSatoshi Sahara $parsed .= 'NOT(N+:'.$matches[1].')'; 697173bfbcaSSatoshi Sahara } elseif (preg_match('/^(?:@|ns:)(.+)$/u', $token, $matches)) { 698173bfbcaSSatoshi Sahara // namespace-include 699173bfbcaSSatoshi Sahara $parsed .= '(N+:'.$matches[1].')'; 700173bfbcaSSatoshi Sahara } elseif (preg_match('/^-(.+)$/', $token, $matches)) { 701173bfbcaSSatoshi Sahara // word-exclude 702173bfbcaSSatoshi Sahara $parsed .= 'NOT('.static::termParser($Indexer, $matches[1]).')'; 703173bfbcaSSatoshi Sahara } else { 704173bfbcaSSatoshi Sahara // word-include 705173bfbcaSSatoshi Sahara $parsed .= static::termParser($Indexer, $token); 706173bfbcaSSatoshi Sahara } 707173bfbcaSSatoshi Sahara } 708173bfbcaSSatoshi Sahara } 709173bfbcaSSatoshi Sahara $parsed_query .= $parsed; 710173bfbcaSSatoshi Sahara } 711173bfbcaSSatoshi Sahara 712173bfbcaSSatoshi Sahara // cleanup (very sensitive) 713173bfbcaSSatoshi Sahara $parsed_query .= str_repeat(')', $parens_level); 714173bfbcaSSatoshi Sahara do { 715173bfbcaSSatoshi Sahara $parsed_query_old = $parsed_query; 716173bfbcaSSatoshi Sahara $parsed_query = preg_replace('/(NOT)?\(\)/u', '', $parsed_query); 717173bfbcaSSatoshi Sahara } while ($parsed_query !== $parsed_query_old); 718173bfbcaSSatoshi Sahara $parsed_query = preg_replace('/(NOT|OR)+\)/u', ')' , $parsed_query); 719173bfbcaSSatoshi Sahara $parsed_query = preg_replace('/(OR)+/u' , 'OR' , $parsed_query); 720173bfbcaSSatoshi Sahara $parsed_query = preg_replace('/\(OR/u' , '(' , $parsed_query); 721173bfbcaSSatoshi Sahara $parsed_query = preg_replace('/^OR|OR$/u' , '' , $parsed_query); 722173bfbcaSSatoshi Sahara $parsed_query = preg_replace('/\)(NOT)?\(/u' , ')AND$1(', $parsed_query); 723173bfbcaSSatoshi Sahara 724173bfbcaSSatoshi Sahara // adjustment: make highlightings right 
725173bfbcaSSatoshi Sahara $parens_level = 0; 726173bfbcaSSatoshi Sahara $notgrp_levels = array(); 727173bfbcaSSatoshi Sahara $parsed_query_new = ''; 728173bfbcaSSatoshi Sahara $tokens = preg_split('/(NOT\(|[()])/u', $parsed_query, 729173bfbcaSSatoshi Sahara -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY 730173bfbcaSSatoshi Sahara ); 731173bfbcaSSatoshi Sahara foreach ($tokens as $token) { 732173bfbcaSSatoshi Sahara if ($token === 'NOT(') { 733173bfbcaSSatoshi Sahara $notgrp_levels[] = ++$parens_level; 734173bfbcaSSatoshi Sahara } elseif ($token === '(') { 735173bfbcaSSatoshi Sahara ++$parens_level; 736173bfbcaSSatoshi Sahara } elseif ($token === ')') { 737173bfbcaSSatoshi Sahara if ($parens_level-- === end($notgrp_levels)) array_pop($notgrp_levels); 738173bfbcaSSatoshi Sahara } elseif (count($notgrp_levels) % 2 === 1) { 739173bfbcaSSatoshi Sahara // turn highlight-flag off if terms are logically in "NOT" group 740173bfbcaSSatoshi Sahara $token = preg_replace('/([WPN])\+\:/u', '$1-:', $token); 741173bfbcaSSatoshi Sahara } 742173bfbcaSSatoshi Sahara $parsed_query_new .= $token; 743173bfbcaSSatoshi Sahara } 744173bfbcaSSatoshi Sahara $parsed_query = $parsed_query_new; 745173bfbcaSSatoshi Sahara 746173bfbcaSSatoshi Sahara /** 747173bfbcaSSatoshi Sahara * convert infix notation string into postfix (Reverse Polish notation) array 748173bfbcaSSatoshi Sahara * by Shunting-yard algorithm 749173bfbcaSSatoshi Sahara * 750173bfbcaSSatoshi Sahara * see: http://en.wikipedia.org/wiki/Reverse_Polish_notation 751173bfbcaSSatoshi Sahara * see: http://en.wikipedia.org/wiki/Shunting-yard_algorithm 752173bfbcaSSatoshi Sahara */ 753173bfbcaSSatoshi Sahara $parsed_ary = array(); 754173bfbcaSSatoshi Sahara $ope_stack = array(); 755173bfbcaSSatoshi Sahara $ope_precedence = array(')' => 1, 'OR' => 2, 'AND' => 3, 'NOT' => 4, '(' => 5); 756173bfbcaSSatoshi Sahara $ope_regex = '/([()]|OR|AND|NOT)/u'; 757173bfbcaSSatoshi Sahara 758173bfbcaSSatoshi Sahara $tokens = preg_split($ope_regex, 
$parsed_query, 759173bfbcaSSatoshi Sahara -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY 760173bfbcaSSatoshi Sahara ); 761173bfbcaSSatoshi Sahara foreach ($tokens as $token) { 762173bfbcaSSatoshi Sahara if (preg_match($ope_regex, $token)) { 763173bfbcaSSatoshi Sahara // operator 764173bfbcaSSatoshi Sahara $last_ope = end($ope_stack); 765173bfbcaSSatoshi Sahara while ($last_ope !== false 766173bfbcaSSatoshi Sahara && $ope_precedence[$token] <= $ope_precedence[$last_ope] 767173bfbcaSSatoshi Sahara && $last_ope != '(' 768173bfbcaSSatoshi Sahara ) { 769173bfbcaSSatoshi Sahara $parsed_ary[] = array_pop($ope_stack); 770173bfbcaSSatoshi Sahara $last_ope = end($ope_stack); 771173bfbcaSSatoshi Sahara } 772173bfbcaSSatoshi Sahara if ($token == ')') { 773173bfbcaSSatoshi Sahara array_pop($ope_stack); // this array_pop always deletes '(' 774173bfbcaSSatoshi Sahara } else { 775173bfbcaSSatoshi Sahara $ope_stack[] = $token; 776173bfbcaSSatoshi Sahara } 777173bfbcaSSatoshi Sahara } else { 778173bfbcaSSatoshi Sahara // operand 779173bfbcaSSatoshi Sahara $token_decoded = str_replace(['OP','CP'], ['(',')'], $token); 780173bfbcaSSatoshi Sahara $parsed_ary[] = $token_decoded; 781173bfbcaSSatoshi Sahara } 782173bfbcaSSatoshi Sahara } 783173bfbcaSSatoshi Sahara $parsed_ary = array_values(array_merge($parsed_ary, array_reverse($ope_stack))); 784173bfbcaSSatoshi Sahara 785173bfbcaSSatoshi Sahara // cleanup: each double "NOT" in RPN array actually does nothing 786173bfbcaSSatoshi Sahara $parsed_ary_count = count($parsed_ary); 787173bfbcaSSatoshi Sahara for ($i = 1; $i < $parsed_ary_count; ++$i) { 788173bfbcaSSatoshi Sahara if ($parsed_ary[$i] === 'NOT' && $parsed_ary[$i - 1] === 'NOT') { 789173bfbcaSSatoshi Sahara unset($parsed_ary[$i], $parsed_ary[$i - 1]); 790173bfbcaSSatoshi Sahara } 791173bfbcaSSatoshi Sahara } 792173bfbcaSSatoshi Sahara $parsed_ary = array_values($parsed_ary); 793173bfbcaSSatoshi Sahara 794173bfbcaSSatoshi Sahara // build return value 795173bfbcaSSatoshi Sahara $q 
= array(); 796173bfbcaSSatoshi Sahara $q['query'] = $query; 797173bfbcaSSatoshi Sahara $q['parsed_str'] = $parsed_query; 798173bfbcaSSatoshi Sahara $q['parsed_ary'] = $parsed_ary; 799173bfbcaSSatoshi Sahara 800173bfbcaSSatoshi Sahara foreach ($q['parsed_ary'] as $token) { 801173bfbcaSSatoshi Sahara if ($token[2] !== ':') continue; 802173bfbcaSSatoshi Sahara $body = substr($token, 3); 803173bfbcaSSatoshi Sahara 804173bfbcaSSatoshi Sahara switch (substr($token, 0, 3)) { 805173bfbcaSSatoshi Sahara case 'N+:': 806173bfbcaSSatoshi Sahara $q['ns'][] = $body; // for backward compatibility 807173bfbcaSSatoshi Sahara break; 808173bfbcaSSatoshi Sahara case 'N-:': 809173bfbcaSSatoshi Sahara $q['notns'][] = $body; // for backward compatibility 810173bfbcaSSatoshi Sahara break; 811173bfbcaSSatoshi Sahara case 'W_:': 812173bfbcaSSatoshi Sahara $q['words'][] = $body; 813173bfbcaSSatoshi Sahara break; 814173bfbcaSSatoshi Sahara case 'W-:': 815173bfbcaSSatoshi Sahara $q['words'][] = $body; 816173bfbcaSSatoshi Sahara $q['not'][] = $body; // for backward compatibility 817173bfbcaSSatoshi Sahara break; 818173bfbcaSSatoshi Sahara case 'W+:': 819173bfbcaSSatoshi Sahara $q['words'][] = $body; 820173bfbcaSSatoshi Sahara $q['highlight'][] = $body; 821173bfbcaSSatoshi Sahara $q['and'][] = $body; // for backward compatibility 822173bfbcaSSatoshi Sahara break; 823173bfbcaSSatoshi Sahara case 'P-:': 824173bfbcaSSatoshi Sahara $q['phrases'][] = $body; 825173bfbcaSSatoshi Sahara break; 826173bfbcaSSatoshi Sahara case 'P+:': 827173bfbcaSSatoshi Sahara $q['phrases'][] = $body; 828173bfbcaSSatoshi Sahara $q['highlight'][] = $body; 829173bfbcaSSatoshi Sahara break; 830173bfbcaSSatoshi Sahara } 831173bfbcaSSatoshi Sahara } 832173bfbcaSSatoshi Sahara foreach (['words', 'phrases', 'highlight', 'ns', 'notns', 'and', 'not'] as $key) { 833173bfbcaSSatoshi Sahara $q[$key] = empty($q[$key]) ? 
array() : array_values(array_unique($q[$key])); 834173bfbcaSSatoshi Sahara } 835173bfbcaSSatoshi Sahara 836173bfbcaSSatoshi Sahara return $q; 837173bfbcaSSatoshi Sahara } 838173bfbcaSSatoshi Sahara 839173bfbcaSSatoshi Sahara /** 840173bfbcaSSatoshi Sahara * Transforms given search term into intermediate representation 841173bfbcaSSatoshi Sahara * 842173bfbcaSSatoshi Sahara * This function is used in ft_queryParser() and not for general purpose use. 843173bfbcaSSatoshi Sahara * 844173bfbcaSSatoshi Sahara * @author Kazutaka Miyasaka <kazmiya@gmail.com> 845173bfbcaSSatoshi Sahara * 846*c31af4f3SSatoshi Sahara * @param Indexer $Indexer 847173bfbcaSSatoshi Sahara * @param string $term 848173bfbcaSSatoshi Sahara * @param bool $consider_asian 849173bfbcaSSatoshi Sahara * @param bool $phrase_mode 850173bfbcaSSatoshi Sahara * @return string 851173bfbcaSSatoshi Sahara */ 852173bfbcaSSatoshi Sahara public static function termParser($Indexer, $term, $consider_asian = true, $phrase_mode = false) 853173bfbcaSSatoshi Sahara { 854173bfbcaSSatoshi Sahara $parsed = ''; 855173bfbcaSSatoshi Sahara if ($consider_asian) { 856173bfbcaSSatoshi Sahara // successive asian characters need to be searched as a phrase 857173bfbcaSSatoshi Sahara $words = \dokuwiki\Utf8\Asian::splitAsianWords($term); 858173bfbcaSSatoshi Sahara foreach ($words as $word) { 859173bfbcaSSatoshi Sahara $phrase_mode = $phrase_mode ? 
true : \dokuwiki\Utf8\Asian::isAsianWords($word); 860173bfbcaSSatoshi Sahara $parsed .= static::termParser($Indexer, $word, false, $phrase_mode); 861173bfbcaSSatoshi Sahara } 862173bfbcaSSatoshi Sahara } else { 863173bfbcaSSatoshi Sahara $term_noparen = str_replace(['(',')'], ' ', $term); 864173bfbcaSSatoshi Sahara $words = $Indexer->tokenizer($term_noparen, true); 865173bfbcaSSatoshi Sahara 866173bfbcaSSatoshi Sahara // W_: no need to highlight 867173bfbcaSSatoshi Sahara if (empty($words)) { 868173bfbcaSSatoshi Sahara $parsed = '()'; // important: do not remove 869173bfbcaSSatoshi Sahara } elseif ($words[0] === $term) { 870173bfbcaSSatoshi Sahara $parsed = '(W+:'.$words[0].')'; 871173bfbcaSSatoshi Sahara } elseif ($phrase_mode) { 872173bfbcaSSatoshi Sahara $term_encoded = str_replace(['(',')'], ['OP','CP'], $term); 873173bfbcaSSatoshi Sahara $parsed = '((W_:'.implode(')(W_:', $words).')(P+:'.$term_encoded.'))'; 874173bfbcaSSatoshi Sahara } else { 875173bfbcaSSatoshi Sahara $parsed = '((W+:'.implode(')(W+:', $words).'))'; 876173bfbcaSSatoshi Sahara } 877173bfbcaSSatoshi Sahara } 878173bfbcaSSatoshi Sahara return $parsed; 879173bfbcaSSatoshi Sahara } 880173bfbcaSSatoshi Sahara 881173bfbcaSSatoshi Sahara /** 882173bfbcaSSatoshi Sahara * Recreate a search query string based on parsed parts, 883173bfbcaSSatoshi Sahara * doesn't support negated phrases and `OR` searches 884173bfbcaSSatoshi Sahara * 885173bfbcaSSatoshi Sahara * @param array $and 886173bfbcaSSatoshi Sahara * @param array $not 887173bfbcaSSatoshi Sahara * @param array $phrases 888173bfbcaSSatoshi Sahara * @param array $ns 889173bfbcaSSatoshi Sahara * @param array $notns 890173bfbcaSSatoshi Sahara * 891173bfbcaSSatoshi Sahara * @return string 892173bfbcaSSatoshi Sahara */ 893173bfbcaSSatoshi Sahara public static function queryUnparser_simple( 894173bfbcaSSatoshi Sahara array $and, array $not, array $phrases, array $ns, array $notns 895173bfbcaSSatoshi Sahara ) { 896173bfbcaSSatoshi Sahara $query = implode(' 
', $and); 897173bfbcaSSatoshi Sahara 898173bfbcaSSatoshi Sahara if (!empty($not)) { 899173bfbcaSSatoshi Sahara $query .= ' -' . implode(' -', $not); 900173bfbcaSSatoshi Sahara } 901173bfbcaSSatoshi Sahara if (!empty($phrases)) { 902173bfbcaSSatoshi Sahara $query .= ' "' . implode('" "', $phrases) . '"'; 903173bfbcaSSatoshi Sahara } 904173bfbcaSSatoshi Sahara if (!empty($ns)) { 905173bfbcaSSatoshi Sahara $query .= ' @' . implode(' @', $ns); 906173bfbcaSSatoshi Sahara } 907173bfbcaSSatoshi Sahara if (!empty($notns)) { 908173bfbcaSSatoshi Sahara $query .= ' ^' . implode(' ^', $notns); 909173bfbcaSSatoshi Sahara } 910173bfbcaSSatoshi Sahara return $query; 911173bfbcaSSatoshi Sahara } 912173bfbcaSSatoshi Sahara} 913