<?php

/**
 * DokuWiki fulltextsearch functions using the index
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

use dokuwiki\Utf8\Asian;
use dokuwiki\Search\Indexer;
use dokuwiki\Extension\Event;
use dokuwiki\Utf8\Clean;
use dokuwiki\Utf8\PhpString;
use dokuwiki\Utf8\Sort;

/**
 * create snippets for the first few results only
 */
if (!defined('FT_SNIPPET_NUMBER')) define('FT_SNIPPET_NUMBER', 15);

/**
 * The fulltext search
 *
 * Returns a list of matching documents for the given query.
 *
 * The parameters are packed into an event data array and handed to
 * Event::createAndTrigger() together with '_ft_pageSearch', which performs
 * the actual search.
 *
 * @triggers SEARCH_QUERY_FULLPAGE
 *
 * @param string     $query     the search query
 * @param array      $highlight bound by reference into the event data; _ft_pageSearch() overwrites it
 *                              with the highlight terms returned by the query parser
 * @param string     $sort      'mtime' sorts by modification time, any other value sorts by hit count
 *                              (defaults to 'hits')
 * @param int|string $after     only show results with mtime after this date,
 *                              accepts timestamp or strtotime arguments
 * @param int|string $before    only show results with mtime before this date,
 *                              accepts timestamp or strtotime arguments
 *
 * @return array
 */
function ft_pageSearch($query, &$highlight, $sort = null, $after = null, $before = null)
{
    $data = [
        'query' => $query,
        'sort' => $sort ?? 'hits',
        'after' => $after,
        'before' => $before,
    ];
    // bound by reference so the terms written into the event data reach the caller
    $data['highlight'] =& $highlight;

    return Event::createAndTrigger('SEARCH_QUERY_FULLPAGE', $data, '_ft_pageSearch');
}

/**
 * Returns a list of matching documents for the given query
 *
 * The token list produced by ft_queryParser() is evaluated with a stack of
 * page arrays (page id => hit count): word, phrase and namespace tokens push
 * an array, AND/OR replace the two topmost arrays by their combination and
 * NOT replaces the topmost array by its complement against all indexed pages.
 * The remaining array is filtered (hidden pages, ACL, existence, time range)
 * and sorted.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Kazutaka Miyasaka <kazmiya@gmail.com>
 *
 * @param array $data event data (keys read: query, after, before, sort; key written: highlight)
 * @return array matching documents as page id => hit count
 */
function _ft_pageSearch(&$data)
{
    $Indexer = idx_get_indexer();

    // parse the given query
    $q = ft_queryParser($Indexer, $data['query']);
    $data['highlight'] = $q['highlight'];

    if (empty($q['parsed_ary'])) return [];

    // lookup all words found in the query
    $lookup = $Indexer->lookup($q['words']);

    // get all pages in this dokuwiki site (!: includes nonexistent pages)
    $pages_all = [];
    foreach ($Indexer->getPages() as $id) {
        $pages_all[$id] = 0; // base: 0 hit
    }

    // process the query
    $stack = [];
    foreach ($q['parsed_ary'] as $token) {
        switch (substr($token, 0, 3)) {
            case 'W+:':
            case 'W-:':
            case 'W_:': // word
                $word = substr($token, 3);
                if (isset($lookup[$word])) {
                    $stack[] = (array)$lookup[$word];
                }
                break;
            case 'P+:':
            case 'P-:': // phrase
                $phrase = substr($token, 3);
                // since phrases are always parsed as ((W1)(W2)...(P)),
                // the end($stack) always points at the pages that contain
                // all words in this phrase
                $pages = $stack ? end($stack) : [];
                $pages_matched = [];
                foreach (array_keys($pages) as $id) {
                    $evdata = [
                        'id' => $id,
                        'phrase' => $phrase,
                        'text' => rawWiki($id),
                    ];
                    $evt = new Event('FULLTEXT_PHRASE_MATCH', $evdata);
                    // default matching only runs when no handler prevented it
                    // or already reported a match
                    if ($evt->advise_before() && $evt->result !== true) {
                        $text = PhpString::strtolower($evdata['text']);
                        if (strpos($text, $phrase) !== false) {
                            $evt->result = true;
                        }
                    }
                    $evt->advise_after();
                    if ($evt->result === true) {
                        $pages_matched[$id] = 0; // phrase: always 0 hit
                    }
                }
                $stack[] = $pages_matched;
                break;
            case 'N+:':
            case 'N-:': // namespace
                // the prefix always ends in ':' and is therefore never empty
                $ns = cleanID(substr($token, 3)) . ':';
                $pages_matched = [];
                foreach (array_keys($pages_all) as $id) {
                    if (str_starts_with($id, $ns)) {
                        $pages_matched[$id] = 0; // namespace: always 0 hit
                    }
                }
                $stack[] = $pages_matched;
                break;
            case 'AND': // and operation
                $pages = array_splice($stack, -2);
                if ($pages === []) {
                    break;
                }
                $stack[] = ft_resultCombine($pages);
                break;
            case 'OR':  // or operation
                $pages = array_splice($stack, -2);
                if ($pages === []) {
                    break;
                }
                $stack[] = ft_resultUnite($pages);
                break;
            case 'NOT': // not operation (unary)
                // an empty stack yields null; complement against nothing instead
                $pages = array_pop($stack) ?? [];
                $stack[] = ft_resultComplement([$pages_all, $pages]);
                break;
        }
    }
    $docs = array_pop($stack);

    if (empty($docs)) return [];

    // check: settings, acls, existence
    foreach (array_keys($docs) as $id) {
        if (isHiddenPage($id) || auth_quickaclcheck($id) < AUTH_READ || !page_exists($id, '', false)) {
            unset($docs[$id]);
        }
    }

    $docs = _ft_filterResultsByTime($docs, $data['after'], $data['before']);

    if ($data['sort'] === 'mtime') {
        uksort($docs, 'ft_pagemtimesorter');
    } else {
        // sort docs by count
        uksort($docs, 'ft_pagesorter');
        arsort($docs);
    }

    return $docs;
}

/**
 * Removes pages that must not be listed from an index lookup result and sorts it
 *
 * Shared by ft_backlinks() and ft_mediause(). Nonexistent pages are always
 * removed. Hidden pages and pages without read permission are removed unless
 * $ignore_perms is true.
 *
 * @param array $result       page ids as returned by the indexer's lookupKey()
 * @param bool  $ignore_perms skip the hidden page and ACL checks
 * @return array the remaining page ids, sorted with Sort::sort()
 */
function _ft_filterLinkingPages($result, $ignore_perms)
{
    if ($result === []) return $result;

    foreach (array_keys($result) as $idx) {
        $page = $result[$idx];
        $denied = !$ignore_perms && (isHiddenPage($page) || auth_quickaclcheck($page) < AUTH_READ);
        if ($denied || !page_exists($page, '', false)) {
            unset($result[$idx]);
        }
    }

    Sort::sort($result);
    return $result;
}

/**
 * Returns the backlinks for a given page
 *
 * Uses the metadata index.
 *
 * @param string $id           The id for which links shall be returned
 * @param bool   $ignore_perms Ignore the fact that pages are hidden or read-protected
 * @return array The pages that contain links to the given page
 */
function ft_backlinks($id, $ignore_perms = false)
{
    $result = idx_get_indexer()->lookupKey('relation_references', $id);

    return _ft_filterLinkingPages($result, (bool) $ignore_perms);
}

/**
 * Returns the pages that use a given media file
 *
 * Uses the relation media metadata property and the metadata index.
 *
 * Note that before 2013-07-31 the second parameter was the maximum number of results and
 * permissions were ignored. That's why the parameter is checked to be explicitly set
 * to true (with type bool): a legacy caller passing a number must not disable the
 * permission checks by accident.
 *
 * @param string $id           The media id to look for
 * @param bool   $ignore_perms Ignore hidden pages and acls (optional, default: false)
 * @return array A list of pages that use the given media file
 */
function ft_mediause($id, $ignore_perms = false)
{
    $result = idx_get_indexer()->lookupKey('relation_media', $id);

    // strict comparison on purpose, see the note in the docblock
    return _ft_filterLinkingPages($result, $ignore_perms === true);
}


/**
 * Quicksearch for pagenames
 *
 * By default it only matches the pagename and ignores the
 * namespace. This can be changed with the second parameter.
 * The third parameter allows to search in titles as well.
 *
 * The function always returns titles as well
 *
 * @triggers SEARCH_QUERY_PAGELOOKUP
 * @author   Andreas Gohr <andi@splitbrain.org>
 * @author   Adrian Lang <lang@cosmocode.de>
 *
 * @param string     $id       page id
 * @param bool       $in_ns    match against namespace as well?
 * @param bool       $in_title search in title?
 * @param int|string $after    only show results with mtime after this date,
 *                             accepts timestamp or strtotime arguments
 * @param int|string $before   only show results with mtime before this date,
 *                             accepts timestamp or strtotime arguments
 *
 * @return string[]
 */
function ft_pageLookup($id, $in_ns = false, $in_title = false, $after = null, $before = null)
{
    $data = [
        'id' => $id,
        'in_ns' => $in_ns,
        'in_title' => $in_title,
        'after' => $after,
        'before' => $before,
        'has_titles' => true, // for plugin backward compatibility check
    ];

    return Event::createAndTrigger('SEARCH_QUERY_PAGELOOKUP', $data, '_ft_pageLookup');
}

/**
 * Returns list of pages as array(pageid => First Heading)
 *
 * The lookup string is run through ft_queryParser(); when the parsed query
 * contains an included or excluded namespace, the first one of each kind is
 * used to restrict the result and the highlight terms become the lookup string.
 *
 * @param array &$data event data (keys read: id, in_ns, in_title, after, before)
 * @return string[]
 */
function _ft_pageLookup(&$data)
{
    $Indexer = idx_get_indexer();

    // split out original parameters
    $id = $data['id'];
    $in_ns = $data['in_ns'];
    $in_title = $data['in_title'];

    $parsedQuery = ft_queryParser($Indexer, $id);
    if (count($parsedQuery['ns']) > 0) {
        $ns = cleanID($parsedQuery['ns'][0]) . ':';
        $id = implode(' ', $parsedQuery['highlight']);
    }
    if (count($parsedQuery['notns']) > 0) {
        $notns = cleanID($parsedQuery['notns'][0]) . ':';
        $id = implode(' ', $parsedQuery['highlight']);
    }

    $cleaned = cleanID($id);
    $page_idx = $Indexer->getPages();

    $pages = [];
    if ($id !== '' && $cleaned !== '') {
        foreach ($page_idx as $p_id) {
            $haystack = $in_ns ? $p_id : noNSorNS($p_id);
            if (str_contains($haystack, $cleaned)) {
                $pages[$p_id] ??= p_get_first_heading($p_id, METADATA_DONT_RENDER);
            }
        }
        if ($in_title) {
            foreach ($Indexer->lookupKey('title', $id, '_ft_pageLookupTitleCompare') as $p_id) {
                $pages[$p_id] ??= p_get_first_heading($p_id, METADATA_DONT_RENDER);
            }
        }
    }

    // keep only pages inside the requested namespace
    if (isset($ns)) {
        foreach (array_keys($pages) as $p_id) {
            if (!str_starts_with($p_id, $ns)) {
                unset($pages[$p_id]);
            }
        }
    }
    // drop pages inside the excluded namespace
    if (isset($notns)) {
        foreach (array_keys($pages) as $p_id) {
            if (str_starts_with($p_id, $notns)) {
                unset($pages[$p_id]);
            }
        }
    }

    // discard hidden pages
    // discard nonexistent pages
    // check ACL permissions
    foreach (array_keys($pages) as $idx) {
        if (
            !isVisiblePage($idx) || !page_exists($idx) ||
            auth_quickaclcheck($idx) < AUTH_READ
        ) {
            unset($pages[$idx]);
        }
    }

    $pages = _ft_filterResultsByTime($pages, $data['after'], $data['before']);

    uksort($pages, 'ft_pagesorter');
    return $pages;
}


/**
 * Removes results whose page file was modified outside the given time range
 *
 * Integer bounds are used as they are, other non-null bounds are converted with
 * strtotime(). A bound that is empty or cannot be parsed is ignored.
 *
 * @param array      $results search results in the form pageid => value
 * @param int|string $after   only returns results with mtime after this date,
 *                            accepts timestamp or strtotime arguments
 * @param int|string $before  only returns results with mtime before this date,
 *                            accepts timestamp or strtotime arguments
 *
 * @return array
 */
function _ft_filterResultsByTime(array $results, $after, $before)
{
    if (!$after && !$before) {
        return $results;
    }

    // a missing bound stays null: passing null to strtotime() is deprecated since PHP 8.1
    if ($after !== null && !is_int($after)) {
        $after = strtotime($after);
    }
    if ($before !== null && !is_int($before)) {
        $before = strtotime($before);
    }

    foreach (array_keys($results) as $id) {
        $mTime = filemtime(wikiFN($id));
        $tooOld = $after && $after > $mTime;
        $tooNew = $before && $before < $mTime;
        if ($tooOld || $tooNew) {
            unset($results[$id]);
        }
    }

    return $results;
}

/**
 * Tiny helper function for comparing the searched title with the title
 * from the search index. This function is a wrapper around stripos with
 * adapted argument order and return value.
 *
 * Non-ASCII search strings are compared after lowercasing both sides with
 * PhpString::strtolower().
 *
 * @param string $search searched title
 * @param string $title  title from index
 * @return bool true if $search is contained in $title, ignoring case
 */
function _ft_pageLookupTitleCompare($search, $title)
{
    if (Clean::isASCII($search)) {
        return stripos($title, $search) !== false;
    }

    $pos = PhpString::strpos(
        PhpString::strtolower($title),
        PhpString::strtolower($search)
    );
    return $pos !== false;
}

/**
 * Sort pages based on their namespace level first, then on their string
 * values. This makes higher hierarchy pages rank higher than lower hierarchy
 * pages.
 *
 * @param string $a
 * @param string $b
 * @return int Returns < 0 if $a is less than $b; > 0 if $a is greater than $b, and 0 if they are equal.
 */
function ft_pagesorter($a, $b)
{
    // number of ':' separated parts of each id
    $ac = count(explode(':', $a));
    $bc = count(explode(':', $b));

    return ($ac <=> $bc) ?: Sort::strcmp($a, $b);
}

/**
 * Sort pages by their mtime, from newest to oldest
 *
 * @param string $a
 * @param string $b
 *
 * @return int Returns < 0 if $a is newer than $b, > 0 if $b is newer than $a and 0 if they are of the same age
 */
function ft_pagemtimesorter($a, $b)
{
    return filemtime(wikiFN($b)) <=> filemtime(wikiFN($a));
}

/**
 * Creates a snippet extract
 *
 * Looks for up to four places in the page text where highlight terms occur
 * (preferring spots where three or two of them are close together), cuts out
 * the surrounding context and wraps the terms in
 * <strong class="search_hit">. All other text is escaped with hsc().
 *
 * @author   Andreas Gohr <andi@splitbrain.org>
 * @triggers FULLTEXT_SNIPPET_CREATE
 *
 * @param string $id        page id
 * @param array  $highlight terms to highlight
 * @return mixed the 'snippet' entry of the event data
 */
function ft_snippet($id, $highlight)
{
    $text = rawWiki($id);
    // remove soft-hyphens
    $text = str_replace("\xC2\xAD", '', $text);

    $evdata = [
        'id' => $id,
        'text' => &$text,
        'highlight' => &$highlight,
        'snippet' => '',
    ];

    $evt = new Event('FULLTEXT_SNIPPET_CREATE', $evdata);
    if ($evt->advise_before()) {
        $match = [];
        $snippets = [];
        $utf8_offset = 0;
        $offset = 0;
        $end = 0;
        $len = PhpString::strlen($text);

        // build a regexp from the phrases to highlight
        $re1 = '(' .
            implode(
                '|',
                array_map(
                    'ft_snippet_re_preprocess',
                    array_map(
                        'preg_quote_cb',
                        array_filter((array) $highlight)
                    )
                )
            ) .
            ')';
        $re2 = "$re1.{0,75}(?!\\1)$re1";
        $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1";

        // tried in this order: three terms close together, two terms, a single term
        $patterns = [
            '/' . $re3 . '/iu',
            '/' . $re2 . '/iu',
            '/' . $re1 . '/iu',
        ];

        for ($round = 0; $round < 4; $round++) {
            $found = false;
            foreach ($patterns as $pattern) {
                if (preg_match($pattern, $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
                    $found = true;
                    break;
                }
            }
            if (!$found) {
                break;
            }

            [$str, $idx] = $match[0];

            // convert $idx (a byte offset) into a utf8 character offset
            $utf8_idx = PhpString::strlen(substr($text, 0, $idx));
            $utf8_len = PhpString::strlen($str);

            // establish context, 100 characters surrounding the match string
            // first look to see if we can go 100 either side,
            // then drop to 50 adding any excess if the other side can't go to 50,
            $pre = min($utf8_idx - $utf8_offset, 100);
            $post = min($len - $utf8_idx - $utf8_len, 100);

            if ($pre > 50 && $post > 50) {
                $pre = 50;
                $post = 50;
            } elseif ($pre > 50) {
                $pre = min($pre, 100 - $post);
            } elseif ($post > 50) {
                $post = min($post, 100 - $pre);
            } elseif ($offset == 0) {
                // both are less than 50, means the context is the whole string
                // make it so and break out of this loop - there is no need for the
                // complex snippet calculations
                $snippets = [$text];
                break;
            }

            // establish context start and end points, try to append to previous
            // context if possible
            $start = $utf8_idx - $pre;
            $append = ($start < $end) ? $end : false;  // still the end of the previous context snippet
            $end = $utf8_idx + $utf8_len + $post;      // now set it to the end of this context

            if ($append) {
                $snippets[count($snippets) - 1] .= PhpString::substr($text, $append, $end - $append);
            } else {
                $snippets[] = PhpString::substr($text, $start, $end - $start);
            }

            // set $offset for next match attempt
            // continue matching after the current match
            // if the current match is not the longest possible match starting at the current offset
            // this prevents further matching of this snippet but for possible matches of length
            // smaller than match length + context (at least 50 characters) this match is part of the context
            $utf8_offset = $utf8_idx + $utf8_len;
            $offset = $idx + strlen(PhpString::substr($text, $utf8_idx, $utf8_len));
            $offset = Clean::correctIdx($text, $offset);
        }

        // mark the terms with a control character first so that the markup
        // can be added after the text has been escaped
        $m = "\1";
        $snippets = preg_replace('/' . $re1 . '/iu', $m . '$1' . $m, $snippets);
        $snippet = preg_replace(
            '/' . $m . '([^' . $m . ']*?)' . $m . '/iu',
            '<strong class="search_hit">$1</strong>',
            hsc(implode('... ', $snippets))
        );

        $evdata['snippet'] = $snippet;
    }
    $evt->advise_after();
    unset($evt);

    return $evdata['snippet'];
}

/**
 * Wraps a search term in regex boundary checks.
 *
 * A leading or trailing escaped asterisk ('\*') is removed and suppresses
 * the boundary check on that side. Terms that consist of nothing but
 * boundary checks are reduced to an empty string.
 *
 * @param string $term
 * @return string
 */
function ft_snippet_re_preprocess($term)
{
    // do not process asian terms where word boundaries are not explicit
    if (Asian::isAsianWords($term)) return $term;

    if (UTF8_PROPERTYSUPPORT) {
        // unicode word boundaries
        // see http://stackoverflow.com/a/2449017/172068
        $BL = '(?<!\pL)';
        $BR = '(?!\pL)';
    } else {
        // not as correct as above, but at least won't break
        $BL = '\b';
        $BR = '\b';
    }

    $term = str_starts_with($term, '\\*') ? substr($term, 2) : $BL . $term;
    $term = str_ends_with($term, '\\*') ? substr($term, 0, -2) : $term . $BR;

    if (in_array($term, [$BL, $BR, $BL . $BR], true)) {
        return '';
    }
    return $term;
}

/**
 * Combine found documents and sum up their scores
 *
 * This function is used to combine searched words with a logical
 * AND. Only documents available in all arrays are returned.
 *
 * based upon PEAR's PHP_Compat function for array_intersect_key()
 *
 * @param array $args An array of page arrays
 * @return array
 */
function ft_resultCombine($args)
{
    $array_count = count($args);
    if ($array_count === 0) {
        return [];
    }
    if ($array_count === 1) {
        return $args[0];
    }

    $result = [];
    foreach ($args[0] as $key => $value) {
        $sum = $value;
        for ($i = 1; $i < $array_count; $i++) {
            if (!isset($args[$i][$key])) {
                continue 2; // missing in one array: not part of the result
            }
            $sum += $args[$i][$key];
        }
        $result[$key] = $sum;
    }
    return $result;
}

/**
 * Unites found documents and sum up their scores
 *
 * based upon ft_resultCombine() function
 *
 * @param array $args An array of page arrays
 * @return array
 *
 * @author Kazutaka Miyasaka <kazmiya@gmail.com>
 */
function ft_resultUnite($args)
{
    $array_count = count($args);
    if ($array_count === 0) {
        return [];
    }

    $result = $args[0];
    for ($i = 1; $i < $array_count; $i++) {
        foreach ($args[$i] as $id => $hits) {
            if (isset($result[$id])) {
                $result[$id] += $hits;
            } else {
                $result[$id] = $hits;
            }
        }
    }
    return $result;
}

/**
 * Computes the difference of documents using page id for comparison
 *
 * nearly identical to PHP5's array_diff_key()
 *
 * @param array $args An array of page arrays
 * @return array
 *
 * @author Kazutaka Miyasaka <kazmiya@gmail.com>
 */
function ft_resultComplement($args)
{
    $array_count = count($args);
    if ($array_count === 0) {
        return [];
    }

    $result = $args[0];
    foreach (array_keys($result) as $id) {
        for ($i = 1; $i < $array_count; $i++) {
            if (isset($args[$i][$id])) {
                unset($result[$id]);
                break; // already removed, no need to check the remaining arrays
            }
        }
    }
    return $result;
}

/**
 * Parses a search query and builds an array of search formulas
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @author Kazutaka Miyasaka <kazmiya@gmail.com>
 *
 * @param Indexer $Indexer
 * @param string $query search query
 * @return array of search formulas
 */
function ft_queryParser($Indexer, $query)
{
    /**
     * parse a search query and transform it into intermediate representation
     *
     * in a search query, you can use the following expressions:
     *
     *   words:
     *     include
     *     -exclude
     *   phrases:
     *     "phrase to be included"
     *     -"phrase you want to exclude"
     *   namespaces:
     *     @include:namespace (or ns:include:namespace)
     *     ^exclude:namespace (or -ns:exclude:namespace)
     *   groups:
     *     ()
     *     -()
     *   operators:
     *     and ('and' is the default operator: you can always omit this)
     *     or  (or pipe symbol '|', lower precedence than 'and')
     *
     * e.g. a query [ aa "bb cc" @dd:ee ] means "search pages which contain
     *      a word 'aa', a phrase 'bb cc' and are within a namespace 'dd:ee'".
     *      this query is equivalent to [ -(-aa or -"bb cc" or -ns:dd:ee) ]
     *      as long as you don't mind hit counts.
     *
     * intermediate representation consists of the following parts:
     *
     *   ( )           - group
     *   AND           - logical and
     *   OR            - logical or
     *   NOT           - logical not
     *   W+:, W-:, W_: - word      (underscore: no need to highlight)
     *   P+:, P-:      - phrase    (minus sign: logically in NOT group)
     *   N+:, N-:      - namespace
     */
    $parsed_query = '';
    $parens_level = 0;
    $terms = preg_split(
        '/(-?".*?")/u',
        PhpString::strtolower($query),
        -1,
        PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
    );

    foreach ($terms as $term) {
        $parsed = '';
        if (preg_match('/^(-?)"(.+)"$/u', $term, $matches)) {
            // phrase-include and phrase-exclude
            $not = $matches[1] ? 'NOT' : '';
            $parsed = $not .
ft_termParser($Indexer, $matches[2], false, true); 753f5eb7cf0SAndreas Gohr } else { 754865c2687SKazutaka Miyasaka // fix incomplete phrase 755865c2687SKazutaka Miyasaka $term = str_replace('"', ' ', $term); 756865c2687SKazutaka Miyasaka 757865c2687SKazutaka Miyasaka // fix parentheses 758865c2687SKazutaka Miyasaka $term = str_replace(')', ' ) ', $term); 759865c2687SKazutaka Miyasaka $term = str_replace('(', ' ( ', $term); 760865c2687SKazutaka Miyasaka $term = str_replace('- (', ' -(', $term); 761865c2687SKazutaka Miyasaka 7627871d415SKazutaka Miyasaka // treat pipe symbols as 'OR' operators 7637871d415SKazutaka Miyasaka $term = str_replace('|', ' or ', $term); 7647871d415SKazutaka Miyasaka 765865c2687SKazutaka Miyasaka // treat ideographic spaces (U+3000) as search term separators 766865c2687SKazutaka Miyasaka // FIXME: some more separators? 767865c2687SKazutaka Miyasaka $term = preg_replace('/[ \x{3000}]+/u', ' ', $term); 768865c2687SKazutaka Miyasaka $term = trim($term); 769865c2687SKazutaka Miyasaka if ($term === '') continue; 770865c2687SKazutaka Miyasaka 771865c2687SKazutaka Miyasaka $tokens = explode(' ', $term); 772865c2687SKazutaka Miyasaka foreach ($tokens as $token) { 773865c2687SKazutaka Miyasaka if ($token === '(') { 774865c2687SKazutaka Miyasaka // parenthesis-include-open 775865c2687SKazutaka Miyasaka $parsed .= '('; 776865c2687SKazutaka Miyasaka ++$parens_level; 777865c2687SKazutaka Miyasaka } elseif ($token === '-(') { 778865c2687SKazutaka Miyasaka // parenthesis-exclude-open 779865c2687SKazutaka Miyasaka $parsed .= 'NOT('; 780865c2687SKazutaka Miyasaka ++$parens_level; 781865c2687SKazutaka Miyasaka } elseif ($token === ')') { 782865c2687SKazutaka Miyasaka // parenthesis-any-close 783865c2687SKazutaka Miyasaka if ($parens_level === 0) continue; 784865c2687SKazutaka Miyasaka $parsed .= ')'; 785865c2687SKazutaka Miyasaka $parens_level--; 786865c2687SKazutaka Miyasaka } elseif ($token === 'and') { 787865c2687SKazutaka Miyasaka // logical-and (do 
nothing) 788865c2687SKazutaka Miyasaka } elseif ($token === 'or') { 789865c2687SKazutaka Miyasaka // logical-or 790865c2687SKazutaka Miyasaka $parsed .= 'OR'; 791865c2687SKazutaka Miyasaka } elseif (preg_match('/^(?:\^|-ns:)(.+)$/u', $token, $matches)) { 792865c2687SKazutaka Miyasaka // namespace-exclude 7932f502d70SKazutaka Miyasaka $parsed .= 'NOT(N+:' . $matches[1] . ')'; 794865c2687SKazutaka Miyasaka } elseif (preg_match('/^(?:@|ns:)(.+)$/u', $token, $matches)) { 795865c2687SKazutaka Miyasaka // namespace-include 7962f502d70SKazutaka Miyasaka $parsed .= '(N+:' . $matches[1] . ')'; 797865c2687SKazutaka Miyasaka } elseif (preg_match('/^-(.+)$/', $token, $matches)) { 798865c2687SKazutaka Miyasaka // word-exclude 7999b41be24STom N Harris $parsed .= 'NOT(' . ft_termParser($Indexer, $matches[1]) . ')'; 800865c2687SKazutaka Miyasaka } else { 801865c2687SKazutaka Miyasaka // word-include 8029b41be24STom N Harris $parsed .= ft_termParser($Indexer, $token); 803865c2687SKazutaka Miyasaka } 804865c2687SKazutaka Miyasaka } 805865c2687SKazutaka Miyasaka } 806865c2687SKazutaka Miyasaka $parsed_query .= $parsed; 807f5eb7cf0SAndreas Gohr } 808f5eb7cf0SAndreas Gohr 809865c2687SKazutaka Miyasaka // cleanup (very sensitive) 810865c2687SKazutaka Miyasaka $parsed_query .= str_repeat(')', $parens_level); 811865c2687SKazutaka Miyasaka do { 812865c2687SKazutaka Miyasaka $parsed_query_old = $parsed_query; 813865c2687SKazutaka Miyasaka $parsed_query = preg_replace('/(NOT)?\(\)/u', '', $parsed_query); 814865c2687SKazutaka Miyasaka } while ($parsed_query !== $parsed_query_old); 815865c2687SKazutaka Miyasaka $parsed_query = preg_replace('/(NOT|OR)+\)/u', ')', $parsed_query); 816865c2687SKazutaka Miyasaka $parsed_query = preg_replace('/(OR)+/u', 'OR', $parsed_query); 817865c2687SKazutaka Miyasaka $parsed_query = preg_replace('/\(OR/u', '(', $parsed_query); 818865c2687SKazutaka Miyasaka $parsed_query = preg_replace('/^OR|OR$/u', '', $parsed_query); 819865c2687SKazutaka Miyasaka $parsed_query 
= preg_replace('/\)(NOT)?\(/u', ')AND$1(', $parsed_query); 820865c2687SKazutaka Miyasaka 8212f502d70SKazutaka Miyasaka // adjustment: make highlightings right 8222f502d70SKazutaka Miyasaka $parens_level = 0; 82324870174SAndreas Gohr $notgrp_levels = []; 8242f502d70SKazutaka Miyasaka $parsed_query_new = ''; 8252f502d70SKazutaka Miyasaka $tokens = preg_split('/(NOT\(|[()])/u', $parsed_query, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY); 8262f502d70SKazutaka Miyasaka foreach ($tokens as $token) { 8272f502d70SKazutaka Miyasaka if ($token === 'NOT(') { 8282f502d70SKazutaka Miyasaka $notgrp_levels[] = ++$parens_level; 8292f502d70SKazutaka Miyasaka } elseif ($token === '(') { 8302f502d70SKazutaka Miyasaka ++$parens_level; 8312f502d70SKazutaka Miyasaka } elseif ($token === ')') { 8322f502d70SKazutaka Miyasaka if ($parens_level-- === end($notgrp_levels)) array_pop($notgrp_levels); 8332f502d70SKazutaka Miyasaka } elseif (count($notgrp_levels) % 2 === 1) { 8342f502d70SKazutaka Miyasaka // turn highlight-flag off if terms are logically in "NOT" group 8352f502d70SKazutaka Miyasaka $token = preg_replace('/([WPN])\+\:/u', '$1-:', $token); 8362f502d70SKazutaka Miyasaka } 8372f502d70SKazutaka Miyasaka $parsed_query_new .= $token; 8382f502d70SKazutaka Miyasaka } 8392f502d70SKazutaka Miyasaka $parsed_query = $parsed_query_new; 8402f502d70SKazutaka Miyasaka 841865c2687SKazutaka Miyasaka /** 842865c2687SKazutaka Miyasaka * convert infix notation string into postfix (Reverse Polish notation) array 843865c2687SKazutaka Miyasaka * by Shunting-yard algorithm 844865c2687SKazutaka Miyasaka * 845865c2687SKazutaka Miyasaka * see: http://en.wikipedia.org/wiki/Reverse_Polish_notation 846865c2687SKazutaka Miyasaka * see: http://en.wikipedia.org/wiki/Shunting-yard_algorithm 847865c2687SKazutaka Miyasaka */ 84824870174SAndreas Gohr $parsed_ary = []; 84924870174SAndreas Gohr $ope_stack = []; 85024870174SAndreas Gohr $ope_precedence = [')' => 1, 'OR' => 2, 'AND' => 3, 'NOT' => 4, '(' => 5]; 
851865c2687SKazutaka Miyasaka $ope_regex = '/([()]|OR|AND|NOT)/u'; 852865c2687SKazutaka Miyasaka 853865c2687SKazutaka Miyasaka $tokens = preg_split($ope_regex, $parsed_query, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY); 854865c2687SKazutaka Miyasaka foreach ($tokens as $token) { 855865c2687SKazutaka Miyasaka if (preg_match($ope_regex, $token)) { 856865c2687SKazutaka Miyasaka // operator 857865c2687SKazutaka Miyasaka $last_ope = end($ope_stack); 85867d812e0SMarius van Witzenburg while ($last_ope !== false && $ope_precedence[$token] <= $ope_precedence[$last_ope] && $last_ope != '(') { 859865c2687SKazutaka Miyasaka $parsed_ary[] = array_pop($ope_stack); 860865c2687SKazutaka Miyasaka $last_ope = end($ope_stack); 861865c2687SKazutaka Miyasaka } 862865c2687SKazutaka Miyasaka if ($token == ')') { 863865c2687SKazutaka Miyasaka array_pop($ope_stack); // this array_pop always deletes '(' 864865c2687SKazutaka Miyasaka } else { 865865c2687SKazutaka Miyasaka $ope_stack[] = $token; 866865c2687SKazutaka Miyasaka } 867865c2687SKazutaka Miyasaka } else { 868865c2687SKazutaka Miyasaka // operand 86924870174SAndreas Gohr $token_decoded = str_replace(['OP', 'CP'], ['(', ')'], $token); 870865c2687SKazutaka Miyasaka $parsed_ary[] = $token_decoded; 871865c2687SKazutaka Miyasaka } 872865c2687SKazutaka Miyasaka } 87324870174SAndreas Gohr $parsed_ary = array_values([...$parsed_ary, ...array_reverse($ope_stack)]); 874865c2687SKazutaka Miyasaka 875865c2687SKazutaka Miyasaka // cleanup: each double "NOT" in RPN array actually does nothing 876865c2687SKazutaka Miyasaka $parsed_ary_count = count($parsed_ary); 877865c2687SKazutaka Miyasaka for ($i = 1; $i < $parsed_ary_count; ++$i) { 878865c2687SKazutaka Miyasaka if ($parsed_ary[$i] === 'NOT' && $parsed_ary[$i - 1] === 'NOT') { 879865c2687SKazutaka Miyasaka unset($parsed_ary[$i], $parsed_ary[$i - 1]); 880865c2687SKazutaka Miyasaka } 881865c2687SKazutaka Miyasaka } 882865c2687SKazutaka Miyasaka $parsed_ary = array_values($parsed_ary); 
883865c2687SKazutaka Miyasaka 884865c2687SKazutaka Miyasaka // build return value 88524870174SAndreas Gohr $q = []; 886f5eb7cf0SAndreas Gohr $q['query'] = $query; 887865c2687SKazutaka Miyasaka $q['parsed_str'] = $parsed_query; 888865c2687SKazutaka Miyasaka $q['parsed_ary'] = $parsed_ary; 889f5eb7cf0SAndreas Gohr 890865c2687SKazutaka Miyasaka foreach ($q['parsed_ary'] as $token) { 89140f2b82eSAndreas Gohr if (strlen($token) < 3 || $token[2] !== ':') continue; 892865c2687SKazutaka Miyasaka $body = substr($token, 3); 893865c2687SKazutaka Miyasaka 894865c2687SKazutaka Miyasaka switch (substr($token, 0, 3)) { 8952f502d70SKazutaka Miyasaka case 'N+:': 896865c2687SKazutaka Miyasaka $q['ns'][] = $body; // for backward compatibility 897865c2687SKazutaka Miyasaka break; 8982f502d70SKazutaka Miyasaka case 'N-:': 8992f502d70SKazutaka Miyasaka $q['notns'][] = $body; // for backward compatibility 9002f502d70SKazutaka Miyasaka break; 9012f502d70SKazutaka Miyasaka case 'W_:': 9022f502d70SKazutaka Miyasaka $q['words'][] = $body; 9032f502d70SKazutaka Miyasaka break; 904865c2687SKazutaka Miyasaka case 'W-:': 905865c2687SKazutaka Miyasaka $q['words'][] = $body; 9062f502d70SKazutaka Miyasaka $q['not'][] = $body; // for backward compatibility 907865c2687SKazutaka Miyasaka break; 908865c2687SKazutaka Miyasaka case 'W+:': 909865c2687SKazutaka Miyasaka $q['words'][] = $body; 9102237b4faSAndreas Gohr $q['highlight'][] = $body; 9112f502d70SKazutaka Miyasaka $q['and'][] = $body; // for backward compatibility 912865c2687SKazutaka Miyasaka break; 9132f502d70SKazutaka Miyasaka case 'P-:': 9142f502d70SKazutaka Miyasaka $q['phrases'][] = $body; 9152f502d70SKazutaka Miyasaka break; 9162f502d70SKazutaka Miyasaka case 'P+:': 917865c2687SKazutaka Miyasaka $q['phrases'][] = $body; 9182237b4faSAndreas Gohr $q['highlight'][] = $body; 919865c2687SKazutaka Miyasaka break; 920865c2687SKazutaka Miyasaka } 921865c2687SKazutaka Miyasaka } 92224870174SAndreas Gohr foreach (['words', 'phrases', 'highlight', 
'ns', 'notns', 'and', 'not'] as $key) { 92324870174SAndreas Gohr $q[$key] = empty($q[$key]) ? [] : array_values(array_unique($q[$key])); 924f5eb7cf0SAndreas Gohr } 925f5eb7cf0SAndreas Gohr 926f5eb7cf0SAndreas Gohr return $q; 927f5eb7cf0SAndreas Gohr} 928f5eb7cf0SAndreas Gohr 929865c2687SKazutaka Miyasaka/** 930865c2687SKazutaka Miyasaka * Transforms given search term into intermediate representation 931865c2687SKazutaka Miyasaka * 932865c2687SKazutaka Miyasaka * This function is used in ft_queryParser() and not for general purpose use. 933865c2687SKazutaka Miyasaka * 934865c2687SKazutaka Miyasaka * @author Kazutaka Miyasaka <kazmiya@gmail.com> 93542ea7f44SGerrit Uitslag * 93624870174SAndreas Gohr * @param Indexer $Indexer 93742ea7f44SGerrit Uitslag * @param string $term 93842ea7f44SGerrit Uitslag * @param bool $consider_asian 93942ea7f44SGerrit Uitslag * @param bool $phrase_mode 94042ea7f44SGerrit Uitslag * @return string 941865c2687SKazutaka Miyasaka */ 942d868eb89SAndreas Gohrfunction ft_termParser($Indexer, $term, $consider_asian = true, $phrase_mode = false) 943d868eb89SAndreas Gohr{ 944865c2687SKazutaka Miyasaka $parsed = ''; 945865c2687SKazutaka Miyasaka if ($consider_asian) { 946865c2687SKazutaka Miyasaka // successive asian characters need to be searched as a phrase 94724870174SAndreas Gohr $words = Asian::splitAsianWords($term); 948865c2687SKazutaka Miyasaka foreach ($words as $word) { 94924870174SAndreas Gohr $phrase_mode = $phrase_mode ? 
true : Asian::isAsianWords($word); 9509b41be24STom N Harris $parsed .= ft_termParser($Indexer, $word, false, $phrase_mode); 951865c2687SKazutaka Miyasaka } 952865c2687SKazutaka Miyasaka } else { 95324870174SAndreas Gohr $term_noparen = str_replace(['(', ')'], ' ', $term); 9549b41be24STom N Harris $words = $Indexer->tokenizer($term_noparen, true); 955865c2687SKazutaka Miyasaka 9562f502d70SKazutaka Miyasaka // W_: no need to highlight 957865c2687SKazutaka Miyasaka if (empty($words)) { 958865c2687SKazutaka Miyasaka $parsed = '()'; // important: do not remove 959865c2687SKazutaka Miyasaka } elseif ($words[0] === $term) { 960865c2687SKazutaka Miyasaka $parsed = '(W+:' . $words[0] . ')'; 961865c2687SKazutaka Miyasaka } elseif ($phrase_mode) { 96224870174SAndreas Gohr $term_encoded = str_replace(['(', ')'], ['OP', 'CP'], $term); 9632f502d70SKazutaka Miyasaka $parsed = '((W_:' . implode(')(W_:', $words) . ')(P+:' . $term_encoded . '))'; 964865c2687SKazutaka Miyasaka } else { 965865c2687SKazutaka Miyasaka $parsed = '((W+:' . implode(')(W+:', $words) . 
'))'; 966865c2687SKazutaka Miyasaka } 967865c2687SKazutaka Miyasaka } 968865c2687SKazutaka Miyasaka return $parsed; 969865c2687SKazutaka Miyasaka} 970865c2687SKazutaka Miyasaka 97144156e11SMichael Große/** 97244156e11SMichael Große * Recreate a search query string based on parsed parts, doesn't support negated phrases and `OR` searches 97344156e11SMichael Große * 97444156e11SMichael Große * @param array $and 97544156e11SMichael Große * @param array $not 97644156e11SMichael Große * @param array $phrases 97744156e11SMichael Große * @param array $ns 97844156e11SMichael Große * @param array $notns 97944156e11SMichael Große * 98044156e11SMichael Große * @return string 98144156e11SMichael Große */ 982d868eb89SAndreas Gohrfunction ft_queryUnparser_simple(array $and, array $not, array $phrases, array $ns, array $notns) 983d868eb89SAndreas Gohr{ 98444156e11SMichael Große $query = implode(' ', $and); 98524870174SAndreas Gohr if ($not !== []) { 98644156e11SMichael Große $query .= ' -' . implode(' -', $not); 98744156e11SMichael Große } 98844156e11SMichael Große 98924870174SAndreas Gohr if ($phrases !== []) { 99044156e11SMichael Große $query .= ' "' . implode('" "', $phrases) . '"'; 99144156e11SMichael Große } 99244156e11SMichael Große 99324870174SAndreas Gohr if ($ns !== []) { 99444156e11SMichael Große $query .= ' @' . implode(' @', $ns); 99544156e11SMichael Große } 99644156e11SMichael Große 99724870174SAndreas Gohr if ($notns !== []) { 99844156e11SMichael Große $query .= ' ^' . implode(' ^', $notns); 99944156e11SMichael Große } 100044156e11SMichael Große 100144156e11SMichael Große return $query; 100244156e11SMichael Große} 100344156e11SMichael Große 1004e3776c06SMichael Hamann//Setup VIM: ex: et ts=4 : 1005