<?php
/**
 * DokuWiki fulltextsearch functions using the index
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author  Andreas Gohr <andi@splitbrain.org>
 */

  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
  require_once(DOKU_INC.'inc/indexer.php');


/**
 * The fulltext search
 *
 * Returns a list of matching documents for the given query.
 *
 * The query is split by ft_queryParser() into positive words, negative
 * words and quoted phrases. All positive words must be found in a
 * document, documents containing a negative word are dropped, and if
 * phrases were given the raw page text has to contain at least one of
 * them literally (case-insensitive).
 *
 * @param  string $query The search query as entered by the user
 * @return array  Matching documents: key is the page id, value is the
 *                summed score over all positive words; sorted by score,
 *                highest first. Empty array when nothing matches.
 */
function ft_pageSearch($query){
    $q = ft_queryParser($query);

    // Without at least one positive word there is no candidate set to
    // start from (a query made only of "-word" terms can not be answered)
    if(!count($q['and'])) return array();

    // lookup all words found in the query; words of quoted phrases have
    // already been added to $q['and'] by the parser
    $words  = array_merge($q['and'],$q['not']);
    $result = idx_lookup($words);

    // merge search results with query: one document list per positive word
    // NOTE(review): idx_lookup() is assumed to return word => (page id => score);
    // a missing entry is treated as "no documents" instead of raising a notice
    $and = array();
    foreach($q['and'] as $w){
        if(isset($result[$w]) && is_array($result[$w])){
            $and[] = $result[$w];
        }else{
            $and[] = array();
        }
    }

    // create a list of unwanted docs
    $not = array();
    foreach($q['not'] as $w){
        if(isset($result[$w]) && is_array($result[$w])){
            $not = array_merge($not,array_keys($result[$w]));
        }
    }

    // combine and words (a single list is returned unchanged)
    $docs = ft_resultCombine($and);
    if(!count($docs)) return array();

    // remove negative matches
    foreach($not as $n){
        unset($docs[$n]);
    }

    if(!count($docs)) return array();

    // handle phrases
    if(count($q['phrases'])){
        // build a regexp; the delimiter has to be passed to preg_quote()
        // or a phrase containing "/" would break the pattern
        $parts = array();
        foreach($q['phrases'] as $phrase){
            $parts[] = preg_quote(utf8_strtolower($phrase),'/');
        }
        $regex = '/('.join('|',$parts).')/usi';

        // check the source of all documents for the exact phrases
        foreach(array_keys($docs) as $id){
            $text = utf8_strtolower(rawWiki($id));
            // preg_match() returns 0 on no match and false on error,
            // both mean the document can not be confirmed as a hit
            if(!preg_match($regex,$text)){
                unset($docs[$id]); // no hit - remove
            }
        }
    }

    if(!count($docs)) return array();

    // if there are any hits left, sort them by count
    arsort($docs);

    return $docs;
}

/**
 * Combine found documents and sum up their scores
 *
 * This function is used to combine searched words with a logical
 * AND. Only documents available in all arrays are returned. The
 * returned score of a document is the sum of its scores in every
 * given array.
 *
 * based upon PEAR's PHP_Compat function for array_intersect_key()
 *
 * @param  array $args An array of page arrays (page id => score)
 * @return array Page id => summed score for all pages present in
 *               every given array; when only one array is given it is
 *               returned as is, when none is given the result is empty
 */
function ft_resultCombine($args){
    // reindex so the first list is reliably found at offset 0
    $args        = array_values($args);
    $array_count = count($args);
    if($array_count == 0) return array();

    $result = array();
    foreach($args[0] as $key => $score){
        for($i = 1; $i < $array_count; $i++){
            // a page missing from any list fails the AND - skip it entirely
            if(!is_array($args[$i]) || !array_key_exists($key,$args[$i])){
                continue 2;
            }
            $score += $args[$i][$key];
        }
        $result[$key] = $score;
    }
    return $result;
}

/**
 * Builds an array of search words from a query
 *
 * Quoted parts ("like this") are taken as phrases, words prefixed with
 * a minus sign are negative words, everything else is a positive word.
 * All words are run through idx_tokenizer() together with the stopword
 * list of the configured language.
 *
 * @todo   support OR and parentheses?
 *
 * @param  string $query The search query as entered by the user
 * @return array  with the keys
 *                'query'   - the unmodified query string
 *                'phrases' - list of phrases, without the surrounding quotes
 *                'and'     - list of tokens that have to be found
 *                            (includes the tokens of all phrases)
 *                'not'     - list of tokens that must not be found
 */
function ft_queryParser($query){
    global $conf;
    $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        // lines are passed on as returned by file(), i.e. including
        // their line endings
        $stopwords = file($swfile);
        if($stopwords === false) $stopwords = array();
    }else{
        $stopwords = array();
    }

    $q = array();
    $q['query']   = $query;
    $q['phrases'] = array();
    $q['and']     = array();
    $q['not']     = array();

    // handle phrase searches
    while(preg_match('/"(.*?)"/',$query,$match)){
        // an empty phrase ("") would later build a regexp matching everything
        if($match[1] !== ''){
            // store the phrase without its quotes, it is matched
            // literally against the page text
            $q['phrases'][] = $match[1];
            // append to the words collected so far instead of replacing them
            $q['and'] = array_merge($q['and'],idx_tokenizer($match[0],$stopwords));
        }
        // cut out the handled phrase; the blank keeps the words around
        // it from being glued together
        $query = preg_replace('/"(.*?)"/',' ',$query,1);
    }

    $words = explode(' ',$query);
    foreach($words as $w){
        // consecutive blanks produce empty entries
        if($w === '') continue;

        $token = idx_tokenizer($w,$stopwords);
        if(!count($token)) continue;

        if($w[0] == '-'){
            $q['not'] = array_merge($q['not'],$token);
        }else{
            $q['and'] = array_merge($q['and'],$token);
        }
    }

    return $q;
}
