<?php
/**
 * DokuWiki fulltextsearch functions using the index
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author  Andreas Gohr <andi@splitbrain.org>
 */

  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
  require_once(DOKU_INC.'inc/indexer.php');


/**
 * The fulltext search
 *
 * Returns a list of matching documents for the given query.
 *
 * The query is parsed by ft_queryParser(). Every required word must be
 * found in a document, documents containing an excluded word are dropped,
 * and - if the query contains quoted phrases - only documents whose raw
 * text contains at least one of the phrases are kept.
 *
 * @param  string $query the search query as entered by the user
 * @return array  page id => summed score, highest score first; an empty
 *                array when nothing matches or the query contains no
 *                required words
 */
function ft_pageSearch($query){
    $q = ft_queryParser($query);

    // a query without any required word (empty, or only "-word" terms)
    // can not match anything
    if(!count($q['and'])) return array();

    // lookup all words found in the query; phrase words are already part
    // of $q['and'], so they need no separate handling here
    // NOTE(review): idx_lookup() is assumed to return, for each word, an
    // array of page id => hit count - confirm against inc/indexer.php
    $result = idx_lookup(array_merge($q['and'],$q['not']));

    // collect the document list of every required word; a word missing
    // from the lookup result is treated as "found nowhere"
    $and = array();
    foreach($q['and'] as $w){
        $and[] = isset($result[$w]) ? $result[$w] : array();
    }

    // combine and words
    $docs = ft_resultCombine($and);
    if(!count($docs)) return array();

    // remove negative matches
    foreach($q['not'] as $w){
        if(!isset($result[$w])) continue;
        foreach(array_keys($result[$w]) as $id){
            unset($docs[$id]);
        }
    }
    if(!count($docs)) return array();

    // handle phrases
    if(count($q['phrases'])){
        // build a regexp matching any of the phrases; the delimiter is
        // passed to preg_quote so a "/" inside a phrase can't break it
        $alternatives = array();
        foreach($q['phrases'] as $phrase){
            $alternatives[] = preg_quote(utf8_strtolower($phrase),'/');
        }
        $regex = '/('.join('|',$alternatives).')/usi';

        // check the source of all documents for the exact phrases
        foreach(array_keys($docs) as $id){
            $text = utf8_strtolower(rawWiki($id));
            if(!preg_match($regex,$text)){
                unset($docs[$id]); // no hit - remove
            }
        }
    }
    if(!count($docs)) return array();

    // if there are any hits left, sort them by count
    arsort($docs);

    return $docs;
}

/**
 * Combine found documents and sum up their scores
 *
 * This function is used to combine searched words with a logical
 * AND. Only documents available in all arrays are returned; the score
 * of a returned document is the sum of its scores in every array.
 *
 * A single input array is returned unchanged, an empty input yields an
 * empty array.
 *
 * @param  array $args An array of page arrays (page id => score)
 * @return array page id => summed score, in the order of the first array
 */
function ft_resultCombine($args){
    $args   = array_values($args); // don't rely on the caller's keys
    $result = array();
    if(!count($args)) return $result;

    $first = array_shift($args);
    foreach($first as $id => $score){
        foreach($args as $other){
            // missing in one of the arrays - not an AND match, skip the doc
            if(!array_key_exists($id,$other)) continue 2;
            $score += $other[$id];
        }
        $result[$id] = $score;
    }
    return $result;
}

/**
 * Builds an array of search words from a query
 *
 * Quoted parts of the query are treated as phrases: the phrase itself
 * (without the quotes) is remembered and its words are added to the
 * required words. Of the remaining space separated terms, those starting
 * with "-" become excluded words, all others required words. Terms are
 * run through idx_tokenizer() together with the stopword list of the
 * configured language, if such a list exists.
 *
 * @todo   support OR and parenthesises?
 *
 * @param  string $query the search query as entered by the user
 * @return array  with the keys
 *                'query'   - the unmodified query string
 *                'phrases' - list of quoted phrases, quotes removed
 *                'and'     - list of required word tokens
 *                'not'     - list of excluded word tokens
 */
function ft_queryParser($query){
    global $conf;
    $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        // lines keep their trailing newline; passed on to idx_tokenizer() as is
        $stopwords = file($swfile);
    }else{
        $stopwords = array();
    }

    $q = array();
    $q['query']   = $query;
    $q['phrases'] = array();
    $q['and']     = array();
    $q['not']     = array();

    // handle phrase searches
    while(preg_match('/"(.*?)"/',$query,$match)){
        // store the phrase without its quotes; an empty "" is no phrase
        if($match[1] !== '') $q['phrases'][] = $match[1];
        // append to (not replace) the words collected from earlier phrases
        $q['and'] = array_merge($q['and'],idx_tokenizer($match[0],$stopwords));
        $query = preg_replace('/"(.*?)"/','',$query,1);
    }

    foreach(explode(' ',$query) as $w){
        // consecutive spaces produce empty terms
        if($w === '') continue;

        $token = idx_tokenizer($w,$stopwords);
        if(!count($token)) continue;

        if($w[0] == '-'){
            $q['not'] = array_merge($q['not'],$token);
        }else{
            $q['and'] = array_merge($q['and'],$token);
        }
    }

    return $q;
}