<?php
/**
 * DokuWiki fulltextsearch functions using the index
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author  Andreas Gohr <andi@splitbrain.org>
 */

if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
require_once(DOKU_INC.'inc/indexer.php');


/**
 * The fulltext search
 *
 * Returns a list of matching documents for the given query. Every positive
 * word has to be found for a document, documents found for a negated word
 * are dropped, and if the query contains quoted phrases at least one of
 * them has to occur in the raw page source.
 *
 * @param string $query      the search query
 * @param string &$poswords  is set to the space separated positive words
 *                           (meant for highlighting the hits later)
 * @return array  the matching documents (document => summed value), sorted
 *                by value in descending order; empty when nothing matched
 */
function ft_pageSearch($query,&$poswords){
    $q = ft_queryParser($query);

    // use this for highlighting later:
    $poswords = join(' ',$q['and']);

    // a query without any positive word can not match anything
    if(!count($q['and'])) return array();

    // lookup all words found in the query
    // NOTE(review): idx_lookup() is assumed to return word => array(doc => value)
    $words  = array_merge($q['and'],$q['not']);
    $result = idx_lookup($words);

    // merge search results with query
    foreach($q['and'] as $pos => $w){
        if(isset($result[$w]) && is_array($result[$w])){
            $q['and'][$pos] = $result[$w];
        }else{
            // nothing known about this word - it matches no document
            $q['and'][$pos] = array();
        }
    }

    // create a list of unwanted docs
    $not = array();
    foreach($q['not'] as $w){
        if(isset($result[$w]) && is_array($result[$w])){
            $not = array_merge($not,array_keys($result[$w]));
        }
    }

    // combine and-words (a single list is handed back unchanged)
    $docs = ft_resultCombine($q['and']);
    if(!count($docs)) return array();

    // remove negative matches
    foreach($not as $n){
        unset($docs[$n]);
    }

    if(!count($docs)) return array();

    // handle phrases
    if(count($q['phrases'])){
        // build a regexp - the delimiter has to be quoted as well
        $phrases = array();
        foreach($q['phrases'] as $phrase){
            $phrases[] = preg_quote(utf8_strtolower($phrase),'/');
        }
        $regex = '('.join('|',$phrases).')';

        // check the source of all documents for the exact phrases
        foreach(array_keys($docs) as $id){
            $text = utf8_strtolower(rawWiki($id));
            if(!preg_match('/'.$regex.'/usi',$text)){
                unset($docs[$id]); // no hit - remove
            }
        }
    }

    if(!count($docs)) return array();

    // if there are any hits left, sort them by count
    arsort($docs);

    return $docs;
}

/**
 * Quicksearch for pagenames
 *
 * By default it only matches the pagename and ignores the
 * namespace. This can be changed with the second parameter
 *
 * The given id is quoted and matched as a case sensitive substring against
 * the lines of the page index. The returned entries are the index lines as
 * read by file(), which means they still carry their line endings.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param string $id        the text to look for
 * @param bool   $pageonly  when true the text has to match in noNS() of the entry
 * @return array  sorted list of matching index entries whose page file exists
 */
function ft_pageLookup($id,$pageonly=true){
    global $conf;
    $id    = preg_quote($id,'/');
    $pages = file($conf['cachedir'].'/page.idx');
    // no readable index means there is nothing to look up
    if(!is_array($pages)) return array();

    $found = array();
    foreach(preg_grep('/'.$id.'/',$pages) as $page){
        // the match may have been in the namespace part only
        if($pageonly && !preg_match('/'.$id.'/',noNS($page))) continue;
        // skip index entries without a page file
        if(!@file_exists(wikiFN($page))) continue;
        $found[] = $page;
    }

    sort($found);
    return $found;
}

/**
 * Creates a snippet extract
 *
 * Collects up to three hits from the raw page source. Each hit is shown
 * with up to 50 characters of context on both sides, the matched word is
 * wrapped in a span with the class "search_hit". All page text is passed
 * through htmlspecialchars().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param string $id        the page to create the snippet for
 * @param string $poswords  space separated words to look for
 * @return string  the snippet as HTML, empty when there was no hit
 */
function ft_snippet($id,$poswords){
    $poswords = preg_quote($poswords,'#');
    // the spaces between the words become the regexp alternation
    $re       = '('.str_replace(' ','|',$poswords).')';
    $text     = rawWiki($id);
    //FIXME caseinsensitive matching doesn't work with UTF-8!?
    preg_match_all('#(.{0,50})'.$re.'(.{0,50})#iu',$text,$matches,PREG_SET_ORDER);

    $cnt     = 0;
    $snippet = '';
    foreach($matches as $match){
        $snippet .= '...'.htmlspecialchars($match[1]);
        $snippet .= '<span class="search_hit">';
        $snippet .= htmlspecialchars($match[2]);
        $snippet .= '</span>';
        $snippet .= htmlspecialchars($match[3]).'... ';
        // three hits are enough
        if($cnt++ == 2) break;
    }

    return $snippet;
}

/**
 * Combine found documents and sum up their scores
 *
 * This function is used to combine searched words with a logical
 * AND. Only documents available in all arrays are returned.
 *
 * based upon PEAR's PHP_Compat function for array_intersect_key()
 *
 * @param array $args An array of page arrays
 * @return array  the documents found in every page array, each with the sum
 *                of its values from all page arrays; a single page array
 *                is returned unchanged
 */
function ft_resultCombine($args){
    // do not rely on the given array starting at index 0
    $args        = array_values($args);
    $array_count = count($args);

    if($array_count == 0 || !is_array($args[0])) return array();
    if($array_count == 1) return $args[0];

    $result = array();
    foreach($args[0] as $key => $value){
        $sum = $value;
        for($i = 1; $i < $array_count; $i++){
            // missing in one of the arrays means no AND match - next document
            if(!is_array($args[$i]) || !array_key_exists($key,$args[$i])) continue 2;
            $sum += $args[$i][$key];
        }
        $result[$key] = $sum;
    }
    return $result;
}

/**
 * Builds an array of search words from a query
 *
 * Text in double quotes is handled as a phrase: the phrase itself (without
 * the quotes) is remembered and its words are added to the positive words.
 * The remaining query is split at spaces, words starting with a minus are
 * collected as negative words, all others as positive words. Every word is
 * run through idx_tokenizer() together with the stopwords of the
 * configured language.
 *
 * @todo support OR and parentheses?
 *
 * @param string $query  the search query
 * @return array  with the keys 'query' (the unchanged query), 'phrases',
 *                'and' (positive words) and 'not' (negative words)
 */
function ft_queryParser($query){
    global $conf;
    $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        $stopwords = file($swfile);
        // an unreadable file is handled like a missing one
        if(!is_array($stopwords)) $stopwords = array();
    }else{
        $stopwords = array();
    }

    $q = array();
    $q['query']   = $query;
    $q['phrases'] = array();
    $q['and']     = array();
    $q['not']     = array();

    // handle phrase searches
    while(preg_match('/"(.*?)"/',$query,$match)){
        // keep the phrase without its quotes, a blank one would match any page
        if(trim($match[1]) !== '') $q['phrases'][] = $match[1];
        // add to the words collected so far instead of replacing them
        $q['and'] = array_merge($q['and'],idx_tokenizer($match[0],$stopwords));
        $query    = preg_replace('/"(.*?)"/','',$query,1);
    }

    $words = explode(' ',$query);
    foreach($words as $w){
        // consecutive spaces and removed phrases leave empty words behind
        if($w === '') continue;

        $token = idx_tokenizer($w,$stopwords);
        if(!count($token)) continue;

        if($w[0] == '-'){
            $q['not'] = array_merge($q['not'],$token);
        }else{
            $q['and'] = array_merge($q['and'],$token);
        }
    }

    return $q;
}

//Setup VIM: ex: et ts=4 enc=utf-8 :