<?php
/**
 * DokuWiki fulltextsearch functions using the index
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author  Andreas Gohr <andi@splitbrain.org>
 */

if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
require_once(DOKU_INC.'inc/indexer.php');


/**
 * The fulltext search
 *
 * Returns a list of matching documents for the given query. Every positive
 * word has to be found in a document, documents containing a negated word
 * are dropped, and when quoted phrases were given at least one of them has
 * to occur in the raw page text.
 *
 * @param  string $query     the search query as entered by the user
 * @param  string $poswords  (by reference) receives the positive search
 *                           words joined by spaces, useful for highlighting
 * @return array  page id => summed score, sorted by score (highest first);
 *                an empty array when nothing matches
 */
function ft_pageSearch($query,&$poswords){
    $q = ft_queryParser($query);

    // use this for highlighting later:
    $poswords = join(' ',$q['and']);

    // without a positive word there is nothing to match against (a query
    // made only of negated words selects no documents)
    if(!count($q['and'])) return array();

    // lookup all words found in the query
    $words  = array_merge($q['and'],$q['not']);
    $result = idx_lookup($words);

    // replace each positive word by its list of documents; a word missing
    // from the lookup result is treated as "found nowhere"
    $and = array();
    foreach($q['and'] as $w){
        if(isset($result[$w]) && is_array($result[$w])){
            $and[] = $result[$w];
        }else{
            $and[] = array();
        }
    }

    // create a list of unwanted docs
    $not = array();
    foreach($q['not'] as $w){
        if(isset($result[$w]) && is_array($result[$w])){
            $not = array_merge($not,array_keys($result[$w]));
        }
    }

    // combine and-words
    if(count($and) > 1){
        $docs = ft_resultCombine($and);
    }else{
        $docs = $and[0];
    }
    if(!count($docs)) return array();

    // remove negative matches
    foreach($not as $n){
        unset($docs[$n]);
    }
    if(!count($docs)) return array();

    // handle phrases
    if(count($q['phrases'])){
        // build a regexp; the delimiter has to be passed to preg_quote()
        // or a phrase containing a slash would terminate the pattern early
        $phrases = array();
        foreach($q['phrases'] as $phrase){
            $phrases[] = preg_quote(utf8_strtolower($phrase),'/');
        }
        $regex = '('.join('|',$phrases).')';

        // check the source of all documents for the exact phrases
        foreach(array_keys($docs) as $id){
            $text = utf8_strtolower(rawWiki($id));
            if(!preg_match('/'.$regex.'/usi',$text)){
                unset($docs[$id]); // no hit - remove
            }
        }
    }

    if(!count($docs)) return array();

    // if there are any hits left, sort them by count
    arsort($docs);

    return $docs;
}

/**
 * Quicksearch for pagenames
 *
 * By default it only matches the pagename and ignores the
 * namespace. This can be changed with the second parameter
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $id        the (partial) page name to look for
 * @param  bool   $pageonly  true: match against the page name without its
 *                           namespace, false: match against the full id
 * @return array  sorted list of matching ids whose page file exists
 */
function ft_pageLookup($id,$pageonly=true){
    global $conf;

    // no page index yet means there is nothing to find
    $idxfile = $conf['cachedir'].'/page.idx';
    if(!@file_exists($idxfile)) return array();

    $id = preg_quote($id,'/');

    // file() keeps the line ending of every entry - strip it so callers
    // get clean page ids back
    $pages = array_map('trim',file($idxfile));
    $pages = array_values(preg_grep('/'.$id.'/',$pages));

    $cnt = count($pages);
    for($i=0; $i<$cnt; $i++){
        if($pageonly){
            // the first grep matched the whole id, now make sure the hit
            // is in the page name itself and not in the namespace
            if(!preg_match('/'.$id.'/',noNS($pages[$i]))){
                unset($pages[$i]);
                continue;
            }
        }
        // skip index entries whose page file is gone
        if(!@file_exists(wikiFN($pages[$i]))){
            unset($pages[$i]);
            continue;
        }
    }
    sort($pages);
    return $pages;
}

/**
 * Creates a snippet extract
 *
 * Up to three hits are returned, each with at most 50 characters of context
 * on either side. The hit is wrapped in a span with the class "search_hit"
 * and all page text is HTML escaped.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param  string $id        the page to create the snippet for
 * @param  string $poswords  space separated list of words to highlight
 * @return string the HTML snippet, empty when no word was found
 */
function ft_snippet($id,$poswords){
    $poswords = preg_quote($poswords,'#');
    // the words are separated by spaces - turn them into alternatives
    $re       = '('.str_replace(' ','|',$poswords).')';
    $text     = rawWiki($id);
    //FIXME caseinsensitive matching doesn't work with UTF-8!?
    preg_match_all('#(.{0,50})'.$re.'(.{0,50})#iu',$text,$matches,PREG_SET_ORDER);

    $cnt = 0;
    $snippet = '';
    foreach($matches as $match){
        $snippet .= '...'.htmlspecialchars($match[1]);
        $snippet .= '<span class="search_hit">';
        $snippet .= htmlspecialchars($match[2]);
        $snippet .= '</span>';
        $snippet .= htmlspecialchars($match[3]).'... ';
        if($cnt++ == 2) break; // three hits are enough
    }

    return $snippet;
}

/**
 * Combine found documents and sum up their scores
 *
 * This function is used to combine searched words with a logical
 * AND. Only documents available in all arrays are returned. A single
 * array is returned as is, an empty argument list gives an empty result.
 *
 * based upon PEAR's PHP_Compat function for array_intersect_key()
 *
 * @param  array $args An array of page arrays (page id => score)
 * @return array page id => sum of the scores from all arrays
 */
function ft_resultCombine($args){
    $array_count = count($args);
    if(!$array_count) return array();

    $result = array();
    foreach($args[0] as $key => $value){
        $score = $value;
        for($i = 1; $i < $array_count; $i++){
            // missing in one of the arrays - not an AND match, next document
            if(!array_key_exists($key,$args[$i])) continue 2;
            $score += $args[$i][$key];
        }
        $result[$key] = $score;
    }
    return $result;
}

/**
 * Builds an array of search words from a query
 *
 * The returned array has these keys:
 *
 *   query   - the unmodified query string
 *   phrases - the contents of all "quoted phrases" (without the quotes)
 *   and     - tokens that have to be found (this includes the words of
 *             the phrases)
 *   not     - tokens of words prefixed with a minus
 *
 * @todo support OR and parentheses?
 *
 * @param  string $query the search query as entered by the user
 * @return array  the parsed query as described above
 */
function ft_queryParser($query){
    global $conf;
    $swfile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        $stopwords = file($swfile);
    }else{
        $stopwords = array();
    }

    $q = array();
    $q['query']   = $query;
    $q['phrases'] = array();
    $q['and']     = array();
    $q['not']     = array();

    // handle phrase searches
    while(preg_match('/"(.*?)"/',$query,$match)){
        // store the phrase without its quotes, the page text is searched
        // for it literally; an empty "" would match every page
        if($match[1] !== '') $q['phrases'][] = $match[1];
        // add the phrase words to the ones found so far instead of
        // replacing them
        $q['and'] = array_merge($q['and'],idx_tokenizer($match[0],$stopwords));
        // cut the handled phrase out of the query
        $query = preg_replace('/"(.*?)"/','',$query,1);
    }

    $words = explode(' ',$query);
    foreach($words as $w){
        // repeated, leading or trailing spaces give empty words
        if($w === '') continue;

        $token = idx_tokenizer($w,$stopwords);
        if(!count($token)) continue;

        if($w[0] === '-'){
            $q['not'] = array_merge($q['not'],$token);
        }else{
            $q['and'] = array_merge($q['and'],$token);
        }
    }

    return $q;
}

//Setup VIM: ex: et ts=4 enc=utf-8 :