xref: /dokuwiki/inc/fulltext.php (revision ea3a66b20ac38cf452125dca0bc416d480d6d82c)
1<?php
2/**
3 * DokuWiki fulltextsearch functions using the index
4 *
5 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
9  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
10  require_once(DOKU_INC.'inc/indexer.php');
11
12
/**
 * The fulltext search
 *
 * Returns a list of matching documents for the given query.
 *
 * The query is split by ft_queryParser() into required words, excluded
 * words and exact phrases. Pages have to contain all required words, must
 * not contain any excluded word and - if phrases were given - have to
 * contain at least one of the phrases in their raw wiki text. Pages the
 * current user may not read are dropped.
 *
 * @param string $query     the search query
 * @param string $poswords  (by reference) receives the required words joined
 *                          by spaces, intended for highlighting
 * @return array            page id => score, highest score first; an empty
 *                          array if nothing matched
 */
function ft_pageSearch($query,&$poswords){
    $q = ft_queryParser($query);

    // use this for highlighting later:
    $poswords = join(' ',$q['and']);

    // without at least one required word there is nothing to search for
    // (a query made of excluded words only can not select any page)
    if(!count($q['and'])) return array();

    // lookup all words found in the query
    $words  = array_merge($q['and'],$q['not']);
    $result = idx_lookup($words);

    // collect the hit list of every required word
    $hits = array();
    foreach($q['and'] as $w){
        if(isset($result[$w]) && is_array($result[$w])){
            $hits[] = $result[$w];
        }else{
            $hits[] = array(); // word unknown to the index - no page can match
        }
    }

    // combine and-words
    if(count($hits) > 1){
        $docs = ft_resultCombine($hits);
    }else{
        $docs = $hits[0];
    }
    if(!count($docs)) return array();

    // remove negative matches
    foreach($q['not'] as $w){
        if(!isset($result[$w]) || !is_array($result[$w])) continue;
        foreach(array_keys($result[$w]) as $n){
            unset($docs[$n]);
        }
    }

    if(!count($docs)) return array();

    // handle phrases
    if(count($q['phrases'])){
        // build a regexp matching any of the phrases
        $alternatives = array();
        foreach($q['phrases'] as $phrase){
            // the phrase may still be wrapped in the double quotes of the
            // query - they are not part of the text to look for
            $phrase = trim(utf8_strtolower($phrase),'"');
            if($phrase === '') continue; // would match every page
            // quote with the delimiter used below, so a '/' inside a
            // phrase can not terminate the pattern
            $alternatives[] = preg_quote($phrase,'/');
        }

        if(count($alternatives)){
            $regex = '/('.join('|',$alternatives).')/usi';

            // check the source of all documents for the exact phrases
            foreach(array_keys($docs) as $id){
                $text = utf8_strtolower(rawWiki($id));
                if(!preg_match($regex,$text)){
                    unset($docs[$id]); // no hit - remove
                }
            }
        }
    }

    if(!count($docs)) return array();

    // check ACL permissions
    foreach(array_keys($docs) as $doc){
        if(auth_quickaclcheck($doc) < AUTH_READ){
            unset($docs[$doc]);
        }
    }

    if(!count($docs)) return array();

    // if there are any hits left, sort them by count
    arsort($docs);

    return $docs;
}
87
/**
 * Returns the backlinks for a given page
 *
 * Does a quick lookup with the fulltext index to find candidate pages
 * containing all words of the page name, then evaluates the parser
 * instructions of those candidates for a link that resolves to the page.
 *
 * @param string $id  the page id to find backlinks for
 * @return array      sorted list of ids of readable pages linking to $id
 */
function ft_backlinks($id){
    global $conf;
    $result = array();

    // quick lookup of the pagename
    $page    = noNS($id);
    $sw      = array(); // we don't use stopwords here
    $matches = idx_lookup(idx_tokenizer($page,$sw));  //pagename may contain specials (_ or .)
    if(!is_array($matches) || !count($matches)) return $result;
    $matches = array_values($matches);

    // only combine when there is more than one word - a single hit list
    // is already the candidate list
    if(count($matches) > 1){
        $docs = ft_resultCombine($matches);
    }else{
        $docs = $matches[0];
    }

    if(!is_array($docs) || !count($docs)) return $result;
    require_once(DOKU_INC.'inc/parserutils.php');

    // check instructions for matching links
    foreach(array_keys($docs) as $match){
        $instructions = p_cached_instructions(wikiFN($match),true);
        if(is_null($instructions)) continue;

        $match_ns =  getNS($match);

        foreach($instructions as $ins){
            if($ins[0] == 'internallink' || ($conf['camelcase'] && $ins[0] == 'camelcaselink') ){
                $link = $ins[1][0];
                resolve_pageid($match_ns,$link,$exists); //exists is not used
                // compare as strings: a loose == would treat numeric
                // looking ids such as '007' and '7' as the same page
                if((string) $link === (string) $id){
                    //we have a match - keep it if the page may be read, then finish
                    if(auth_quickaclcheck($match) >= AUTH_READ){
                        $result[] = $match;
                    }
                    break;
                }
            }
        }
    }

    sort($result);
    return $result;
}
139
/**
 * Quicksearch for pagenames
 *
 * Looks for $id as a literal substring in the ids listed in the page
 * index. By default the match has to be in the pagename itself and the
 * namespace is ignored. This can be changed with the second parameter.
 * Only pages that exist and that the current user may read are returned.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param string $id        the text to look for
 * @param bool   $pageonly  true: match has to be in the pagename (default),
 *                          false: a match anywhere in the full id suffices
 * @return array            sorted list of matching page ids
 */
function ft_pageLookup($id,$pageonly=true){
    global $conf;

    // no index, no results
    $index = $conf['cachedir'].'/page.idx';
    if(!@file_exists($index)) return array();
    $lines = file($index);
    if(!is_array($lines)) return array();

    $regex = '/'.preg_quote($id,'/').'/';

    $pages = array();
    foreach($lines as $page){
        // file() keeps the line endings - they are not part of the id
        $page = trim($page);
        if($page === '') continue;

        if(!preg_match($regex,$page)) continue;
        if($pageonly && !preg_match($regex,noNS($page))) continue;

        // skip index entries of pages that are gone
        if(!@file_exists(wikiFN($page))) continue;

        // check ACL permissions
        if(auth_quickaclcheck($page) < AUTH_READ) continue;

        $pages[] = $page;
    }

    sort($pages);
    return $pages;
}
180
/**
 * Creates a snippet extract
 *
 * Searches the raw wiki text of a page for the given words and returns up
 * to three excerpts as HTML. Each excerpt shows the hit wrapped in a
 * <span class="search_hit"> together with up to 50 characters of context
 * on either side; all page text is escaped with htmlspecialchars().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param string $id        id of the page to extract from
 * @param string $poswords  space separated list of words to look for
 * @return string           the HTML snippet, empty if there are no words
 *                          or no hits
 */
function ft_snippet($id,$poswords){
    // build the alternation from the non-empty words only - an empty
    // alternative would match at every position of the text
    $words = array();
    foreach(explode(' ',$poswords) as $word){
        if($word === '') continue;
        $words[] = preg_quote($word,'#');
    }
    if(!count($words)) return '';

    $re   = '('.join('|',$words).')';
    $text = rawWiki($id);
    //FIXME caseinsensitive matching doesn't work with UTF-8!?
    $found = preg_match_all('#(.{0,50})'.$re.'(.{0,50})#iu',$text,$matches,PREG_SET_ORDER);
    if(!$found) return ''; // no hit or the pattern could not be applied

    $snippet = '';
    // the first three hits are enough
    foreach(array_slice($matches,0,3) as $match){
        $snippet .= '...'.htmlspecialchars($match[1]);
        $snippet .= '<span class="search_hit">';
        $snippet .= htmlspecialchars($match[2]);
        $snippet .= '</span>';
        $snippet .= htmlspecialchars($match[3]).'... ';
    }

    return $snippet;
}
206
/**
 * Combine found documents and sum up their scores
 *
 * This function is used to combine searched words with a logical
 * AND. Only documents available in all arrays are returned, their
 * score is the sum of their scores in the single arrays. The order
 * of the first array is kept.
 *
 * A single array is returned as is, no arrays at all result in an
 * empty array.
 *
 * @param array $args An array of page arrays (page => score)
 * @return array      page => summed score of the pages found in every array
 */
function ft_resultCombine($args){
    if(!is_array($args)) return array();
    $args        = array_values($args);
    $array_count = count($args);

    if($array_count == 0) return array();
    if($array_count == 1) return $args[0]; // nothing to intersect with

    $result = array();
    foreach($args[0] as $key => $value){
        $score = $value;
        for($i = 1; $i < $array_count; $i++){
            // missing in one of the arrays - not a match for the AND
            if(!array_key_exists($key,$args[$i])) continue 2;
            $score += $args[$i][$key];
        }
        $result[$key] = $score;
    }
    return $result;
}
232
/**
 * Builds an array of search words from a query
 *
 * Text in double quotes is treated as a phrase: the phrase itself (without
 * the quotes) is stored and its words are added to the required words.
 * All remaining space separated words are tokenized; words starting with
 * a '-' end up in the list of excluded words, all others in the list of
 * required words. Tokenizing uses the stopword list of the configured
 * language if there is one.
 *
 * @todo support OR and parenthesises?
 *
 * @param string $query the search query
 * @return array        with the keys
 *                      'query'   => the unmodified query
 *                      'phrases' => list of phrases
 *                      'and'     => list of required words
 *                      'not'     => list of excluded words
 */
function ft_queryParser($query){
    global $conf;
    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        $stopwords = file($swfile);
    }else{
        $stopwords = array();
    }

    $q = array();
    $q['query']   = $query;
    $q['phrases'] = array();
    $q['and']     = array();
    $q['not']     = array();

    // handle phrase searches
    while(preg_match('/"(.*?)"/',$query,$match)){
        // $match[1] is the phrase without the surrounding quotes;
        // an empty pair of quotes carries no phrase at all
        if($match[1] !== ''){
            $q['phrases'][] = $match[1];
            // add to the words found so far instead of replacing them
            $q['and'] = array_merge($q['and'],idx_tokenizer($match[1],$stopwords));
        }
        // cut the phrase out, the space keeps the neighbouring words apart
        $query = preg_replace('/"(.*?)"/',' ',$query,1);
    }

    $words = explode(' ',$query);
    foreach($words as $w){
        if($w === '') continue; // caused by consecutive spaces

        $token = idx_tokenizer($w,$stopwords);
        if(!count($token)) continue;

        if(substr($w,0,1) === '-'){
            $q['not'] = array_merge($q['not'],$token);
        }else{
            $q['and'] = array_merge($q['and'],$token);
        }
    }

    return $q;
}
273
274//Setup VIM: ex: et ts=4 enc=utf-8 :
275