xref: /dokuwiki/inc/fulltext.php (revision 5783998f1518db5000b33432885f3153de6b579f)
1<?php
2/**
3 * DokuWiki fulltextsearch functions using the index
4 *
5 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
9  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
10  require_once(DOKU_INC.'inc/indexer.php');
11
12
/**
 * The fulltext search
 *
 * Returns a list of matching documents for the given query.
 *
 * The query is split by ft_queryParser() into positive words, excluded
 * words and phrases. All words are looked up in the index, the hits of
 * the positive words are combined with a logical AND, pages containing
 * excluded words are dropped, phrases are verified against the raw page
 * source and finally pages the current user may not read are removed.
 *
 * @param string $query     the search query
 * @param string &$poswords receives the positive search words joined by
 *                          single spaces (meant for highlighting later)
 * @return array page ids as keys and their combined index values as
 *               values, sorted descending by value; empty if nothing matched
 */
function ft_pageSearch($query,&$poswords){
    $q = ft_queryParser($query);
    // use this for highlighting later:
    $poswords = join(' ',$q['and']);

    // without at least one positive word there is nothing to match
    // against - this also covers queries made of excluded words only
    if(!count($q['and'])) return array();

    // lookup all words found in the query
    $words  = array_merge($q['and'],$q['not']);
    $result = idx_lookup($words);

    // replace each positive word by its hit list; a word missing from
    // the lookup result counts as "no hits"
    foreach($q['and'] as $pos => $w){
        $q['and'][$pos] = isset($result[$w]) ? $result[$w] : array();
    }

    // create a list of unwanted docs
    $not = array();
    foreach($q['not'] as $w){
        if(isset($result[$w])){
            $not = array_merge($not,array_keys($result[$w]));
        }
    }

    // combine and-words (a single hit list is returned unchanged)
    $docs = ft_resultCombine($q['and']);
    if(!count($docs)) return array();

    // remove negative matches
    foreach($not as $n){
        unset($docs[$n]);
    }
    if(!count($docs)) return array();

    // handle phrases
    $phrases = array();
    foreach($q['phrases'] as $phrase){
        // an empty phrase would become an empty regex alternative that
        // matches every page, so it is skipped
        if($phrase === '') continue;
        // quote for use between '/' delimiters
        $phrases[] = preg_quote(utf8_strtolower($phrase),'/');
    }
    if(count($phrases)){
        //build a regexp
        $regex = '('.join('|',$phrases).')';
        // check the source of all documents for the exact phrases
        foreach(array_keys($docs) as $id){
            $text  = utf8_strtolower(rawWiki($id));
            if(!preg_match('/'.$regex.'/usi',$text)){
                unset($docs[$id]); // no hit - remove
            }
        }
        if(!count($docs)) return array();
    }

    // check ACL permissions
    foreach(array_keys($docs) as $doc){
        if(auth_quickaclcheck($doc) < AUTH_READ){
            unset($docs[$doc]);
        }
    }
    if(!count($docs)) return array();

    // if there are any hits left, sort them by count
    arsort($docs);

    return $docs;
}
84
/**
 * Returns the backlinks for a given page
 *
 * Does a quick lookup with the fulltext index, then
 * evaluates the instructions of the found pages
 *
 * Only pages that contain an internal link (or a CamelCase link when
 * $conf['camelcase'] is enabled) resolving to $id are reported. Pages
 * the current user may not read are removed from the result.
 *
 * @param string $id the page id to find backlinks for
 * @return array sorted list of page ids linking to $id
 */
function ft_backlinks($id){
    global $conf;
    $result = array();

    // quick lookup of the pagename
    $page    = noNS($id);
    $sw      = array(); // we don't use stopwords here
    $matches = idx_lookup(idx_tokenizer($page,$sw));  //pagename may contain specials (_ or .)

    // nothing was looked up - there is nothing to combine either
    if(!is_array($matches) || !count($matches)) return $result;

    $docs = ft_resultCombine(array_values($matches));
    if(!is_array($docs) || !count($docs)) return $result;
    require_once(DOKU_INC.'inc/parserutils.php');

    // check instructions for matching links
    foreach(array_keys($docs) as $match){
        $instructions = p_cached_instructions(wikiFN($match),true);
        if(is_null($instructions)) continue;

        // links are resolved relative to the namespace of the linking page
        $match_ns =  getNS($match);

        foreach($instructions as $ins){
            if($ins[0] == 'internallink' || ($conf['camelcase'] && $ins[0] == 'camelcaselink') ){
                $link = $ins[1][0];
                resolve_pageid($match_ns,$link,$exists); //exists is not used
                if($link == $id){
                    //we have a match - finish
                    $result[] = $match;
                    break;
                }
            }
        }
    }

    if(!count($result)) return $result;

    // check ACL permissions
    foreach(array_keys($result) as $idx){
        if(auth_quickaclcheck($result[$idx]) < AUTH_READ){
            unset($result[$idx]);
        }
    }

    // sort() also closes the gaps left by the unset() calls above
    sort($result);
    return $result;
}
135
/**
 * Quicksearch for pagenames
 *
 * By default it only matches the pagename and ignores the
 * namespace. This can be changed with the second parameter
 *
 * The list of known pages is read from page.idx in the cache directory.
 * Pages whose file no longer exists and pages the current user may not
 * read are left out. When the index file is missing or unreadable an
 * empty list is returned.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param string $id       the text to look for (matched literally)
 * @param bool   $pageonly true: the text has to occur in the pagename
 *                         itself, false: it may occur in the namespace too
 * @return array sorted list of matching page ids
 */
function ft_pageLookup($id,$pageonly=true){
    global $conf;

    $index = $conf['cachedir'].'/page.idx';
    // no index available - nothing can be found
    if(!@is_readable($index)) return array();
    $pages = @file($index);
    if(!is_array($pages)) return array();

    // file() leaves the line ending attached to every entry, strip it
    // so the returned ids are clean
    $pages = array_map('trim',$pages);

    $regex = '/'.preg_quote($id,'/').'/';
    $found = array();
    foreach(preg_grep($regex,$pages) as $page){
        if($page === '') continue; // blank index line
        // the match has to be in the pagename, not only in the namespace
        if($pageonly && !preg_match($regex,noNS($page))) continue;
        // skip index entries of pages that are gone
        if(!@file_exists(wikiFN($page))) continue;
        // check ACL permissions
        if(auth_quickaclcheck($page) < AUTH_READ) continue;
        $found[] = $page;
    }

    sort($found);
    return $found;
}
176
/**
 * Creates a snippet extract
 *
 * Searches the raw source of the given page for the given words and
 * returns an HTML fragment showing up to three hits. Each hit comes with
 * up to 50 characters of context on either side (context does not cross
 * line breaks), the hit itself is wrapped in a span with the class
 * "search_hit". All page text is escaped with htmlspecialchars().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param string $id       id of the page to take the extract from
 * @param string $poswords the words to look for, separated by single spaces
 * @return string the HTML snippet, empty if none of the words was found
 */
function ft_snippet($id,$poswords){
    // the space separated word list becomes a regex alternation
    $alternatives = str_replace(' ','|',preg_quote($poswords,'#'));
    $pattern      = '#(.{0,50})('.$alternatives.')(.{0,50})#iu';

    //FIXME caseinsensitive matching doesn't work with UTF-8!?
    $source = rawWiki($id);
    preg_match_all($pattern,$source,$hits,PREG_SET_ORDER);

    $html  = '';
    $shown = 0;
    foreach($hits as $hit){
        // $hit[1] = leading context, $hit[2] = the word, $hit[3] = trailing context
        $html .= '...'.htmlspecialchars($hit[1])
               . '<span class="search_hit">'.htmlspecialchars($hit[2]).'</span>'
               . htmlspecialchars($hit[3]).'... ';
        // three extracts are enough
        if(++$shown == 3) break;
    }

    return $html;
}
202
/**
 * Combine found documents and sum up their scores
 *
 * This function is used to combine searched words with a logical
 * AND. Only documents available in all arrays are returned.
 *
 * A single array is returned unchanged, an empty list of arrays gives
 * an empty result. Otherwise the keys of the first array are checked
 * against every other array: a key missing from any of them is dropped,
 * for the remaining keys the values of all arrays are added up.
 *
 * based upon PEAR's PHP_Compat function for array_intersect_key()
 *
 * @param array $args An array of page arrays
 * @return array the keys common to all page arrays with their summed values
 */
function ft_resultCombine($args){
    // reindex so the positional access below does not depend on the
    // keys of the passed list
    $args = array_values($args);
    $array_count = count($args);
    if($array_count == 0){
        return array();
    }
    if($array_count == 1){
        return $args[0];
    }
    if(!is_array($args[0])){
        return array();
    }

    $result = array();
    foreach ($args[0] as $key => $value) {
        $sum = $value;
        for ($i = 1; $i < $array_count; $i++) {
            // logical AND: one array without this key rules the key out
            if (!is_array($args[$i]) || !array_key_exists($key, $args[$i])) {
                continue 2;
            }
            $sum += $args[$i][$key];
        }
        $result[$key] = $sum;
    }
    return $result;
}
232
/**
 * Builds an array of search words from a query
 *
 * The returned array has these entries:
 *
 *   query   - the query as it was passed in
 *   phrases - texts that have to be verified against the page source:
 *             everything given in double quotes and all runs of
 *             IDX_ASIAN characters found in positive words
 *   and     - tokens that have to be found (including the tokens of
 *             all quoted phrases)
 *   not     - tokens of words prefixed with a minus sign
 *
 * Tokens are created by idx_tokenizer() which is given the stopword list
 * of the configured language, if there is one.
 *
 * @todo support OR and parenthesises?
 * @todo add namespace handling
 *
 * @param string $query the search query
 * @return array the parsed query as described above
 */
function ft_queryParser($query){
    global $conf;
    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        // lines are passed on exactly as read
        $stopwords = file($swfile);
        if(!is_array($stopwords)) $stopwords = array(); // unreadable file
    }else{
        $stopwords = array();
    }

    $q = array();
    $q['query']   = $query;
    $q['phrases'] = array();
    $q['and']     = array();
    $q['not']     = array();

    // handle phrase searches
    while(preg_match('/"(.*?)"/',$query,$match)){
        $q['phrases'][] = $match[1];
        // add to the tokens collected so far - earlier phrases must not
        // be lost when a query contains more than one phrase
        $q['and'] = array_merge($q['and'],idx_tokenizer($match[0],$stopwords));
        // cut the handled phrase out of the query
        $query = preg_replace('/"(.*?)"/','',$query,1);
    }

    $words = explode(' ',$query);
    foreach($words as $w){
        // repeated blanks and removed phrases leave empty chunks behind
        if($w === '') continue;

        if($w[0] == '-'){
            $token = idx_tokenizer($w,$stopwords);
            if(count($token)) $q['not'] = array_merge($q['not'],$token);
        }else{
            // asian "words" need to be searched as phrases
            if(preg_match_all('/('.IDX_ASIAN.'+)/u',$w,$matches)){
                $q['phrases'] = array_merge($q['phrases'],$matches[1]);
            }
            $token = idx_tokenizer($w,$stopwords);
            if(count($token)) $q['and'] = array_merge($q['and'],$token);
        }
    }

    return $q;
}
279
280//Setup VIM: ex: et ts=4 enc=utf-8 :
281