xref: /dokuwiki/inc/fulltext.php (revision 5fb14d63b6a7de75860bd8a94caa8d2f2fe8ee5e)
1<?php
2/**
3 * DokuWiki fulltextsearch functions using the index
4 *
5 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
9  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
10  require_once(DOKU_INC.'inc/indexer.php');
11
12
/**
 * The fulltext search
 *
 * Returns a list of matching documents for the given query.
 *
 * The query is split up by ft_queryParser(). All positive and negative
 * words are looked up with idx_lookup() and the hit lists of the positive
 * words are merged by ft_resultCombine(). Afterwards the following pages
 * are removed from the result:
 *
 *   - pages found for one of the negative words
 *   - pages for which isHiddenPage() is true
 *   - pages whose raw text contains none of the quoted phrases
 *   - pages the current user has less than AUTH_READ permission for
 *
 * @param string $query     the search query
 * @param string &$poswords receives the positive search words, separated by
 *                          blanks and with '*' removed (for highlighting)
 * @return array remaining hits keyed by page id, sorted by descending value
 */
function ft_pageSearch($query,&$poswords){
    $q = ft_queryParser($query);
    // use this for highlighting later:
    $poswords = str_replace('*','',join(' ',$q['and']));

    // a query without any positive word has nothing to combine - bail out
    // before touching the (then nonexistent) first hit list
    if(!count($q['and'])) return array();

    // lookup all words found in the query
    $words  = array_merge($q['and'],$q['not']);
    $result = idx_lookup($words);

    // collect the hit list of each positive word, a word missing from the
    // lookup result counts as "found nowhere"
    $hits = array();
    foreach($q['and'] as $w){
        $hits[] = (isset($result[$w]) && is_array($result[$w])) ? $result[$w] : array();
    }

    // create a list of unwanted docs
    $not = array();
    foreach($q['not'] as $w){
        if(isset($result[$w]) && is_array($result[$w])){
            $not = array_merge($not,array_keys($result[$w]));
        }
    }

    // combine and-words (a single list is returned unchanged)
    $docs = ft_resultCombine($hits);
    if(!count($docs)) return array();

    // hidden pages in the result are unwanted as well
    $hidden = array_filter(array_keys($docs),'isHiddenPage');
    $not    = array_merge($not,$hidden);

    // remove negative matches
    foreach($not as $n){
        unset($docs[$n]);
    }
    if(!count($docs)) return array();

    // handle phrases
    $phrases = array();
    foreach($q['phrases'] as $phrase){
        // an empty phrase would match every page and disable the check
        if($phrase === '') continue;
        // quote for the '/' delimiter used in the expression below
        $phrases[] = preg_quote(utf8_strtolower($phrase),'/');
    }
    if(count($phrases)){
        // a page is kept as soon as one of the phrases is found in it
        $regex = '('.join('|',$phrases).')';
        // check the source of all documents for the exact phrases
        foreach(array_keys($docs) as $id){
            $text = utf8_strtolower(rawWiki($id));
            if(!preg_match('/'.$regex.'/usi',$text)){
                unset($docs[$id]); // no hit - remove
            }
        }
    }
    if(!count($docs)) return array();

    // check ACL permissions
    foreach(array_keys($docs) as $doc){
        if(auth_quickaclcheck($doc) < AUTH_READ){
            unset($docs[$doc]);
        }
    }
    if(!count($docs)) return array();

    // if there are any hits left, sort them by count
    arsort($docs);

    return $docs;
}
89
/**
 * Returns the backlinks for a given page
 *
 * The fulltext index is asked for pages containing the tokens of the
 * pagename (without namespace). Only for these candidates the parser
 * instructions are loaded and searched for a link resolving to $id.
 *
 * @param string $id the page to find backlinks for
 * @return array sorted list of ids of visible, readable pages linking to $id
 */
function ft_backlinks($id){
    global $conf;
    $backlinks = array();

    // quick lookup of the pagename, it may contain specials (_ or .)
    $nostop     = array(); // we don't use stopwords here
    $hits       = idx_lookup(idx_tokenizer(noNS($id),$nostop));
    $candidates = array_keys(ft_resultCombine(array_values($hits)));
    $candidates = array_filter($candidates,'isVisiblePage'); // discard hidden pages
    if(count($candidates) == 0) return $backlinks;
    require_once(DOKU_INC.'inc/parserutils.php');

    // check instructions for matching links
    foreach($candidates as $page){
        $instructions = p_cached_instructions(wikiFN($page),true);
        if(is_null($instructions)) continue;

        $ns = getNS($page);

        foreach($instructions as $instruction){
            $isLink = ($instruction[0] == 'internallink') ||
                      ($conf['camelcase'] && $instruction[0] == 'camelcaselink');
            if(!$isLink) continue;

            $target = $instruction[1][0];
            resolve_pageid($ns,$target,$exists); // $exists is not used
            if($target != $id) continue;

            // one matching link is enough for this page
            $backlinks[] = $page;
            break;
        }
    }

    if(count($backlinks) == 0) return $backlinks;

    // check ACL permissions
    foreach($backlinks as $key => $page){
        if(auth_quickaclcheck($page) < AUTH_READ){
            unset($backlinks[$key]);
        }
    }

    sort($backlinks);
    return $backlinks;
}
141
/**
 * Quicksearch for pagenames
 *
 * By default it only matches the pagename and ignores the
 * namespace. This can be changed with the second parameter.
 *
 * The list of known pages is read from page.idx in the cache directory.
 * $id is searched as a literal, case sensitive substring. Pages without
 * an existing file, hidden pages and pages the current user has less
 * than AUTH_READ permission for are not returned.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param string $id       the text to look for in the page ids
 * @param bool   $pageonly match the pagename only (true) or the full id
 *                         including namespaces (false)
 * @return array sorted list of matching page ids
 */
function ft_pageLookup($id,$pageonly=true){
    global $conf;
    $regex = '/'.preg_quote($id,'/').'/';

    $pages = @file($conf['cachedir'].'/page.idx');
    if(!is_array($pages)) return array(); // no (readable) index - no pages

    // file() keeps the line endings - strip them so neither the helpers
    // below nor the caller ever see an id with a trailing newline
    $pages = array_map('trim',$pages);
    $pages = preg_grep($regex,$pages);

    $found = array();
    foreach($pages as $page){
        // blank lines are no pages
        if($page === '') continue;
        // the match has to be in the pagename itself
        if($pageonly && !preg_match($regex,noNS($page))) continue;
        // the page has to exist
        if(!@file_exists(wikiFN($page))) continue;
        // discard hidden pages
        if(!isVisiblePage($page)) continue;
        // check ACL permissions
        if(auth_quickaclcheck($page) < AUTH_READ) continue;

        $found[] = $page;
    }

    sort($found);
    return $found;
}
183
/**
 * Creates a snippet extract
 *
 * Searches the raw text of the page for the given words and returns up
 * to three hits, each with up to 50 characters of context before and
 * after it. The hit itself is wrapped in a span of class "search_hit",
 * all text is escaped with htmlspecialchars().
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 *
 * @param string $id       the page to create the snippet for
 * @param string $poswords the words to look for, separated by blanks
 * @return string HTML of the snippet, empty if there is nothing to show
 */
function ft_snippet($id,$poswords){
    // Build the alternation from the non-empty words only. An empty
    // alternative (no words at all, repeated blanks) would match at every
    // position of the text and produce empty highlights.
    $words = array();
    foreach(explode(' ',preg_quote($poswords,'#')) as $word){
        if($word !== '') $words[] = $word;
    }
    if(!count($words)) return '';

    $re   = '('.join('|',$words).')';
    $text = rawWiki($id);
    //FIXME caseinsensitive matching doesn't work with UTF-8!?
    if(!preg_match_all('#(.{0,50})'.$re.'(.{0,50})#iu',$text,$matches,PREG_SET_ORDER)){
        return ''; // no hit, or the expression could not be applied
    }

    $cnt = 0;
    $snippet = '';
    foreach($matches as $match){
        $snippet .= '...'.htmlspecialchars($match[1]);
        $snippet .= '<span class="search_hit">';
        $snippet .= htmlspecialchars($match[2]);
        $snippet .= '</span>';
        $snippet .= htmlspecialchars($match[3]).'... ';
        if($cnt++ == 2) break; // three hits are enough
    }

    return $snippet;
}
209
/**
 * Combine found documents and sum up their scores
 *
 * This function is used to combine searched words with a logical
 * AND. Only documents available in all arrays are returned, their
 * value is the sum of the values from all arrays. The order of the
 * first array is kept.
 *
 * based upon PEAR's PHP_Compat function for array_intersect_key()
 *
 * @param array $args An array of page arrays (a list indexed from 0,
 *                    each entry mapping a document to its score)
 * @return array the documents found in every page array; a single page
 *               array is returned as is, no page array gives an empty array
 */
function ft_resultCombine($args){
    $array_count = count($args);
    if($array_count == 0){
        return array();
    }
    if($array_count == 1){
        return $args[0];
    }

    $result = array();
    foreach ($args[0] as $key => $value) {
        $score  = $value;
        $in_all = true;
        for ($i = 1; $i < $array_count; $i++) {
            // a direct key lookup replaces scanning the whole array
            if (!is_array($args[$i]) || !array_key_exists($key, $args[$i])) {
                // missing in one array is enough to violate the AND
                $in_all = false;
                break;
            }
            $score += $args[$i][$key];
        }
        if ($in_all) $result[$key] = $score;
    }
    return $result;
}
239
/**
 * Builds an array of search words from a query
 *
 * The returned array has the following keys:
 *
 *   query   - the query as it was passed in
 *   phrases - texts given in double quotes and runs of asian characters,
 *             to be searched as exact phrases
 *   and     - tokens that have to be found (includes the tokens of all
 *             quoted phrases)
 *   not     - tokens of words prefixed with '-'
 *
 * Tokens are created by idx_tokenizer() using the stopword list of the
 * configured language, if there is one.
 *
 * @todo support OR and parentheses?
 * @todo add namespace handling
 *
 * @param string $query the search query
 * @return array the parsed query as described above
 */
function ft_queryParser($query){
    global $conf;
    $swfile    = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    $stopwords = array();
    if(@file_exists($swfile)){
        $stopwords = file($swfile);
        // file() may still fail, e.g. on missing read permission
        if(!is_array($stopwords)) $stopwords = array();
    }

    $q = array();
    $q['query']   = $query;
    $q['phrases'] = array();
    $q['and']     = array();
    $q['not']     = array();

    // handle phrase searches
    while(preg_match('/"(.*?)"/',$query,$match)){
        $q['phrases'][] = $match[1];
        // append - the tokens of earlier phrases have to be kept
        $q['and'] = array_merge($q['and'],idx_tokenizer($match[0],$stopwords));
        $query = preg_replace('/"(.*?)"/','',$query,1);
    }

    $words = explode(' ',$query);
    foreach($words as $w){
        // repeated blanks and removed phrases leave empty words behind
        if($w === '') continue;

        if($w[0] == '-'){
            $token = idx_tokenizer($w,$stopwords,true);
            if(count($token)) $q['not'] = array_merge($q['not'],$token);
        }else{
            // asian "words" need to be searched as phrases
            if(@preg_match_all('/('.IDX_ASIAN.'+)/u',$w,$matches)){
                $q['phrases'] = array_merge($q['phrases'],$matches[1]);
            }
            $token = idx_tokenizer($w,$stopwords,true);
            if(count($token)) $q['and'] = array_merge($q['and'],$token);
        }
    }

    return $q;
}
286
287//Setup VIM: ex: et ts=4 enc=utf-8 :
288