xref: /dokuwiki/inc/fulltext.php (revision 5397cb9776dac56cc5b050b86d6fb9bfcb63dc6d)
1<?php
2/**
3 * DokuWiki fulltextsearch functions using the index
4 *
5 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6 * @author     Andreas Gohr <andi@splitbrain.org>
7 */
8
9  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
10  require_once(DOKU_INC.'inc/indexer.php');
11
12
/**
 * The fulltext search
 *
 * Returns a list of matching documents for the given query.
 *
 * All positive words of the query have to be found in a document (logical
 * AND), documents containing any of the negated words are dropped and, if
 * the query contained quoted phrases, the raw wiki text of each remaining
 * document has to contain at least one of the phrases.
 *
 * @param  string $query the search query as entered by the user
 * @return array  the matching documents (document id => score) sorted by
 *                score, highest first; empty when nothing matched
 */
function ft_pageSearch($query){
    $q = ft_queryParser($query);

    // lookup all words found in the query. Phrases are plain strings, their
    // words are added to $q['and'] by the parser already
    $words  = array_merge($q['and'],$q['not']);
    if(!count($words)) return array();
    // NOTE(review): idx_lookup() is assumed to return
    // word => array(document id => score) - confirm in inc/indexer.php
    $result = idx_lookup($words);

    // merge search results with query - a word without any index entry
    // simply matches no document
    $and = array();
    foreach($q['and'] as $w){
        $and[] = (isset($result[$w]) && is_array($result[$w])) ? $result[$w] : array();
    }

    // create a list of unwanted docs
    $not = array();
    foreach($q['not'] as $w){
        if(isset($result[$w]) && is_array($result[$w])){
            $not = array_merge($not,array_keys($result[$w]));
        }
    }

    // combine and words
    if(count($and) > 1){
        $docs = ft_resultCombine($and);
    }elseif(count($and) == 1){
        $docs = $and[0];
    }else{
        // only negated words were given - there is nothing to exclude from
        return array();
    }
    if(!count($docs)) return array();

    // remove negative matches
    foreach($not as $n){
        unset($docs[$n]);
    }

    if(!count($docs)) return array();

    // handle phrases
    $phrases = array();
    foreach($q['phrases'] as $phrase){
        // the parser may hand over the phrase including its double quotes
        $phrase = trim($phrase,'"');
        // an empty alternative would make the regexp match every document
        if(trim($phrase) === '') continue;
        // quote with the delimiter, a slash in the phrase must not end the regexp
        $phrases[] = preg_quote(utf8_strtolower($phrase),'/');
    }

    if(count($phrases)){
        //build a regexp
        $regex = '/('.join('|',$phrases).')/usi';

        // check the source of all documents for the exact phrases
        foreach(array_keys($docs) as $id){
            $text  = utf8_strtolower(rawWiki($id));
            if(!preg_match($regex,$text)){
                unset($docs[$id]); // no hit - remove
            }
        }
    }

    if(!count($docs)) return array();

    // if there are any hits left, sort them by count
    arsort($docs);

    return $docs;
}
79
/**
 * Combine found documents and sum up their scores
 *
 * This function is used to combine searched words with a logical
 * AND. Only documents available in all arrays are returned, the score
 * of a returned document is the sum of its scores in all arrays.
 *
 * The documents are returned in the order of the first array. A single
 * array is returned unchanged, no arrays at all result in an empty array.
 *
 * @param  array $args An array of page arrays (document id => score)
 * @return array       document id => summed up score
 */
function ft_resultCombine($args){
    // re-index to be sure the first page array is found at offset 0
    $args = array_values($args);
    $array_count = count($args);
    if($array_count == 0 || !is_array($args[0])) return array();

    $result = array();
    foreach ($args[0] as $key => $score) {
        for ($i = 1; $i < $array_count; $i++) {
            // missing in one of the arrays: not an AND match, skip the document
            if(!is_array($args[$i]) || !array_key_exists($key,$args[$i])) continue 2;
            $score += $args[$i][$key];
        }
        $result[$key] = $score;
    }
    return $result;
}
105
/**
 * Builds an array of search words from a query
 *
 * Quoted parts of the query are treated as phrases, words prefixed with a
 * minus are negated and everything else has to be found. All words are
 * passed through idx_tokenizer() together with the stopword list of the
 * configured language.
 *
 * @todo support OR and parentheses?
 *
 * @param  string $query the search query as entered by the user
 * @return array  with the keys
 *                'query'   - the unmodified query
 *                'phrases' - the quoted phrases without their quotes
 *                'and'     - tokens that have to be found (including the
 *                            tokens of all phrases)
 *                'not'     - tokens that must not be found
 */
function ft_queryParser($query){
    global $conf;
    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        $stopwords = file($swfile);
        // file() returns false if the file could not be read
        if(!is_array($stopwords)) $stopwords = array();
    }else{
        $stopwords = array();
    }

    $q = array();
    $q['query']   = $query;
    $q['phrases'] = array();
    $q['and']     = array();
    $q['not']     = array();

    // handle phrase searches
    while(preg_match('/"(.*?)"/',$query,$match)){
        // ignore empty phrases like "" - they would match everything
        if(trim($match[1]) !== ''){
            // store the phrase without the surrounding quotes
            $q['phrases'][] = $match[1];
            // add to the words found so far instead of overwriting them
            $q['and'] = array_merge($q['and'],idx_tokenizer($match[0],$stopwords));
        }
        // remove the handled phrase from the query
        $query = preg_replace('/"(.*?)"/','',$query,1);
    }

    $words = explode(' ',$query);
    foreach($words as $w){
        // consecutive spaces result in empty strings
        if($w === '') continue;

        $token = idx_tokenizer($w,$stopwords);
        if(!count($token)) continue;

        if($w[0] === '-'){
            $q['not'] = array_merge($q['not'],$token);
        }else{
            $q['and'] = array_merge($q['and'],$token);
        }
    }

    return $q;
}
146
147
148