xref: /dokuwiki/inc/fulltext.php (revision f5eb7cf010ced7faf2c4e09cbc3ddaeff6b0f694)
1*f5eb7cf0SAndreas Gohr<?php
2*f5eb7cf0SAndreas Gohr/**
3*f5eb7cf0SAndreas Gohr * DokuWiki fulltextsearch functions using the index
4*f5eb7cf0SAndreas Gohr *
5*f5eb7cf0SAndreas Gohr * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
6*f5eb7cf0SAndreas Gohr * @author     Andreas Gohr <andi@splitbrain.org>
7*f5eb7cf0SAndreas Gohr */
8*f5eb7cf0SAndreas Gohr
9*f5eb7cf0SAndreas Gohr  if(!defined('DOKU_INC')) define('DOKU_INC',realpath(dirname(__FILE__).'/../').'/');
10*f5eb7cf0SAndreas Gohr  require_once(DOKU_INC.'inc/indexer.php');
11*f5eb7cf0SAndreas Gohr
12*f5eb7cf0SAndreas Gohr
/**
 * The fulltext search
 *
 * Returns a list of matching documents for the given query
 *
 * The query is parsed with ft_queryParser() and all AND and NOT words
 * are looked up through idx_lookup(). A document is returned when it
 * was found for every AND word, was not found for any NOT word and,
 * if the query contained quoted phrases, its raw wiki text contains at
 * least one of those phrases (compared in lower case).
 *
 * @param  string $query the search query
 * @return array  document id => summed score of all AND words, sorted
 *                by score (highest first); an empty array when nothing
 *                matches
 */
function ft_pageSearch($query){
    $q = ft_queryParser($query);

    // without at least one positive word there is nothing to start from
    if(!count($q['and'])) return array();

    // lookup all words found in the query
    // (the parser stores phrases as plain strings and adds their words
    // to the AND list, so there is no separate word list per phrase)
    $words  = array_merge($q['and'],$q['not']);
    $result = idx_lookup($words);

    // merge search results with query; a word without any lookup
    // result counts as "found in no document"
    $and = array();
    foreach($q['and'] as $w){
        if(isset($result[$w]) && is_array($result[$w])){
            $and[] = $result[$w];
        }else{
            $and[] = array();
        }
    }

    // create a list of unwanted docs
    $not = array();
    foreach($q['not'] as $w){
        if(isset($result[$w]) && is_array($result[$w])){
            $not = array_merge($not,array_keys($result[$w]));
        }
    }

    // combine and words
    if(count($and) > 1){
        $docs = ft_resultCombine($and);
    }else{
        $docs = $and[0];
    }
    if(!count($docs)) return array();

    // remove negative matches
    foreach($not as $n){
        unset($docs[$n]);
    }

    if(!count($docs)) return array();

    // handle phrases
    $phrases = array();
    foreach($q['phrases'] as $phrase){
        if(!is_string($phrase)) continue;
        // the parser may hand over the phrase including its quotes
        $phrase = trim($phrase,'"');
        if($phrase === '') continue; // an empty phrase would match everything
        // quote for use between '/' delimiters
        $phrases[] = preg_quote(utf8_strtolower($phrase),'/');
    }
    if(count($phrases)){
        //build a regexp
        $regex = '('.join('|',$phrases).')';

        // check the source of all documents for the exact phrases
        foreach(array_keys($docs) as $id){
            $text  = utf8_strtolower(rawWiki($id));
            // one hit is enough, no need to collect all matches
            if(!preg_match('/'.$regex.'/usi',$text)){
                unset($docs[$id]); // no hit - remove
            }
        }
    }

    if(!count($docs)) return array();

    // if there are any hits left, sort them by count
    arsort($docs);

    return $docs;
}
79*f5eb7cf0SAndreas Gohr
/**
 * Combine found documents and sum up their scores
 *
 * This function is used to combine searched words with a logical
 * AND. Only documents available in all arrays are returned. The score
 * of a returned document is the sum of its values in all arrays and
 * the result keeps the document order of the first array.
 *
 * When only a single page array is given, it is returned unchanged.
 *
 * @param  array $args An array of page arrays (document => score)
 * @return array       document => summed score
 */
function ft_resultCombine($args){
    // re-index so the first page array is reliably the first element
    $args = array_values($args);
    if(!count($args)) return array();

    $first = array_shift($args);
    if(!is_array($first)) return array();

    $result = array();
    foreach ($first as $key => $value) {
        $sum = $value;
        foreach ($args as $other) {
            if (!is_array($other) || !array_key_exists($key, $other)) {
                // missing in at least one array - not an AND match
                continue 2;
            }
            $sum += $other[$key];
        }
        $result[$key] = $sum;
    }
    return $result;
}
105*f5eb7cf0SAndreas Gohr
/**
 * Builds an array of search words from a query
 *
 * Parts of the query enclosed in double quotes are collected as
 * phrases (without the quotes) and their words are added to the AND
 * words. The rest of the query is split at spaces: words starting
 * with a minus sign become NOT words, all other words AND words.
 * All words are passed through idx_tokenizer() together with the
 * stopword list of the configured language, if such a list exists.
 *
 * @todo support OR and parenthesises?
 *
 * @param  string $query the search query
 * @return array  with the keys 'query' (the unmodified query),
 *                'phrases', 'and' and 'not'
 */
function ft_queryParser($query){
    global $conf;
    $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
    if(@file_exists($swfile)){
        $stopwords = file($swfile);
    }else{
        $stopwords = array();
    }

    $q = array();
    $q['query']   = $query;
    $q['phrases'] = array();
    $q['and']     = array();
    $q['not']     = array();

    // handle phrase searches
    while(preg_match('/"(.*?)"/',$query,$match)){
        // an empty pair of quotes carries no phrase
        if($match[1] !== ''){
            $q['phrases'][] = $match[1];
            // append to the words found so far instead of replacing them
            $q['and'] = array_merge($q['and'],idx_tokenizer($match[1],$stopwords));
        }
        // cut the handled phrase out of the query
        $query = preg_replace('/"(.*?)"/','',$query,1);
    }

    $words = explode(' ',$query);
    foreach($words as $w){
        // repeated, leading or trailing spaces produce empty words
        if($w === '') continue;

        $token = idx_tokenizer($w,$stopwords);
        if(!count($token)) continue;

        if(substr($w,0,1) == '-'){
            $q['not'] = array_merge($q['not'],$token);
        }else{
            $q['and'] = array_merge($q['and'],$token);
        }
    }

    return $q;
}
146*f5eb7cf0SAndreas Gohr
147*f5eb7cf0SAndreas Gohr
148