xref: /dokuwiki/inc/Search/FulltextSearch.php (revision 6b6becabab901637e0541d9f2254898b188f156e)
1173bfbcaSSatoshi Sahara<?php
2173bfbcaSSatoshi Saharanamespace dokuwiki\Search;
3173bfbcaSSatoshi Sahara
4173bfbcaSSatoshi Saharause dokuwiki\Extension\Event;
5c31af4f3SSatoshi Saharause dokuwiki\Search\Indexer;
63837ea91SSatoshi Saharause dokuwiki\Search\QueryParser;
7c31af4f3SSatoshi Saharause dokuwiki\Utf8;
8173bfbcaSSatoshi Sahara
9173bfbcaSSatoshi Sahara/**
10173bfbcaSSatoshi Sahara * Class DokuWiki Fulltext Search
11173bfbcaSSatoshi Sahara *
12173bfbcaSSatoshi Sahara * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
13173bfbcaSSatoshi Sahara * @author     Andreas Gohr <andi@splitbrain.org>
14173bfbcaSSatoshi Sahara */
15173bfbcaSSatoshi Saharaclass FulltextSearch
16173bfbcaSSatoshi Sahara{
17173bfbcaSSatoshi Sahara    /**
18173bfbcaSSatoshi Sahara     *  Fulltext Search constructor. prevent direct object creation
19173bfbcaSSatoshi Sahara     */
20173bfbcaSSatoshi Sahara    protected function __construct() {}
21173bfbcaSSatoshi Sahara
22173bfbcaSSatoshi Sahara    /**
23173bfbcaSSatoshi Sahara     * The fulltext search
24173bfbcaSSatoshi Sahara     *
25173bfbcaSSatoshi Sahara     * Returns a list of matching documents for the given query
26173bfbcaSSatoshi Sahara     *
27173bfbcaSSatoshi Sahara     * refactored into ft_pageSearch(), _ft_pageSearch() and trigger_event()
28173bfbcaSSatoshi Sahara     *
29173bfbcaSSatoshi Sahara     * @param string     $query
30173bfbcaSSatoshi Sahara     * @param array      $highlight
31173bfbcaSSatoshi Sahara     * @param string     $sort
32173bfbcaSSatoshi Sahara     * @param int|string $after  only show results with mtime after this date,
33173bfbcaSSatoshi Sahara     *                           accepts timestap or strtotime arguments
34173bfbcaSSatoshi Sahara     * @param int|string $before only show results with mtime before this date,
35173bfbcaSSatoshi Sahara     *                           accepts timestap or strtotime arguments
36173bfbcaSSatoshi Sahara     *
37173bfbcaSSatoshi Sahara     * @return array
38173bfbcaSSatoshi Sahara     */
39173bfbcaSSatoshi Sahara    public static function pageSearch($query, &$highlight, $sort = null, $after = null, $before = null)
40173bfbcaSSatoshi Sahara    {
41173bfbcaSSatoshi Sahara        if ($sort === null) {
42173bfbcaSSatoshi Sahara            $sort = 'hits';
43173bfbcaSSatoshi Sahara        }
44173bfbcaSSatoshi Sahara        $data = [
45173bfbcaSSatoshi Sahara            'query' => $query,
46173bfbcaSSatoshi Sahara            'sort' => $sort,
47173bfbcaSSatoshi Sahara            'after' => $after,
48173bfbcaSSatoshi Sahara            'before' => $before
49173bfbcaSSatoshi Sahara        ];
50173bfbcaSSatoshi Sahara        $data['highlight'] =& $highlight;
51173bfbcaSSatoshi Sahara        $action = static::class.'::callback_pageSearch';
52173bfbcaSSatoshi Sahara        return Event::createAndTrigger('SEARCH_QUERY_FULLPAGE', $data, $action);
53173bfbcaSSatoshi Sahara    }
54173bfbcaSSatoshi Sahara
55173bfbcaSSatoshi Sahara    /**
56173bfbcaSSatoshi Sahara     * Returns a list of matching documents for the given query
57173bfbcaSSatoshi Sahara     *
58173bfbcaSSatoshi Sahara     * @author Andreas Gohr <andi@splitbrain.org>
59173bfbcaSSatoshi Sahara     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
60173bfbcaSSatoshi Sahara     *
61173bfbcaSSatoshi Sahara     * @param array $data  event data
62173bfbcaSSatoshi Sahara     * @return array       matching documents
63173bfbcaSSatoshi Sahara     */
64*6b6becabSSatoshi Sahara    public static function callback_pageSearch($data)
65173bfbcaSSatoshi Sahara    {
66c31af4f3SSatoshi Sahara        $Indexer = Indexer::getInstance();
67173bfbcaSSatoshi Sahara
68173bfbcaSSatoshi Sahara        // parse the given query
693837ea91SSatoshi Sahara        $q = QueryParser::convert($data['query']);
70173bfbcaSSatoshi Sahara        $data['highlight'] = $q['highlight'];
71173bfbcaSSatoshi Sahara
72173bfbcaSSatoshi Sahara        if (empty($q['parsed_ary'])) return array();
73173bfbcaSSatoshi Sahara
74173bfbcaSSatoshi Sahara        // lookup all words found in the query
75173bfbcaSSatoshi Sahara        $lookup = $Indexer->lookup($q['words']);
76173bfbcaSSatoshi Sahara
77173bfbcaSSatoshi Sahara        // get all pages in this dokuwiki site (!: includes nonexistent pages)
78173bfbcaSSatoshi Sahara        $pages_all = array();
79173bfbcaSSatoshi Sahara        foreach ($Indexer->getPages() as $id) {
80173bfbcaSSatoshi Sahara            $pages_all[$id] = 0; // base: 0 hit
81173bfbcaSSatoshi Sahara        }
82173bfbcaSSatoshi Sahara
83173bfbcaSSatoshi Sahara        // process the query
84173bfbcaSSatoshi Sahara        $stack = array();
85173bfbcaSSatoshi Sahara        foreach ($q['parsed_ary'] as $token) {
86173bfbcaSSatoshi Sahara            switch (substr($token, 0, 3)) {
87173bfbcaSSatoshi Sahara                case 'W+:':
88173bfbcaSSatoshi Sahara                case 'W-:':
89173bfbcaSSatoshi Sahara                case 'W_:': // word
90173bfbcaSSatoshi Sahara                    $word    = substr($token, 3);
91173bfbcaSSatoshi Sahara                    $stack[] = (array) $lookup[$word];
92173bfbcaSSatoshi Sahara                    break;
93173bfbcaSSatoshi Sahara                case 'P+:':
94173bfbcaSSatoshi Sahara                case 'P-:': // phrase
95173bfbcaSSatoshi Sahara                    $phrase = substr($token, 3);
96173bfbcaSSatoshi Sahara                    // since phrases are always parsed as ((W1)(W2)...(P)),
97173bfbcaSSatoshi Sahara                    // the end($stack) always points the pages that contain
98173bfbcaSSatoshi Sahara                    // all words in this phrase
99173bfbcaSSatoshi Sahara                    $pages  = end($stack);
100173bfbcaSSatoshi Sahara                    $pages_matched = array();
101173bfbcaSSatoshi Sahara                    foreach (array_keys($pages) as $id) {
102173bfbcaSSatoshi Sahara                        $evdata = array(
103173bfbcaSSatoshi Sahara                            'id' => $id,
104173bfbcaSSatoshi Sahara                            'phrase' => $phrase,
105173bfbcaSSatoshi Sahara                            'text' => rawWiki($id)
106173bfbcaSSatoshi Sahara                        );
107173bfbcaSSatoshi Sahara                        $evt = new Event('FULLTEXT_PHRASE_MATCH', $evdata);
108173bfbcaSSatoshi Sahara                        if ($evt->advise_before() && $evt->result !== true) {
109c31af4f3SSatoshi Sahara                            $text = Utf8\PhpString::strtolower($evdata['text']);
110173bfbcaSSatoshi Sahara                            if (strpos($text, $phrase) !== false) {
111173bfbcaSSatoshi Sahara                                $evt->result = true;
112173bfbcaSSatoshi Sahara                            }
113173bfbcaSSatoshi Sahara                        }
114173bfbcaSSatoshi Sahara                        $evt->advise_after();
115173bfbcaSSatoshi Sahara                        if ($evt->result === true) {
116173bfbcaSSatoshi Sahara                            $pages_matched[$id] = 0; // phrase: always 0 hit
117173bfbcaSSatoshi Sahara                        }
118173bfbcaSSatoshi Sahara                    }
119173bfbcaSSatoshi Sahara                    $stack[] = $pages_matched;
120173bfbcaSSatoshi Sahara                    break;
121173bfbcaSSatoshi Sahara                case 'N+:':
122173bfbcaSSatoshi Sahara                case 'N-:': // namespace
123173bfbcaSSatoshi Sahara                    $ns = cleanID(substr($token, 3)) . ':';
124173bfbcaSSatoshi Sahara                    $pages_matched = array();
125173bfbcaSSatoshi Sahara                    foreach (array_keys($pages_all) as $id) {
126173bfbcaSSatoshi Sahara                        if (strpos($id, $ns) === 0) {
127173bfbcaSSatoshi Sahara                            $pages_matched[$id] = 0; // namespace: always 0 hit
128173bfbcaSSatoshi Sahara                        }
129173bfbcaSSatoshi Sahara                    }
130173bfbcaSSatoshi Sahara                    $stack[] = $pages_matched;
131173bfbcaSSatoshi Sahara                    break;
132173bfbcaSSatoshi Sahara                case 'AND': // and operation
133173bfbcaSSatoshi Sahara                    list($pages1, $pages2) = array_splice($stack, -2);
134173bfbcaSSatoshi Sahara                    $stack[] = static::resultCombine(array($pages1, $pages2));
135173bfbcaSSatoshi Sahara                    break;
136173bfbcaSSatoshi Sahara                case 'OR':  // or operation
137173bfbcaSSatoshi Sahara                    list($pages1, $pages2) = array_splice($stack, -2);
138173bfbcaSSatoshi Sahara                    $stack[] = static::resultUnite(array($pages1, $pages2));
139173bfbcaSSatoshi Sahara                    break;
140173bfbcaSSatoshi Sahara                case 'NOT': // not operation (unary)
141173bfbcaSSatoshi Sahara                    $pages   = array_pop($stack);
142173bfbcaSSatoshi Sahara                    $stack[] = static::resultComplement(array($pages_all, $pages));
143173bfbcaSSatoshi Sahara                    break;
144173bfbcaSSatoshi Sahara            }
145173bfbcaSSatoshi Sahara        }
146173bfbcaSSatoshi Sahara        $docs = array_pop($stack);
147173bfbcaSSatoshi Sahara
148173bfbcaSSatoshi Sahara        if (empty($docs)) return array();
149173bfbcaSSatoshi Sahara
150173bfbcaSSatoshi Sahara        // check: settings, acls, existence
151173bfbcaSSatoshi Sahara        foreach (array_keys($docs) as $id) {
152173bfbcaSSatoshi Sahara            if (isHiddenPage($id)
153173bfbcaSSatoshi Sahara                || auth_quickaclcheck($id) < AUTH_READ
154173bfbcaSSatoshi Sahara                || !page_exists($id, '', false)
155173bfbcaSSatoshi Sahara            ) {
156173bfbcaSSatoshi Sahara                unset($docs[$id]);
157173bfbcaSSatoshi Sahara            }
158173bfbcaSSatoshi Sahara        }
159173bfbcaSSatoshi Sahara
160173bfbcaSSatoshi Sahara        $docs = static::filterResultsByTime($docs, $data['after'], $data['before']);
161173bfbcaSSatoshi Sahara
162173bfbcaSSatoshi Sahara        if ($data['sort'] === 'mtime') {
163173bfbcaSSatoshi Sahara            uksort($docs, static::class.'::pagemtimesorter');
164173bfbcaSSatoshi Sahara        } else {
165173bfbcaSSatoshi Sahara            // sort docs by count
166173bfbcaSSatoshi Sahara            arsort($docs);
167173bfbcaSSatoshi Sahara        }
168173bfbcaSSatoshi Sahara
169173bfbcaSSatoshi Sahara        return $docs;
170173bfbcaSSatoshi Sahara    }
171173bfbcaSSatoshi Sahara
172173bfbcaSSatoshi Sahara    /**
173173bfbcaSSatoshi Sahara     * @param array      $results search results in the form pageid => value
174173bfbcaSSatoshi Sahara     * @param int|string $after   only returns results with mtime after this date,
175173bfbcaSSatoshi Sahara     *                            accepts timestap or strtotime arguments
176173bfbcaSSatoshi Sahara     * @param int|string $before  only returns results with mtime after this date,
177173bfbcaSSatoshi Sahara     *                            accepts timestap or strtotime arguments
178173bfbcaSSatoshi Sahara     *
179173bfbcaSSatoshi Sahara     * @return array
180173bfbcaSSatoshi Sahara     */
181173bfbcaSSatoshi Sahara    protected static function filterResultsByTime(array $results, $after, $before)
182173bfbcaSSatoshi Sahara    {
183173bfbcaSSatoshi Sahara        if ($after || $before) {
184173bfbcaSSatoshi Sahara            $after = is_int($after) ? $after : strtotime($after);
185173bfbcaSSatoshi Sahara            $before = is_int($before) ? $before : strtotime($before);
186173bfbcaSSatoshi Sahara
187173bfbcaSSatoshi Sahara            foreach ($results as $id => $value) {
188173bfbcaSSatoshi Sahara                $mTime = filemtime(wikiFN($id));
189173bfbcaSSatoshi Sahara                if ($after && $after > $mTime) {
190173bfbcaSSatoshi Sahara                    unset($results[$id]);
191173bfbcaSSatoshi Sahara                    continue;
192173bfbcaSSatoshi Sahara                }
193173bfbcaSSatoshi Sahara                if ($before && $before < $mTime) {
194173bfbcaSSatoshi Sahara                    unset($results[$id]);
195173bfbcaSSatoshi Sahara                }
196173bfbcaSSatoshi Sahara            }
197173bfbcaSSatoshi Sahara        }
198173bfbcaSSatoshi Sahara        return $results;
199173bfbcaSSatoshi Sahara    }
200173bfbcaSSatoshi Sahara
201173bfbcaSSatoshi Sahara    /**
202173bfbcaSSatoshi Sahara     * Sort pages by their mtime, from newest to oldest
203173bfbcaSSatoshi Sahara     *
204173bfbcaSSatoshi Sahara     * @param string $a
205173bfbcaSSatoshi Sahara     * @param string $b
206173bfbcaSSatoshi Sahara     *
207173bfbcaSSatoshi Sahara     * @return int Returns < 0 if $a is newer than $b, > 0 if $b is newer than $a
208173bfbcaSSatoshi Sahara     *             and 0 if they are of the same age
209173bfbcaSSatoshi Sahara     */
210173bfbcaSSatoshi Sahara    protected static function pagemtimesorter($a, $b)
211173bfbcaSSatoshi Sahara    {
212173bfbcaSSatoshi Sahara        $mtimeA = filemtime(wikiFN($a));
213173bfbcaSSatoshi Sahara        $mtimeB = filemtime(wikiFN($b));
214173bfbcaSSatoshi Sahara        return $mtimeB - $mtimeA;
215173bfbcaSSatoshi Sahara    }
216173bfbcaSSatoshi Sahara
217173bfbcaSSatoshi Sahara    /**
218173bfbcaSSatoshi Sahara     * Creates a snippet extract
219173bfbcaSSatoshi Sahara     *
220173bfbcaSSatoshi Sahara     * @author Andreas Gohr <andi@splitbrain.org>
221173bfbcaSSatoshi Sahara     * @triggers FULLTEXT_SNIPPET_CREATE
222173bfbcaSSatoshi Sahara     *
223173bfbcaSSatoshi Sahara     * @param string $id page id
224173bfbcaSSatoshi Sahara     * @param array $highlight
225173bfbcaSSatoshi Sahara     * @return mixed
226173bfbcaSSatoshi Sahara     */
227173bfbcaSSatoshi Sahara    public static function snippet($id, $highlight)
228173bfbcaSSatoshi Sahara    {
229173bfbcaSSatoshi Sahara        $text = rawWiki($id);
230173bfbcaSSatoshi Sahara        $text = str_replace("\xC2\xAD",'',$text); // remove soft-hyphens
231173bfbcaSSatoshi Sahara        $evdata = array(
232173bfbcaSSatoshi Sahara            'id'        => $id,
233173bfbcaSSatoshi Sahara            'text'      => &$text,
234173bfbcaSSatoshi Sahara            'highlight' => &$highlight,
235173bfbcaSSatoshi Sahara            'snippet'   => '',
236173bfbcaSSatoshi Sahara        );
237173bfbcaSSatoshi Sahara
238173bfbcaSSatoshi Sahara        $evt = new Event('FULLTEXT_SNIPPET_CREATE', $evdata);
239173bfbcaSSatoshi Sahara        if ($evt->advise_before()) {
240173bfbcaSSatoshi Sahara            $match = array();
241173bfbcaSSatoshi Sahara            $snippets = array();
242173bfbcaSSatoshi Sahara            $utf8_offset = $offset = $end = 0;
2430a3e25f4SSatoshi Sahara            $len = Utf8\PhpString::strlen($text);
244173bfbcaSSatoshi Sahara
245173bfbcaSSatoshi Sahara            // build a regexp from the phrases to highlight
246173bfbcaSSatoshi Sahara            $re1 = '(' .
247173bfbcaSSatoshi Sahara                join(
248173bfbcaSSatoshi Sahara                    '|',
249173bfbcaSSatoshi Sahara                    array_map(
250173bfbcaSSatoshi Sahara                        static::class.'::snippet_re_preprocess',
251173bfbcaSSatoshi Sahara                        array_map(
252173bfbcaSSatoshi Sahara                            'preg_quote_cb',
253173bfbcaSSatoshi Sahara                            array_filter((array) $highlight)
254173bfbcaSSatoshi Sahara                        )
255173bfbcaSSatoshi Sahara                    )
256173bfbcaSSatoshi Sahara                ) .
257173bfbcaSSatoshi Sahara                ')';
258173bfbcaSSatoshi Sahara            $re2 = "$re1.{0,75}(?!\\1)$re1";
259173bfbcaSSatoshi Sahara            $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1";
260173bfbcaSSatoshi Sahara
261173bfbcaSSatoshi Sahara            for ($cnt=4; $cnt--;) {
262173bfbcaSSatoshi Sahara                if (0) {
263173bfbcaSSatoshi Sahara                } elseif (preg_match('/'.$re3.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
264173bfbcaSSatoshi Sahara                } elseif (preg_match('/'.$re2.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
265173bfbcaSSatoshi Sahara                } elseif (preg_match('/'.$re1.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
266173bfbcaSSatoshi Sahara                } else {
267173bfbcaSSatoshi Sahara                    break;
268173bfbcaSSatoshi Sahara                }
269173bfbcaSSatoshi Sahara
270173bfbcaSSatoshi Sahara                list($str, $idx) = $match[0];
271173bfbcaSSatoshi Sahara
272173bfbcaSSatoshi Sahara                // convert $idx (a byte offset) into a utf8 character offset
2730a3e25f4SSatoshi Sahara                $utf8_idx = Utf8\PhpString::strlen(substr($text, 0, $idx));
2740a3e25f4SSatoshi Sahara                $utf8_len = Utf8\PhpString::strlen($str);
275173bfbcaSSatoshi Sahara
276173bfbcaSSatoshi Sahara                // establish context, 100 bytes surrounding the match string
277173bfbcaSSatoshi Sahara                // first look to see if we can go 100 either side,
278173bfbcaSSatoshi Sahara                // then drop to 50 adding any excess if the other side can't go to 50,
279173bfbcaSSatoshi Sahara                $pre = min($utf8_idx - $utf8_offset, 100);
280173bfbcaSSatoshi Sahara                $post = min($len - $utf8_idx - $utf8_len, 100);
281173bfbcaSSatoshi Sahara
282173bfbcaSSatoshi Sahara                if ($pre > 50 && $post > 50) {
283173bfbcaSSatoshi Sahara                    $pre = $post = 50;
284173bfbcaSSatoshi Sahara                } elseif ($pre > 50) {
285173bfbcaSSatoshi Sahara                    $pre = min($pre, 100 - $post);
286173bfbcaSSatoshi Sahara                } elseif ($post > 50) {
287173bfbcaSSatoshi Sahara                    $post = min($post, 100 - $pre);
288173bfbcaSSatoshi Sahara                } elseif ($offset == 0) {
289173bfbcaSSatoshi Sahara                    // both are less than 50, means the context is the whole string
290173bfbcaSSatoshi Sahara                    // make it so and break out of this loop - there is no need for the
291173bfbcaSSatoshi Sahara                    // complex snippet calculations
292173bfbcaSSatoshi Sahara                    $snippets = array($text);
293173bfbcaSSatoshi Sahara                    break;
294173bfbcaSSatoshi Sahara                }
295173bfbcaSSatoshi Sahara
296173bfbcaSSatoshi Sahara                // establish context start and end points, try to append to previous
297173bfbcaSSatoshi Sahara                // context if possible
298173bfbcaSSatoshi Sahara                $start = $utf8_idx - $pre;
299173bfbcaSSatoshi Sahara                $append = ($start < $end) ? $end : false;  // still the end of the previous context snippet
300173bfbcaSSatoshi Sahara                $end = $utf8_idx + $utf8_len + $post;      // now set it to the end of this context
301173bfbcaSSatoshi Sahara
302173bfbcaSSatoshi Sahara                if ($append) {
3030a3e25f4SSatoshi Sahara                    $snippets[count($snippets)-1] .= Utf8\PhpString::substr($text, $append, $end-$append);
304173bfbcaSSatoshi Sahara                } else {
3050a3e25f4SSatoshi Sahara                    $snippets[] = Utf8\PhpString::substr($text, $start, $end-$start);
306173bfbcaSSatoshi Sahara                }
307173bfbcaSSatoshi Sahara
308173bfbcaSSatoshi Sahara                // set $offset for next match attempt
309173bfbcaSSatoshi Sahara                // continue matching after the current match
310173bfbcaSSatoshi Sahara                // if the current match is not the longest possible match starting at the current offset
311173bfbcaSSatoshi Sahara                // this prevents further matching of this snippet but for possible matches of length
312173bfbcaSSatoshi Sahara                // smaller than match length + context (at least 50 characters) this match is part of the context
313173bfbcaSSatoshi Sahara                $utf8_offset = $utf8_idx + $utf8_len;
3140a3e25f4SSatoshi Sahara                $offset = $idx + strlen(Utf8\PhpString::substr($text, $utf8_idx, $utf8_len));
3150a3e25f4SSatoshi Sahara                $offset = Utf8\Clean::correctIdx($text, $offset);
316173bfbcaSSatoshi Sahara            }
317173bfbcaSSatoshi Sahara
318173bfbcaSSatoshi Sahara            $m = "\1";
319173bfbcaSSatoshi Sahara            $snippets = preg_replace('/'.$re1.'/iu', $m.'$1'.$m, $snippets);
320173bfbcaSSatoshi Sahara            $snippet = preg_replace(
321173bfbcaSSatoshi Sahara                '/' . $m . '([^' . $m . ']*?)' . $m . '/iu',
322173bfbcaSSatoshi Sahara                '<strong class="search_hit">$1</strong>',
323173bfbcaSSatoshi Sahara                hsc(join('... ', $snippets))
324173bfbcaSSatoshi Sahara            );
325173bfbcaSSatoshi Sahara
326173bfbcaSSatoshi Sahara            $evdata['snippet'] = $snippet;
327173bfbcaSSatoshi Sahara        }
328173bfbcaSSatoshi Sahara        $evt->advise_after();
329173bfbcaSSatoshi Sahara        unset($evt);
330173bfbcaSSatoshi Sahara
331173bfbcaSSatoshi Sahara        return $evdata['snippet'];
332173bfbcaSSatoshi Sahara    }
333173bfbcaSSatoshi Sahara
334173bfbcaSSatoshi Sahara    /**
335173bfbcaSSatoshi Sahara     * Wraps a search term in regex boundary checks.
336173bfbcaSSatoshi Sahara     *
337173bfbcaSSatoshi Sahara     * @param string $term
338173bfbcaSSatoshi Sahara     * @return string
339173bfbcaSSatoshi Sahara     */
340173bfbcaSSatoshi Sahara    public static function snippet_re_preprocess($term)
341173bfbcaSSatoshi Sahara    {
342173bfbcaSSatoshi Sahara        // do not process asian terms where word boundaries are not explicit
3430a3e25f4SSatoshi Sahara        if (Utf8\Asian::isAsianWords($term)) return $term;
344173bfbcaSSatoshi Sahara
345173bfbcaSSatoshi Sahara        if (UTF8_PROPERTYSUPPORT) {
346173bfbcaSSatoshi Sahara            // unicode word boundaries
347173bfbcaSSatoshi Sahara            // see http://stackoverflow.com/a/2449017/172068
348173bfbcaSSatoshi Sahara            $BL = '(?<!\pL)';
349173bfbcaSSatoshi Sahara            $BR = '(?!\pL)';
350173bfbcaSSatoshi Sahara        } else {
351173bfbcaSSatoshi Sahara            // not as correct as above, but at least won't break
352173bfbcaSSatoshi Sahara            $BL = '\b';
353173bfbcaSSatoshi Sahara            $BR = '\b';
354173bfbcaSSatoshi Sahara        }
355173bfbcaSSatoshi Sahara
356173bfbcaSSatoshi Sahara        if (substr($term, 0, 2) == '\\*') {
357173bfbcaSSatoshi Sahara            $term = substr($term, 2);
358173bfbcaSSatoshi Sahara        } else {
359173bfbcaSSatoshi Sahara            $term = $BL.$term;
360173bfbcaSSatoshi Sahara        }
361173bfbcaSSatoshi Sahara
362173bfbcaSSatoshi Sahara        if (substr($term, -2, 2) == '\\*') {
363173bfbcaSSatoshi Sahara            $term = substr($term, 0, -2);
364173bfbcaSSatoshi Sahara        } else {
365173bfbcaSSatoshi Sahara            $term = $term.$BR;
366173bfbcaSSatoshi Sahara        }
367173bfbcaSSatoshi Sahara
368173bfbcaSSatoshi Sahara        if ($term == $BL || $term == $BR || $term == $BL.$BR) {
369173bfbcaSSatoshi Sahara            $term = '';
370173bfbcaSSatoshi Sahara        }
371173bfbcaSSatoshi Sahara        return $term;
372173bfbcaSSatoshi Sahara    }
373173bfbcaSSatoshi Sahara
374173bfbcaSSatoshi Sahara    /**
375173bfbcaSSatoshi Sahara     * Combine found documents and sum up their scores
376173bfbcaSSatoshi Sahara     *
377173bfbcaSSatoshi Sahara     * This function is used to combine searched words with a logical
378173bfbcaSSatoshi Sahara     * AND. Only documents available in all arrays are returned.
379173bfbcaSSatoshi Sahara     *
380173bfbcaSSatoshi Sahara     * based upon PEAR's PHP_Compat function for array_intersect_key()
381173bfbcaSSatoshi Sahara     *
382173bfbcaSSatoshi Sahara     * @param array $args An array of page arrays
383173bfbcaSSatoshi Sahara     * @return array
384173bfbcaSSatoshi Sahara     */
385173bfbcaSSatoshi Sahara    protected static function resultCombine($args)
386173bfbcaSSatoshi Sahara    {
387173bfbcaSSatoshi Sahara        $array_count = count($args);
388173bfbcaSSatoshi Sahara        if ($array_count == 1) {
389173bfbcaSSatoshi Sahara            return $args[0];
390173bfbcaSSatoshi Sahara        }
391173bfbcaSSatoshi Sahara
392173bfbcaSSatoshi Sahara        $result = array();
393173bfbcaSSatoshi Sahara        if ($array_count > 1) {
394173bfbcaSSatoshi Sahara            foreach ($args[0] as $key => $value) {
395173bfbcaSSatoshi Sahara                $result[$key] = $value;
396173bfbcaSSatoshi Sahara                for ($i = 1; $i !== $array_count; $i++) {
397173bfbcaSSatoshi Sahara                    if (!isset($args[$i][$key])) {
398173bfbcaSSatoshi Sahara                        unset($result[$key]);
399173bfbcaSSatoshi Sahara                        break;
400173bfbcaSSatoshi Sahara                    }
401173bfbcaSSatoshi Sahara                    $result[$key] += $args[$i][$key];
402173bfbcaSSatoshi Sahara                }
403173bfbcaSSatoshi Sahara            }
404173bfbcaSSatoshi Sahara        }
405173bfbcaSSatoshi Sahara        return $result;
406173bfbcaSSatoshi Sahara    }
407173bfbcaSSatoshi Sahara
408173bfbcaSSatoshi Sahara    /**
409173bfbcaSSatoshi Sahara     * Unites found documents and sum up their scores
410173bfbcaSSatoshi Sahara     * based upon resultCombine() method
411173bfbcaSSatoshi Sahara     *
412173bfbcaSSatoshi Sahara     * @param array $args An array of page arrays
413173bfbcaSSatoshi Sahara     * @return array
414173bfbcaSSatoshi Sahara     *
415173bfbcaSSatoshi Sahara     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
416173bfbcaSSatoshi Sahara     */
417173bfbcaSSatoshi Sahara    protected static function resultUnite($args)
418173bfbcaSSatoshi Sahara    {
419173bfbcaSSatoshi Sahara        $array_count = count($args);
420173bfbcaSSatoshi Sahara        if ($array_count === 1) {
421173bfbcaSSatoshi Sahara            return $args[0];
422173bfbcaSSatoshi Sahara        }
423173bfbcaSSatoshi Sahara
424173bfbcaSSatoshi Sahara        $result = $args[0];
425173bfbcaSSatoshi Sahara        for ($i = 1; $i !== $array_count; $i++) {
426173bfbcaSSatoshi Sahara            foreach (array_keys($args[$i]) as $id) {
427173bfbcaSSatoshi Sahara                $result[$id] += $args[$i][$id];
428173bfbcaSSatoshi Sahara            }
429173bfbcaSSatoshi Sahara        }
430173bfbcaSSatoshi Sahara        return $result;
431173bfbcaSSatoshi Sahara    }
432173bfbcaSSatoshi Sahara
433173bfbcaSSatoshi Sahara    /**
434173bfbcaSSatoshi Sahara     * Computes the difference of documents using page id for comparison
435173bfbcaSSatoshi Sahara     * nearly identical to PHP5's array_diff_key()
436173bfbcaSSatoshi Sahara     *
437173bfbcaSSatoshi Sahara     * @param array $args An array of page arrays
438173bfbcaSSatoshi Sahara     * @return array
439173bfbcaSSatoshi Sahara     *
440173bfbcaSSatoshi Sahara     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
441173bfbcaSSatoshi Sahara     */
442173bfbcaSSatoshi Sahara    protected static function resultComplement($args)
443173bfbcaSSatoshi Sahara    {
444173bfbcaSSatoshi Sahara        $array_count = count($args);
445173bfbcaSSatoshi Sahara        if ($array_count === 1) {
446173bfbcaSSatoshi Sahara            return $args[0];
447173bfbcaSSatoshi Sahara        }
448173bfbcaSSatoshi Sahara
449173bfbcaSSatoshi Sahara        $result = $args[0];
450173bfbcaSSatoshi Sahara        foreach (array_keys($result) as $id) {
451173bfbcaSSatoshi Sahara            for ($i = 1; $i !== $array_count; $i++) {
452173bfbcaSSatoshi Sahara                if (isset($args[$i][$id])) unset($result[$id]);
453173bfbcaSSatoshi Sahara            }
454173bfbcaSSatoshi Sahara        }
455173bfbcaSSatoshi Sahara        return $result;
456173bfbcaSSatoshi Sahara    }
457173bfbcaSSatoshi Sahara}
458