xref: /dokuwiki/inc/Search/FulltextSearch.php (revision 9329b002986cc3f43c18c207dd9d0fdfd0f8a5e8)
1173bfbcaSSatoshi Sahara<?php
246b83514SSatoshi Sahara
3173bfbcaSSatoshi Saharanamespace dokuwiki\Search;
4173bfbcaSSatoshi Sahara
5173bfbcaSSatoshi Saharause dokuwiki\Extension\Event;
6743c9a28SSatoshi Saharause dokuwiki\Search\FulltextIndex;
73837ea91SSatoshi Saharause dokuwiki\Search\QueryParser;
8c31af4f3SSatoshi Saharause dokuwiki\Utf8;
9173bfbcaSSatoshi Sahara
// Snippets are created for the first few results only; this is that limit.
// NOTE(review): the constant is not referenced in this file - presumably it is
// consumed by the code rendering the result list; confirm against callers.
const FT_SNIPPET_NUMBER = 15;

/**
 * Class DokuWiki Fulltext Search
 *
 * Evaluates parsed fulltext queries against the fulltext index, filters and
 * sorts the matching pages and builds highlighted text snippets for them.
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */
class FulltextSearch
{
    /** @var FulltextSearch|null $instance the shared instance, created on first use */
    protected static $instance = null;

    /**
     * Get new or existing singleton instance of the FulltextSearch
     *
     * Uses late static binding, so the instance is created from the class
     * the method was called on.
     *
     * @return FulltextSearch
     */
    public static function getInstance()
    {
        if (is_null(static::$instance)) {
            static::$instance = new static();
        }
        return static::$instance;
    }

    /**
     * Fulltext Search constructor. prevent direct object creation
     */
    protected function __construct() {}

    /**
     * The fulltext search
     *
     * Returns a list of matching documents for the given query. The work is
     * done by pageSearchCallBack(), which is registered as the default action
     * of the SEARCH_QUERY_FULLPAGE event.
     *
     * @triggers SEARCH_QUERY_FULLPAGE
     *
     * @param string     $query
     * @param array      $highlight passed by reference and stored by reference in the
     *                              event data; pageSearchCallBack() overwrites it with
     *                              the highlight terms of the parsed query
     * @param string     $sort      'mtime' sorts by modification time, anything else by
     *                              hit count; defaults to 'hits'
     * @param int|string $after     only show results with mtime after this date,
     *                              accepts timestamp or strtotime arguments
     * @param int|string $before    only show results with mtime before this date,
     *                              accepts timestamp or strtotime arguments
     *
     * @return array
     */
    public function pageSearch($query, &$highlight, $sort = null, $after = null, $before = null)
    {
        if ($sort === null) {
            $sort = 'hits';
        }
        $data = [
            'query' => $query,
            'sort' => $sort,
            'after' => $after,
            'before' => $before,
        ];
        // bind by reference so that changes made while the event is handled reach the caller
        $data['highlight'] =& $highlight;
        $action = [$this, 'pageSearchCallBack'];
        return Event::createAndTrigger('SEARCH_QUERY_FULLPAGE', $data, $action);
    }

    /**
     * Returns a list of matching documents for the given query
     *
     * The parsed query is a token list that is evaluated with a stack: word,
     * phrase and namespace tokens push a page set (page id => hit count),
     * AND/OR/NOT tokens pop their operands and push the combined set.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
     *
     * @triggers FULLTEXT_PHRASE_MATCH
     *
     * @param array $data  event data, uses the keys query, sort, after and before;
     *                     the key highlight is overwritten
     * @return array       matching documents in the form page id => hit count
     */
    public function pageSearchCallBack(&$data)
    {
        // parse the given query
        $q = (new QueryParser)->convert($data['query']);
        $data['highlight'] = $q['highlight'];

        if (empty($q['parsed_ary'])) return [];

        // lookup all words found in the query
        $FulltextIndex = FulltextIndex::getInstance();
        $lookup = $FulltextIndex->lookupWords($q['words']);

        // get all pages in this dokuwiki site (!: includes nonexistent pages)
        $pages_all = [];
        foreach ($FulltextIndex->getPages() as $id) {
            $pages_all[$id] = 0; // base: 0 hit
        }

        // process the query
        $stack = [];
        foreach ($q['parsed_ary'] as $token) {
            switch (substr($token, 0, 3)) {
                case 'W+:':
                case 'W-:':
                case 'W_:': // word
                    $word    = substr($token, 3);
                    // a word without an index entry simply matches no page
                    $stack[] = isset($lookup[$word]) ? (array) $lookup[$word] : [];
                    break;
                case 'P+:':
                case 'P-:': // phrase
                    $phrase = substr($token, 3);
                    // since phrases are always parsed as ((W1)(W2)...(P)),
                    // the end($stack) always points the pages that contain
                    // all words in this phrase
                    $pages  = end($stack);
                    if (!is_array($pages)) {
                        // empty stack: end() returned false, there is nothing to check
                        $pages = [];
                    }
                    $pages_matched = [];
                    foreach (array_keys($pages) as $id) {
                        $evdata = [
                            'id' => $id,
                            'phrase' => $phrase,
                            'text' => rawWiki($id),
                        ];
                        $event = new Event('FULLTEXT_PHRASE_MATCH', $evdata);
                        // default action: plain substring search in the lowercased page text,
                        // skipped when a handler already reported a match
                        if ($event->advise_before() && $event->result !== true) {
                            $text = Utf8\PhpString::strtolower($evdata['text']);
                            if (strpos($text, $phrase) !== false) {
                                $event->result = true;
                            }
                        }
                        $event->advise_after();
                        if ($event->result === true) {
                            $pages_matched[$id] = 0; // phrase: always 0 hit
                        }
                    }
                    $stack[] = $pages_matched;
                    break;
                case 'N+:':
                case 'N-:': // namespace
                    $ns = cleanID(substr($token, 3)) . ':';
                    $pages_matched = [];
                    foreach (array_keys($pages_all) as $id) {
                        if (strpos($id, $ns) === 0) {
                            $pages_matched[$id] = 0; // namespace: always 0 hit
                        }
                    }
                    $stack[] = $pages_matched;
                    break;
                case 'AND': // and operation
                    list($pages1, $pages2) = array_splice($stack, -2);
                    $stack[] = $this->resultCombine([$pages1, $pages2]);
                    break;
                case 'OR':  // or operation
                    list($pages1, $pages2) = array_splice($stack, -2);
                    $stack[] = $this->resultUnite([$pages1, $pages2]);
                    break;
                case 'NOT': // not operation (unary)
                    $pages   = array_pop($stack);
                    $stack[] = $this->resultComplement([$pages_all, $pages]);
                    break;
            }
        }
        $docs = array_pop($stack);

        if (empty($docs)) return [];

        // check: settings, acls, existence
        foreach (array_keys($docs) as $id) {
            if (isHiddenPage($id)
                || auth_quickaclcheck($id) < AUTH_READ
                || !page_exists($id, '', false)
            ) {
                unset($docs[$id]);
            }
        }

        $docs = $this->filterResultsByTime($docs, $data['after'], $data['before']);

        if ($data['sort'] === 'mtime') {
            uksort($docs, [$this, 'pagemtimesorter']);
        } else {
            // sort docs by count
            arsort($docs);
        }

        return $docs;
    }

    /**
     * Removes results whose page file was modified outside the given time span
     *
     * A limit that is empty, or that strtotime() cannot parse, is not applied.
     *
     * @param array      $results search results in the form pageid => value
     * @param int|string $after   only returns results with mtime after this date,
     *                            accepts timestamp or strtotime arguments
     * @param int|string $before  only returns results with mtime before this date,
     *                            accepts timestamp or strtotime arguments
     *
     * @return array
     */
    protected function filterResultsByTime(array $results, $after, $before)
    {
        if (!$after && !$before) {
            return $results;
        }

        // convert only the limits that were actually given:
        // strtotime(null) is deprecated as of PHP 8.1
        if ($after && !is_int($after)) {
            $after = strtotime($after);
        }
        if ($before && !is_int($before)) {
            $before = strtotime($before);
        }

        foreach ($results as $id => $value) {
            $mTime = filemtime(wikiFN($id));
            if ($after && $after > $mTime) {
                unset($results[$id]);
                continue;
            }
            if ($before && $before < $mTime) {
                unset($results[$id]);
            }
        }
        return $results;
    }

    /**
     * Sort pages by their mtime, from newest to oldest
     *
     * Comparison callback for uksort(), the arguments are page ids.
     *
     * @param string $a
     * @param string $b
     *
     * @return int Returns < 0 if $a is newer than $b, > 0 if $b is newer than $a
     *             and 0 if they are of the same age
     */
    protected function pagemtimesorter($a, $b)
    {
        $mtimeA = filemtime(wikiFN($a));
        $mtimeB = filemtime(wikiFN($b));
        return $mtimeB - $mtimeA;
    }

    /**
     * Creates a snippet extract
     *
     * Looks for up to four places in the page text where highlight terms
     * occur (preferring places with three, then two different terms close to
     * each other), cuts out the surrounding context and wraps the terms in
     * <strong class="search_hit"> tags.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @triggers FULLTEXT_SNIPPET_CREATE
     *
     * @param string $id page id
     * @param array $highlight terms to highlight
     * @return mixed the value of the event data key 'snippet' after the event was
     *               processed: the HTML snippet built here, or '' when the default
     *               action was prevented and no handler has set a snippet
     */
    public function snippet($id, $highlight)
    {
        $text = rawWiki($id);
        $text = str_replace("\xC2\xAD", '', $text); // remove soft-hyphens
        $evdata = [
            'id'        => $id,
            'text'      => &$text,
            'highlight' => &$highlight,
            'snippet'   => '',
        ];

        $evt = new Event('FULLTEXT_SNIPPET_CREATE', $evdata);
        if ($evt->advise_before()) {
            $match = [];
            $snippets = [];
            // $offset is a byte offset (as used by preg_match),
            // $utf8_offset and $end are character offsets
            $utf8_offset = $offset = $end = 0;
            $len = Utf8\PhpString::strlen($text);

            // build a regexp from the phrases to highlight
            $terms = array_map(
                [$this, 'snippetRePreprocess'],
                array_map(
                    'preg_quote_cb',
                    array_filter((array) $highlight)
                )
            );
            $re1 = '(' . join('|', $terms) . ')';
            // two resp. three different terms within a short distance of each other
            $re2 = "$re1.{0,75}(?!\\1)$re1";
            $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1";

            for ($cnt = 4; $cnt--;) {
                // try the most specific pattern first; $match holds the result
                // of the first pattern that matched
                if (!preg_match('/' . $re3 . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)
                    && !preg_match('/' . $re2 . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)
                    && !preg_match('/' . $re1 . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)
                ) {
                    break;
                }

                list($str, $idx) = $match[0];

                // convert $idx (a byte offset) into a utf8 character offset
                $utf8_idx = Utf8\PhpString::strlen(substr($text, 0, $idx));
                $utf8_len = Utf8\PhpString::strlen($str);

                // establish context, 100 characters surrounding the match string
                // first look to see if we can go 100 either side,
                // then drop to 50 adding any excess if the other side can't go to 50,
                $pre = min($utf8_idx - $utf8_offset, 100);
                $post = min($len - $utf8_idx - $utf8_len, 100);

                if ($pre > 50 && $post > 50) {
                    $pre = $post = 50;
                } elseif ($pre > 50) {
                    $pre = min($pre, 100 - $post);
                } elseif ($post > 50) {
                    $post = min($post, 100 - $pre);
                } elseif ($offset == 0) {
                    // both are less than 50, means the context is the whole string
                    // make it so and break out of this loop - there is no need for the
                    // complex snippet calculations
                    $snippets = [$text];
                    break;
                }

                // establish context start and end points, try to append to previous
                // context if possible
                $start = $utf8_idx - $pre;
                $append = ($start < $end) ? $end : false;  // still the end of the previous context snippet
                $end = $utf8_idx + $utf8_len + $post;      // now set it to the end of this context

                if ($append) {
                    $snippets[count($snippets) - 1] .= Utf8\PhpString::substr($text, $append, $end - $append);
                } else {
                    $snippets[] = Utf8\PhpString::substr($text, $start, $end - $start);
                }

                // set $offset for next match attempt
                // continue matching after the current match
                // if the current match is not the longest possible match starting at the current offset
                // this prevents further matching of this snippet but for possible matches of length
                // smaller than match length + context (at least 50 characters) this match is part of the context
                $utf8_offset = $utf8_idx + $utf8_len;
                $offset = $idx + strlen(Utf8\PhpString::substr($text, $utf8_idx, $utf8_len));
                $offset = Utf8\Clean::correctIdx($text, $offset);
            }

            // mark the terms with the control character \x01 first, so that the
            // markers survive the HTML escaping, then turn the markers into tags
            $m = "\1";
            $snippets = preg_replace('/' . $re1 . '/iu', $m . '$1' . $m, $snippets);
            $snippet = preg_replace(
                '/' . $m . '([^' . $m . ']*?)' . $m . '/iu',
                '<strong class="search_hit">$1</strong>',
                hsc(join('... ', $snippets))
            );

            $evdata['snippet'] = $snippet;
        }
        $evt->advise_after();
        unset($evt);

        return $evdata['snippet'];
    }

    /**
     * Wraps a search term in regex boundary checks.
     *
     * The term is expected to be regex quoted already, so a wildcard at the
     * start or end of the term arrives here as the two characters \*. The
     * wildcard is removed and no boundary check is added on that side.
     *
     * @param string $term
     * @return string the regex fragment, empty if nothing but boundary checks is left
     */
    public function snippetRePreprocess($term)
    {
        // do not process asian terms where word boundaries are not explicit
        if (Utf8\Asian::isAsianWords($term)) return $term;

        if (UTF8_PROPERTYSUPPORT) {
            // unicode word boundaries
            // see http://stackoverflow.com/a/2449017/172068
            $BL = '(?<!\pL)';
            $BR = '(?!\pL)';
        } else {
            // not as correct as above, but at least won't break
            $BL = '\b';
            $BR = '\b';
        }

        if (substr($term, 0, 2) == '\\*') {
            $term = substr($term, 2);
        } else {
            $term = $BL . $term;
        }

        if (substr($term, -2, 2) == '\\*') {
            $term = substr($term, 0, -2);
        } else {
            $term = $term . $BR;
        }

        // nothing but boundary checks left: the term itself was empty
        if ($term === $BL || $term === $BR || $term === $BL . $BR) {
            $term = '';
        }
        return $term;
    }

    /**
     * Combine found documents and sum up their scores
     *
     * This function is used to combine searched words with a logical
     * AND. Only documents available in all arrays are returned.
     *
     * based upon PEAR's PHP_Compat function for array_intersect_key()
     *
     * @param array $args An array of page arrays (page id => score)
     * @return array
     */
    protected function resultCombine($args)
    {
        $array_count = count($args);
        if ($array_count === 0) {
            return [];
        }
        if ($array_count === 1) {
            return $args[0];
        }

        $result = [];
        foreach ($args[0] as $key => $value) {
            $result[$key] = $value;
            for ($i = 1; $i < $array_count; $i++) {
                if (!isset($args[$i][$key])) {
                    // missing in one of the arrays: not part of the intersection
                    unset($result[$key]);
                    break;
                }
                $result[$key] += $args[$i][$key];
            }
        }
        return $result;
    }

    /**
     * Unites found documents and sum up their scores
     * based upon resultCombine() method
     *
     * This is the logical OR: documents available in any of the arrays are
     * returned.
     *
     * @param array $args An array of page arrays (page id => score)
     * @return array
     *
     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
     */
    protected function resultUnite($args)
    {
        $array_count = count($args);
        if ($array_count === 0) {
            return [];
        }
        if ($array_count === 1) {
            return $args[0];
        }

        $result = $args[0];
        for ($i = 1; $i < $array_count; $i++) {
            foreach ($args[$i] as $id => $score) {
                // a page that was not seen before starts with a score of 0
                $result[$id] = (isset($result[$id]) ? $result[$id] : 0) + $score;
            }
        }
        return $result;
    }

    /**
     * Computes the difference of documents using page id for comparison
     * nearly identical to PHP5's array_diff_key()
     *
     * Returns the documents of the first array that are in none of the
     * other arrays.
     *
     * @param array $args An array of page arrays (page id => score)
     * @return array
     *
     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
     */
    protected function resultComplement($args)
    {
        $array_count = count($args);
        if ($array_count === 0) {
            return [];
        }
        if ($array_count === 1) {
            return $args[0];
        }

        $result = $args[0];
        foreach (array_keys($result) as $id) {
            for ($i = 1; $i < $array_count; $i++) {
                if (isset($args[$i][$id])) {
                    unset($result[$id]);
                    break;
                }
            }
        }
        return $result;
    }
}
477