xref: /dokuwiki/inc/Search/FulltextSearch.php (revision 46b83514ca215ee33366a5c9f42f7da7812ef9ed)
1173bfbcaSSatoshi Sahara<?php
2*46b83514SSatoshi Sahara
3173bfbcaSSatoshi Saharanamespace dokuwiki\Search;
4173bfbcaSSatoshi Sahara
5173bfbcaSSatoshi Saharause dokuwiki\Extension\Event;
686fc7283SSatoshi Saharause dokuwiki\Search\PageIndex;
73837ea91SSatoshi Saharause dokuwiki\Search\QueryParser;
8c31af4f3SSatoshi Saharause dokuwiki\Utf8;
9173bfbcaSSatoshi Sahara
// create snippets for the first few results only
// NOTE(review): this is declared with `const` at file scope below the namespace
// statement, so its fully qualified name is dokuwiki\Search\FT_SNIPPET_NUMBER.
// Nothing in this file reads it - confirm that consumers reference the
// namespaced constant rather than a global FT_SNIPPET_NUMBER.
const FT_SNIPPET_NUMBER = 15;
120cba610bSSatoshi Sahara
/**
 * Class DokuWiki Fulltext Search
 *
 * Collection of static helpers: pageSearch() evaluates a search query against
 * the page index and snippet() builds a highlighted text extract for a page.
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */
class FulltextSearch
{
    /**
     * Fulltext Search constructor. Prevents direct object creation;
     * all functionality of this class is exposed through static methods.
     */
    protected function __construct() {}

    /**
     * The fulltext search
     *
     * Returns a list of matching documents for the given query.
     *
     * The actual work is done in pageSearchCallBack(), which is registered as
     * the default action of the SEARCH_QUERY_FULLPAGE event so that handlers
     * can inspect or change the event data (query, sort, after, before,
     * highlight) first.
     *
     * @triggers SEARCH_QUERY_FULLPAGE
     *
     * @param string     $query
     * @param array      $highlight  passed by reference; receives the terms to highlight
     * @param string     $sort       'mtime' sorts by modification time, everything else
     *                               (default 'hits') sorts by hit count
     * @param int|string $after  only show results with mtime after this date,
     *                           accepts timestamp or strtotime arguments
     * @param int|string $before only show results with mtime before this date,
     *                           accepts timestamp or strtotime arguments
     *
     * @return array
     */
    public static function pageSearch($query, &$highlight, $sort = null, $after = null, $before = null)
    {
        $data = [
            'query' => $query,
            'sort' => ($sort === null) ? 'hits' : $sort,
            'after' => $after,
            'before' => $before
        ];
        // bound by reference so that the callback's result reaches the caller
        $data['highlight'] =& $highlight;

        return Event::createAndTrigger(
            'SEARCH_QUERY_FULLPAGE',
            $data,
            static::class.'::pageSearchCallBack'
        );
    }

    /**
     * Returns a list of matching documents for the given query
     *
     * The parsed query is evaluated with a stack: word, phrase and namespace
     * tokens push a "page id => hits" array, the AND / OR / NOT tokens pop
     * their operands and push the combined result. Afterwards hidden,
     * unreadable and nonexistent pages are removed, the time filter is applied
     * and the remaining pages are sorted.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
     *
     * @triggers FULLTEXT_PHRASE_MATCH
     *
     * @param array $data  event data; reads 'query', 'sort', 'after' and 'before',
     *                     writes 'highlight'
     * @return array       matching documents in the form pageid => hits
     */
    public static function pageSearchCallBack(&$data)
    {
        $Indexer = PageIndex::getInstance();

        // parse the given query
        $q = QueryParser::convert($data['query']);
        $data['highlight'] = $q['highlight'];

        if (empty($q['parsed_ary'])) {
            return array();
        }

        // lookup all words found in the query
        $lookup = $Indexer->lookup($q['words']);

        // get all pages in this dokuwiki site (!: includes nonexistent pages)
        $pages_all = array();
        foreach ($Indexer->getPages() as $id) {
            $pages_all[$id] = 0; // base: 0 hit
        }

        // process the query
        $stack = array();
        foreach ($q['parsed_ary'] as $token) {
            switch (substr($token, 0, 3)) {
                case 'W+:':
                case 'W-:':
                case 'W_:': // word
                    $word = substr($token, 3);
                    // a word without an entry in the lookup result matches no page
                    $stack[] = isset($lookup[$word]) ? (array) $lookup[$word] : array();
                    break;
                case 'P+:':
                case 'P-:': // phrase
                    // since phrases are always parsed as ((W1)(W2)...(P)),
                    // the end($stack) always points the pages that contain
                    // all words in this phrase
                    $candidates = end($stack);
                    if (!is_array($candidates)) {
                        // end() yields false on an empty stack
                        $candidates = array();
                    }
                    $stack[] = self::matchPhrase($candidates, substr($token, 3));
                    break;
                case 'N+:':
                case 'N-:': // namespace
                    $ns = cleanID(substr($token, 3)) . ':';
                    $pages_matched = array();
                    foreach (array_keys($pages_all) as $id) {
                        if (strpos($id, $ns) === 0) {
                            $pages_matched[$id] = 0; // namespace: always 0 hit
                        }
                    }
                    $stack[] = $pages_matched;
                    break;
                case 'AND': // and operation
                    list($pages1, $pages2) = array_splice($stack, -2);
                    $stack[] = static::resultCombine(array($pages1, $pages2));
                    break;
                case 'OR':  // or operation
                    list($pages1, $pages2) = array_splice($stack, -2);
                    $stack[] = static::resultUnite(array($pages1, $pages2));
                    break;
                case 'NOT': // not operation (unary)
                    $pages   = array_pop($stack);
                    $stack[] = static::resultComplement(array($pages_all, $pages));
                    break;
            }
        }
        $docs = array_pop($stack);

        if (empty($docs)) {
            return array();
        }

        // check: settings, acls, existence
        foreach (array_keys($docs) as $id) {
            if (isHiddenPage($id)
                || auth_quickaclcheck($id) < AUTH_READ
                || !page_exists($id, '', false)
            ) {
                unset($docs[$id]);
            }
        }

        $docs = static::filterResultsByTime($docs, $data['after'], $data['before']);

        if ($data['sort'] === 'mtime') {
            uksort($docs, static::class.'::pagemtimesorter');
        } else {
            // sort docs by count
            arsort($docs);
        }

        return $docs;
    }

    /**
     * Keeps only those candidate pages whose raw text contains the phrase
     *
     * For every candidate a FULLTEXT_PHRASE_MATCH event is created. Unless a
     * handler prevents the default or already set the result to true, the
     * lower-cased page text is searched for the phrase.
     *
     * @triggers FULLTEXT_PHRASE_MATCH
     *
     * @param array  $pages   candidate pages in the form pageid => value
     * @param string $phrase  the phrase to look for; compared against lower-cased text
     * @return array          matching pages in the form pageid => 0
     */
    private static function matchPhrase(array $pages, $phrase)
    {
        $matched = array();
        foreach (array_keys($pages) as $id) {
            $evdata = array(
                'id' => $id,
                'phrase' => $phrase,
                'text' => rawWiki($id)
            );
            $evt = new Event('FULLTEXT_PHRASE_MATCH', $evdata);
            if ($evt->advise_before() && $evt->result !== true) {
                $text = Utf8\PhpString::strtolower($evdata['text']);
                if (strpos($text, $phrase) !== false) {
                    $evt->result = true;
                }
            }
            $evt->advise_after();
            if ($evt->result === true) {
                $matched[$id] = 0; // phrase: always 0 hit
            }
        }
        return $matched;
    }

    /**
     * Removes results whose page file was modified outside the given time span
     *
     * A bound that is empty or cannot be parsed by strtotime() is ignored.
     *
     * @param array      $results search results in the form pageid => value
     * @param int|string $after   only returns results with mtime after this date,
     *                            accepts timestamp or strtotime arguments
     * @param int|string $before  only returns results with mtime before this date,
     *                            accepts timestamp or strtotime arguments
     *
     * @return array
     */
    protected static function filterResultsByTime(array $results, $after, $before)
    {
        if (!$after && !$before) {
            return $results;
        }

        $after = self::toTimestamp($after);
        $before = self::toTimestamp($before);

        foreach (array_keys($results) as $id) {
            $mTime = filemtime(wikiFN($id));
            $tooOld = $after && $after > $mTime;
            $tooNew = $before && $before < $mTime;
            if ($tooOld || $tooNew) {
                unset($results[$id]);
            }
        }
        return $results;
    }

    /**
     * Turns a time bound into a Unix timestamp
     *
     * Integers are used as they are, empty values are never handed to
     * strtotime() (passing null there is deprecated as of PHP 8.1).
     *
     * @param int|string|null $time
     * @return int|false  false when no usable bound was given
     */
    private static function toTimestamp($time)
    {
        if (is_int($time)) {
            return $time;
        }
        if ($time === null || $time === false || $time === '') {
            return false;
        }
        return strtotime($time);
    }

    /**
     * Sort pages by their mtime, from newest to oldest
     *
     * @param string $a
     * @param string $b
     *
     * @return int Returns < 0 if $a is newer than $b, > 0 if $b is newer than $a
     *             and 0 if they are of the same age
     */
    protected static function pagemtimesorter($a, $b)
    {
        return filemtime(wikiFN($b)) - filemtime(wikiFN($a));
    }

    /**
     * Creates a snippet extract
     *
     * Collects up to four context windows around the highlighted terms, joins
     * them with '... ', escapes the result with hsc() and wraps every hit in
     * <strong class="search_hit">. Handlers of FULLTEXT_SNIPPET_CREATE may
     * change the text and the highlight terms, or prevent the default and
     * provide their own snippet.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @triggers FULLTEXT_SNIPPET_CREATE
     *
     * @param string $id page id
     * @param array $highlight
     * @return mixed
     */
    public static function snippet($id, $highlight)
    {
        $text = rawWiki($id);
        $text = str_replace("\xC2\xAD", '', $text); // remove soft-hyphens
        $evdata = array(
            'id'        => $id,
            'text'      => &$text,
            'highlight' => &$highlight,
            'snippet'   => '',
        );

        $evt = new Event('FULLTEXT_SNIPPET_CREATE', $evdata);
        if ($evt->advise_before()) {
            $match = array();
            $snippets = array();
            $utf8_offset = $offset = $end = 0;
            $len = Utf8\PhpString::strlen($text);

            // build a regexp from the phrases to highlight
            $terms = array_filter((array) $highlight);
            $terms = array_map('preg_quote_cb', $terms);
            $terms = array_map(static::class.'::snippetRePreprocess', $terms);
            $re1 = '(' . join('|', $terms) . ')';
            // two resp. three hits close together; the negative lookaheads
            // reject a hit that starts with the text of an earlier one
            $re2 = "$re1.{0,75}(?!\\1)$re1";
            $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1";

            // collect at most four context windows
            for ($round = 0; $round < 4; $round++) {
                // prefer the pattern with the most hits, fall back to fewer
                $found = preg_match('/'.$re3.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)
                    || preg_match('/'.$re2.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)
                    || preg_match('/'.$re1.'/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset);
                if (!$found) {
                    break;
                }

                list($str, $idx) = $match[0];

                // convert $idx (a byte offset) into a utf8 character offset
                $utf8_idx = Utf8\PhpString::strlen(substr($text, 0, $idx));
                $utf8_len = Utf8\PhpString::strlen($str);

                // establish context, 100 characters surrounding the match string
                // first look to see if we can go 100 either side,
                // then drop to 50 adding any excess if the other side can't go to 50,
                $pre = min($utf8_idx - $utf8_offset, 100);
                $post = min($len - $utf8_idx - $utf8_len, 100);

                if ($pre > 50 && $post > 50) {
                    $pre = $post = 50;
                } elseif ($pre > 50) {
                    $pre = min($pre, 100 - $post);
                } elseif ($post > 50) {
                    $post = min($post, 100 - $pre);
                } elseif ($offset == 0) {
                    // both are less than 50, means the context is the whole string
                    // make it so and break out of this loop - there is no need for the
                    // complex snippet calculations
                    $snippets = array($text);
                    break;
                }

                // establish context start and end points, try to append to previous
                // context if possible
                $start = $utf8_idx - $pre;
                $append = ($start < $end) ? $end : false;  // still the end of the previous context snippet
                $end = $utf8_idx + $utf8_len + $post;      // now set it to the end of this context

                if ($append) {
                    $snippets[count($snippets)-1] .= Utf8\PhpString::substr($text, $append, $end-$append);
                } else {
                    $snippets[] = Utf8\PhpString::substr($text, $start, $end-$start);
                }

                // set $offset for next match attempt
                // continue matching after the current match
                // if the current match is not the longest possible match starting at the current offset
                // this prevents further matching of this snippet but for possible matches of length
                // smaller than match length + context (at least 50 characters) this match is part of the context
                $utf8_offset = $utf8_idx + $utf8_len;
                $offset = $idx + strlen(Utf8\PhpString::substr($text, $utf8_idx, $utf8_len));
                $offset = Utf8\Clean::correctIdx($text, $offset);
            }

            // mark the hits with a control character first, so that the markers
            // survive the HTML escaping and can be turned into tags afterwards
            $m = "\1";
            $snippets = preg_replace('/'.$re1.'/iu', $m.'$1'.$m, $snippets);
            $snippet = preg_replace(
                '/' . $m . '([^' . $m . ']*?)' . $m . '/iu',
                '<strong class="search_hit">$1</strong>',
                hsc(join('... ', $snippets))
            );

            $evdata['snippet'] = $snippet;
        }
        $evt->advise_after();
        unset($evt);

        return $evdata['snippet'];
    }

    /**
     * Wraps a search term in regex boundary checks.
     *
     * A boundary is added on each side of the term unless that side carries an
     * escaped wildcard ("\*"), which is stripped instead. Terms for which
     * Utf8\Asian::isAsianWords() is true are returned untouched. A term that
     * consists of nothing but boundaries becomes the empty string.
     *
     * @param string $term  regex-quoted search term
     * @return string
     */
    public static function snippetRePreprocess($term)
    {
        // do not process asian terms where word boundaries are not explicit
        if (Utf8\Asian::isAsianWords($term)) {
            return $term;
        }

        if (UTF8_PROPERTYSUPPORT) {
            // unicode word boundaries
            // see http://stackoverflow.com/a/2449017/172068
            $BL = '(?<!\pL)';
            $BR = '(?!\pL)';
        } else {
            // not as correct as above, but at least won't break
            $BL = '\b';
            $BR = '\b';
        }

        // leading wildcard: drop it, otherwise anchor the left side
        $term = (substr($term, 0, 2) == '\\*') ? substr($term, 2) : $BL.$term;

        // trailing wildcard: drop it, otherwise anchor the right side
        $term = (substr($term, -2, 2) == '\\*') ? substr($term, 0, -2) : $term.$BR;

        if ($term == $BL || $term == $BR || $term == $BL.$BR) {
            return '';
        }
        return $term;
    }

    /**
     * Combine found documents and sum up their scores
     *
     * This function is used to combine searched words with a logical
     * AND. Only documents available in all arrays are returned.
     *
     * based upon PEAR's PHP_Compat function for array_intersect_key()
     *
     * @param array $args An array of page arrays
     * @return array
     */
    protected static function resultCombine($args)
    {
        $total = count($args);
        if ($total === 0) {
            return array();
        }
        if ($total === 1) {
            return $args[0];
        }

        $result = array();
        foreach ($args[0] as $id => $score) {
            for ($i = 1; $i < $total; $i++) {
                if (!isset($args[$i][$id])) {
                    // missing in one operand: not part of the intersection
                    continue 2;
                }
                $score += $args[$i][$id];
            }
            $result[$id] = $score;
        }
        return $result;
    }

    /**
     * Unites found documents and sum up their scores
     * based upon resultCombine() method
     *
     * Used for the logical OR: documents found in any of the arrays are
     * returned, scores of documents found in several arrays are added up.
     *
     * @param array $args An array of page arrays
     * @return array
     *
     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
     */
    protected static function resultUnite($args)
    {
        $total = count($args);
        if ($total === 0) {
            return array();
        }

        $result = $args[0];
        for ($i = 1; $i < $total; $i++) {
            foreach ($args[$i] as $id => $score) {
                // ids that only occur in a later array start from their own score
                $result[$id] = isset($result[$id]) ? $result[$id] + $score : $score;
            }
        }
        return $result;
    }

    /**
     * Computes the difference of documents using page id for comparison
     * nearly identical to PHP5's array_diff_key()
     *
     * Used for the logical NOT: returns the documents of the first array that
     * are not set in any of the following arrays.
     *
     * @param array $args An array of page arrays
     * @return array
     *
     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
     */
    protected static function resultComplement($args)
    {
        $total = count($args);
        if ($total === 0) {
            return array();
        }

        $result = $args[0];
        foreach (array_keys($result) as $id) {
            for ($i = 1; $i < $total; $i++) {
                if (isset($args[$i][$id])) {
                    unset($result[$id]);
                    break;
                }
            }
        }
        return $result;
    }
}
461173bfbcaSSatoshi Sahara}
462