xref: /dokuwiki/inc/Search/FulltextSearch.php (revision 0cba610bea94e5841d211c0d3f57ae96e8ad1379)
1173bfbcaSSatoshi Sahara<?php
2173bfbcaSSatoshi Saharanamespace dokuwiki\Search;
3173bfbcaSSatoshi Sahara
4173bfbcaSSatoshi Saharause dokuwiki\Extension\Event;
586fc7283SSatoshi Saharause dokuwiki\Search\PageIndex;
63837ea91SSatoshi Saharause dokuwiki\Search\QueryParser;
7c31af4f3SSatoshi Saharause dokuwiki\Utf8;
8173bfbcaSSatoshi Sahara
/**
 * Number of leading search results for which a snippet is created.
 *
 * NOTE(review): the constant is not referenced anywhere in this file, so the
 * code that honours the limit presumably lives in the result renderer -
 * confirm against the callers.
 */
const FT_SNIPPET_NUMBER = 15;
13*0cba610bSSatoshi Sahara
/**
 * Class DokuWiki Fulltext Search
 *
 * Collection of static helpers that evaluate a parsed fulltext query against
 * the page index and that build highlighted text snippets for result pages.
 * The class is never instantiated; all entry points are static.
 *
 * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */
class FulltextSearch
{
    /**
     * Fulltext Search constructor. Protected to prevent direct object creation.
     */
    protected function __construct() {}

    /**
     * The fulltext search
     *
     * Returns a list of matching documents for the given query.
     *
     * The work is done by callback_pageSearch(), which is registered as the
     * default action of the SEARCH_QUERY_FULLPAGE event triggered here.
     *
     * @triggers SEARCH_QUERY_FULLPAGE
     *
     * @param string     $query
     * @param array      $highlight passed by reference; bound into the event data so that
     *                              the callback can hand the terms to highlight back
     * @param string     $sort      'mtime' sorts by modification time, anything else by
     *                              hit count; null defaults to 'hits'
     * @param int|string $after     only show results with mtime after this date,
     *                              accepts timestamp or strtotime arguments
     * @param int|string $before    only show results with mtime before this date,
     *                              accepts timestamp or strtotime arguments
     *
     * @return array
     */
    public static function pageSearch($query, &$highlight, $sort = null, $after = null, $before = null)
    {
        if ($sort === null) {
            $sort = 'hits';
        }
        $data = [
            'query' => $query,
            'sort' => $sort,
            'after' => $after,
            'before' => $before
        ];
        // bind by reference so that the value written by the callback reaches the caller
        $data['highlight'] =& $highlight;
        $action = static::class.'::callback_pageSearch';
        return Event::createAndTrigger('SEARCH_QUERY_FULLPAGE', $data, $action);
    }

    /**
     * Returns a list of matching documents for the given query
     *
     * The parsed query is a token list in postfix order that is evaluated
     * with a stack of "page id => hit count" arrays: word, phrase and
     * namespace tokens push a page set, AND/OR/NOT tokens combine the sets on
     * top of the stack. The final set is then reduced to visible, readable,
     * existing pages, filtered by modification time and sorted.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
     *
     * @triggers FULLTEXT_PHRASE_MATCH
     *
     * @param array $data  event data (keys: query, sort, after, before, highlight)
     * @return array       matching documents as page id => hit count
     */
    public static function callback_pageSearch($data)
    {
        $Indexer = PageIndex::getInstance();

        // parse the given query
        $q = QueryParser::convert($data['query']);
        // the 'highlight' slot is bound by reference in pageSearch()
        $data['highlight'] = $q['highlight'];

        if (empty($q['parsed_ary'])) return array();

        // lookup all words found in the query
        $lookup = $Indexer->lookup($q['words']);

        // get all pages in this dokuwiki site (!: includes nonexistent pages)
        $pages_all = array();
        foreach ($Indexer->getPages() as $id) {
            $pages_all[$id] = 0; // base: 0 hit
        }

        // process the query
        $stack = array();
        foreach ($q['parsed_ary'] as $token) {
            switch (substr($token, 0, 3)) {
                case 'W+:':
                case 'W-:':
                case 'W_:': // word
                    $word    = substr($token, 3);
                    // a word missing from the lookup result simply matches no page
                    $stack[] = isset($lookup[$word]) ? (array) $lookup[$word] : array();
                    break;
                case 'P+:':
                case 'P-:': // phrase
                    $phrase = substr($token, 3);
                    // since phrases are always parsed as ((W1)(W2)...(P)),
                    // the end($stack) always points the pages that contain
                    // all words in this phrase
                    $pages  = end($stack);
                    $pages_matched = array();
                    foreach (array_keys($pages) as $id) {
                        $evdata = array(
                            'id' => $id,
                            'phrase' => $phrase,
                            'text' => rawWiki($id)
                        );
                        $evt = new Event('FULLTEXT_PHRASE_MATCH', $evdata);
                        // default matching only runs if no handler prevented it
                        // and no handler already reported a match
                        if ($evt->advise_before() && $evt->result !== true) {
                            $text = Utf8\PhpString::strtolower($evdata['text']);
                            if (strpos($text, $phrase) !== false) {
                                $evt->result = true;
                            }
                        }
                        $evt->advise_after();
                        if ($evt->result === true) {
                            $pages_matched[$id] = 0; // phrase: always 0 hit
                        }
                    }
                    $stack[] = $pages_matched;
                    break;
                case 'N+:':
                case 'N-:': // namespace
                    $ns = cleanID(substr($token, 3)) . ':';
                    $pages_matched = array();
                    foreach (array_keys($pages_all) as $id) {
                        if (strpos($id, $ns) === 0) {
                            $pages_matched[$id] = 0; // namespace: always 0 hit
                        }
                    }
                    $stack[] = $pages_matched;
                    break;
                case 'AND': // and operation
                    list($pages1, $pages2) = array_splice($stack, -2);
                    $stack[] = static::resultCombine(array($pages1, $pages2));
                    break;
                case 'OR':  // or operation
                    list($pages1, $pages2) = array_splice($stack, -2);
                    $stack[] = static::resultUnite(array($pages1, $pages2));
                    break;
                case 'NOT': // not operation (unary)
                    $pages   = array_pop($stack);
                    $stack[] = static::resultComplement(array($pages_all, $pages));
                    break;
            }
        }
        $docs = array_pop($stack);

        if (empty($docs)) return array();

        // check: settings, acls, existence
        foreach (array_keys($docs) as $id) {
            if (isHiddenPage($id)
                || auth_quickaclcheck($id) < AUTH_READ
                || !page_exists($id, '', false)
            ) {
                unset($docs[$id]);
            }
        }

        $docs = static::filterResultsByTime($docs, $data['after'], $data['before']);

        if ($data['sort'] === 'mtime') {
            uksort($docs, static::class.'::pagemtimesorter');
        } else {
            // sort docs by count
            arsort($docs);
        }

        return $docs;
    }

    /**
     * Drops all results whose page file was modified outside the given time window
     *
     * A bound that is empty or that strtotime() cannot parse is ignored.
     *
     * @param array      $results search results in the form pageid => value
     * @param int|string $after   only returns results with mtime after this date,
     *                            accepts timestamp or strtotime arguments
     * @param int|string $before  only returns results with mtime before this date,
     *                            accepts timestamp or strtotime arguments
     *
     * @return array
     */
    protected static function filterResultsByTime(array $results, $after, $before)
    {
        if ($after || $before) {
            // integers are used as timestamps as they are; everything else goes
            // through strtotime(), except empty bounds: passing null to
            // strtotime() is deprecated and would yield false anyway
            $toTimestamp = static function ($limit) {
                if (is_int($limit)) {
                    return $limit;
                }
                if ($limit === null || $limit === false || $limit === '') {
                    return false;
                }
                return strtotime($limit);
            };
            $after = $toTimestamp($after);
            $before = $toTimestamp($before);

            foreach ($results as $id => $value) {
                $mTime = filemtime(wikiFN($id));
                if ($after && $after > $mTime) {
                    unset($results[$id]);
                    continue;
                }
                if ($before && $before < $mTime) {
                    unset($results[$id]);
                }
            }
        }
        return $results;
    }

    /**
     * Sort pages by their mtime, from newest to oldest
     *
     * Used as uksort() callback, hence both arguments are page ids.
     *
     * @param string $a
     * @param string $b
     *
     * @return int Returns < 0 if $a is newer than $b, > 0 if $b is newer than $a
     *             and 0 if they are of the same age
     */
    protected static function pagemtimesorter($a, $b)
    {
        $mtimeA = filemtime(wikiFN($a));
        $mtimeB = filemtime(wikiFN($b));
        return $mtimeB - $mtimeA;
    }

    /**
     * Creates a snippet extract
     *
     * Searches the raw page text for up to four groups of highlight terms,
     * cuts out each hit with some surrounding context, joins the pieces with
     * '... ' and wraps every term in <strong class="search_hit">. The result
     * is HTML escaped. Event handlers may replace the whole process by
     * preventing the default action and filling in $data['snippet'].
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @triggers FULLTEXT_SNIPPET_CREATE
     *
     * @param string $id page id
     * @param array $highlight terms to highlight
     * @return mixed the snippet stored in the event data (a string unless a handler changed it)
     */
    public static function snippet($id, $highlight)
    {
        $text = rawWiki($id);
        $text = str_replace("\xC2\xAD", '', $text); // remove soft-hyphens
        $evdata = array(
            'id'        => $id,
            'text'      => &$text,
            'highlight' => &$highlight,
            'snippet'   => '',
        );

        $evt = new Event('FULLTEXT_SNIPPET_CREATE', $evdata);
        if ($evt->advise_before()) {
            // turn the terms into regexp alternatives; terms that end up empty
            // after preprocessing are dropped, because an empty alternative
            // matches at every position and would wreck both the snippet
            // selection and the highlighting
            $patterns = array();
            foreach (array_filter((array) $highlight) as $term) {
                $pattern = static::snippet_re_preprocess(preg_quote_cb($term));
                if ($pattern !== '') {
                    $patterns[] = $pattern;
                }
            }

            if (empty($patterns)) {
                // nothing to highlight: fall back to the plain start of the page
                $snippet = hsc(Utf8\PhpString::substr($text, 0, 100));
            } else {
                $match = array();
                $snippets = array();
                $utf8_offset = $offset = $end = 0;
                $len = Utf8\PhpString::strlen($text);

                // build a regexp from the phrases to highlight
                $re1 = '(' . join('|', $patterns) . ')';
                // two resp. three *different* terms close to each other
                $re2 = "$re1.{0,75}(?!\\1)$re1";
                $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1";

                for ($cnt = 4; $cnt--;) {
                    // prefer the richest hit: three terms, then two, then a single one
                    $found = false;
                    foreach (array($re3, $re2, $re1) as $re) {
                        if (preg_match('/' . $re . '/iu', $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
                            $found = true;
                            break;
                        }
                    }
                    if (!$found) {
                        break;
                    }

                    list($str, $idx) = $match[0];

                    // convert $idx (a byte offset) into a utf8 character offset
                    $utf8_idx = Utf8\PhpString::strlen(substr($text, 0, $idx));
                    $utf8_len = Utf8\PhpString::strlen($str);

                    // establish context, 100 characters surrounding the match string
                    // first look to see if we can go 100 either side,
                    // then drop to 50 adding any excess if the other side can't go to 50,
                    $pre = min($utf8_idx - $utf8_offset, 100);
                    $post = min($len - $utf8_idx - $utf8_len, 100);

                    if ($pre > 50 && $post > 50) {
                        $pre = $post = 50;
                    } elseif ($pre > 50) {
                        $pre = min($pre, 100 - $post);
                    } elseif ($post > 50) {
                        $post = min($post, 100 - $pre);
                    } elseif ($offset == 0) {
                        // both are less than 50, means the context is the whole string
                        // make it so and break out of this loop - there is no need for the
                        // complex snippet calculations
                        $snippets = array($text);
                        break;
                    }

                    // establish context start and end points, try to append to previous
                    // context if possible
                    $start = $utf8_idx - $pre;
                    $append = ($start < $end) ? $end : false;  // still the end of the previous context snippet
                    $end = $utf8_idx + $utf8_len + $post;      // now set it to the end of this context

                    if ($append) {
                        $snippets[count($snippets) - 1] .= Utf8\PhpString::substr($text, $append, $end - $append);
                    } else {
                        $snippets[] = Utf8\PhpString::substr($text, $start, $end - $start);
                    }

                    // set $offset for next match attempt
                    // continue matching after the current match
                    // if the current match is not the longest possible match starting at the current offset
                    // this prevents further matching of this snippet but for possible matches of length
                    // smaller than match length + context (at least 50 characters) this match is part of
                    // the context
                    $utf8_offset = $utf8_idx + $utf8_len;
                    $offset = $idx + strlen(Utf8\PhpString::substr($text, $utf8_idx, $utf8_len));
                    $offset = Utf8\Clean::correctIdx($text, $offset);
                }

                // mark the terms with a control character first, so that the
                // markers survive the HTML escaping and can be turned into tags afterwards
                $m = "\1";
                $snippets = preg_replace('/' . $re1 . '/iu', $m . '$1' . $m, $snippets);
                $snippet = preg_replace(
                    '/' . $m . '([^' . $m . ']*?)' . $m . '/iu',
                    '<strong class="search_hit">$1</strong>',
                    hsc(join('... ', $snippets))
                );
            }

            $evdata['snippet'] = $snippet;
        }
        $evt->advise_after();
        unset($evt);

        return $evdata['snippet'];
    }

    /**
     * Wraps a search term in regex boundary checks.
     *
     * A leading or trailing (already regex quoted) '*' wildcard is stripped
     * and suppresses the boundary check on that side. A term that consists
     * of nothing but boundary checks afterwards is reduced to an empty string.
     *
     * @param string $term regex quoted search term
     * @return string regexp fragment, possibly empty
     */
    public static function snippet_re_preprocess($term)
    {
        // do not process asian terms where word boundaries are not explicit
        if (Utf8\Asian::isAsianWords($term)) return $term;

        if (UTF8_PROPERTYSUPPORT) {
            // unicode word boundaries
            // see http://stackoverflow.com/a/2449017/172068
            $BL = '(?<!\pL)';
            $BR = '(?!\pL)';
        } else {
            // not as correct as above, but at least won't break
            $BL = '\b';
            $BR = '\b';
        }

        if (substr($term, 0, 2) == '\\*') {
            $term = substr($term, 2);
        } else {
            $term = $BL.$term;
        }

        if (substr($term, -2, 2) == '\\*') {
            $term = substr($term, 0, -2);
        } else {
            $term = $term.$BR;
        }

        // nothing but boundary assertions left: the term itself was empty
        if ($term == $BL || $term == $BR || $term == $BL.$BR) {
            $term = '';
        }
        return $term;
    }

    /**
     * Combine found documents and sum up their scores
     *
     * This function is used to combine searched words with a logical
     * AND. Only documents available in all arrays are returned.
     *
     * based upon PEAR's PHP_Compat function for array_intersect_key()
     *
     * @param array $args An array of page arrays (page id => score)
     * @return array
     */
    protected static function resultCombine($args)
    {
        $array_count = count($args);
        if ($array_count == 1) {
            return $args[0];
        }

        $result = array();
        if ($array_count > 1) {
            foreach ($args[0] as $key => $value) {
                $result[$key] = $value;
                for ($i = 1; $i < $array_count; $i++) {
                    if (!isset($args[$i][$key])) {
                        // missing in one of the arrays: not part of the intersection
                        unset($result[$key]);
                        break;
                    }
                    $result[$key] += $args[$i][$key];
                }
            }
        }
        return $result;
    }

    /**
     * Unites found documents and sum up their scores
     * based upon resultCombine() method
     *
     * This function is used to combine searched words with a logical OR.
     * Documents found in any of the arrays are returned.
     *
     * @param array $args An array of page arrays (page id => score)
     * @return array
     *
     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
     */
    protected static function resultUnite($args)
    {
        $array_count = count($args);
        if ($array_count === 0) {
            return array();
        }
        if ($array_count === 1) {
            return $args[0];
        }

        $result = $args[0];
        for ($i = 1; $i < $array_count; $i++) {
            foreach ($args[$i] as $id => $score) {
                if (isset($result[$id])) {
                    $result[$id] += $score;
                } else {
                    // page only found by a later operand: start from its own score
                    // instead of adding to an undefined array key
                    $result[$id] = $score;
                }
            }
        }
        return $result;
    }

    /**
     * Computes the difference of documents using page id for comparison
     * nearly identical to PHP5's array_diff_key()
     *
     * Returns the entries of the first array whose page id is not set in any
     * of the other arrays.
     *
     * @param array $args An array of page arrays (page id => score)
     * @return array
     *
     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
     */
    protected static function resultComplement($args)
    {
        $array_count = count($args);
        if ($array_count === 1) {
            return $args[0];
        }

        $result = $args[0];
        foreach (array_keys($result) as $id) {
            for ($i = 1; $i < $array_count; $i++) {
                if (isset($args[$i][$id])) {
                    unset($result[$id]);
                    break; // already removed, no need to check the remaining arrays
                }
            }
        }
        return $result;
    }
}
463