<?php

namespace dokuwiki\Search;

use dokuwiki\Extension\Event;
use dokuwiki\Search\Collection\CollectionSearch;
use dokuwiki\Search\Collection\PageFulltextCollection;
use dokuwiki\Search\Query\QueryEvaluator;
use dokuwiki\Search\Query\QueryParser;
use dokuwiki\Utf8;

/**
 * DokuWiki Fulltext Search
 *
 * Runs fulltext queries against the page index and builds highlighted
 * text snippets for matching pages.
 *
 * @license GPL 2 (http://www.gnu.org/licenses/gpl.html)
 * @author Andreas Gohr <andi@splitbrain.org>
 */
class FulltextSearch
{
    /** @var int Maximum number of results to generate snippets for */
    protected int $maxSnippets = 15;

    /**
     * Get the maximum number of results to generate snippets for
     *
     * @return int
     */
    public function getMaxSnippets(): int
    {
        return $this->maxSnippets;
    }

    /**
     * Set the maximum number of results to generate snippets for
     *
     * @param int $maxSnippets
     */
    public function setMaxSnippets(int $maxSnippets): void
    {
        $this->maxSnippets = $maxSnippets;
    }

    /**
     * The fulltext search
     *
     * Returns a list of matching documents for the given query. The actual
     * work is done in pageSearchCallBack(), which is registered as the
     * default action of the SEARCH_QUERY_FULLPAGE event.
     *
     * @triggers SEARCH_QUERY_FULLPAGE
     *
     * @param string $query the search query string
     * @param array $highlight will be filled with terms to highlight
     * @param string $sort sort mode: 'hits' (default) or 'mtime'
     * @param int|string $after only show results with mtime after this date,
     *                          accepts timestamp or strtotime arguments
     * @param int|string $before only show results with mtime before this date,
     *                           accepts timestamp or strtotime arguments
     *
     * @return array matching documents as pageid => score
     */
    public function pageSearch($query, &$highlight, $sort = null, $after = null, $before = null)
    {
        $data = [
            'query' => $query,
            'sort' => $sort ?? 'hits',
            'after' => $after,
            'before' => $before,
        ];
        // bound by reference so the highlight terms reach the caller's variable
        $data['highlight'] =& $highlight;

        return Event::createAndTrigger('SEARCH_QUERY_FULLPAGE', $data, [$this, 'pageSearchCallBack']);
    }

    /**
     * Returns a list of matching documents for the given query
     *
     * Reads 'query', 'sort', 'after' and 'before' from the event data and
     * writes the terms to highlight back into 'highlight'.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @author Kazutaka Miyasaka <kazmiya@gmail.com>
     *
     * @param array $data event data
     * @return array matching documents as pageid => score
     */
    public function pageSearchCallBack(&$data)
    {
        // parse the given query
        $q = (new QueryParser())->convert($data['query']);
        $data['highlight'] = $q['highlight'];

        if (empty($q['parsed_ary'])) {
            return [];
        }

        // look up all words via CollectionSearch
        $search = new CollectionSearch(new PageFulltextCollection());
        foreach ($q['words'] as $word) {
            if (Tokenizer::isValidSearchTerm($word)) {
                $search->addTerm($word);
            }
        }
        $terms = $search->execute();

        // evaluate the query
        $docs = (new QueryEvaluator($q['parsed_ary'], $terms))->evaluate();
        if (empty($docs)) {
            return [];
        }

        // filter by visibility, acls, existence, and time range
        $docs = MetadataSearch::filterPages($docs, false, $data['after'], $data['before']);

        if ($data['sort'] === 'mtime') {
            // read each modification time once instead of stat-ing two files
            // on every comparison the sort performs
            $mtimes = [];
            foreach (array_keys($docs) as $id) {
                $mtimes[$id] = (int) filemtime(wikiFN($id));
            }
            // newest page first
            uksort($docs, static function ($a, $b) use ($mtimes) {
                return $mtimes[$b] <=> $mtimes[$a];
            });
        } else {
            // highest score first
            arsort($docs);
        }

        return $docs;
    }

    /**
     * Creates a snippet extract
     *
     * Loads the raw text of the page, strips soft hyphens and, unless the
     * default action of FULLTEXT_SNIPPET_CREATE is prevented, builds an HTML
     * snippet in which the highlight terms are wrapped in
     * <strong class="search_hit">. When no usable highlight term is given or
     * nothing matches, the built snippet is an empty string.
     *
     * @author Andreas Gohr <andi@splitbrain.org>
     * @triggers FULLTEXT_SNIPPET_CREATE
     *
     * @param string $id page id
     * @param array $highlight
     * @return mixed the 'snippet' entry of the event data
     */
    public function snippet($id, $highlight)
    {
        $text = rawWiki($id);
        $text = str_replace("\xC2\xAD", '', $text); // remove soft-hyphens
        $evdata = [
            'id' => $id,
            'text' => &$text,
            'highlight' => &$highlight,
            'snippet' => '',
        ];

        $evt = new Event('FULLTEXT_SNIPPET_CREATE', $evdata);
        if ($evt->advise_before()) {
            // turn the phrases to highlight into regexp alternatives
            $patterns = array_map(
                [$this, 'snippetRePreprocess'],
                array_map('preg_quote_cb', array_filter((array) $highlight))
            );
            // snippetRePreprocess() returns '' for terms that consist of wildcards
            // only. An empty alternative matches at every position, would shadow
            // the real terms and litter the result with empty highlight tags.
            $patterns = array_filter($patterns, static function ($pattern) {
                return $pattern !== '';
            });

            $evdata['snippet'] = ($patterns === []) ? '' : $this->buildSnippetHtml($text, $patterns);
        }
        $evt->advise_after();
        unset($evt);

        return $evdata['snippet'];
    }

    /**
     * Cuts up to four context windows around matches out of the text and marks the hits.
     *
     * @param string $text the text to extract from
     * @param string[] $patterns non-empty regexp fragments, one per highlight term
     * @return string HTML escaped snippet, matches wrapped in <strong class="search_hit">
     */
    private function buildSnippetHtml($text, array $patterns)
    {
        $re1 = '(' . implode('|', $patterns) . ')';
        $re2 = "$re1.{0,75}(?!\\1)$re1";
        $re3 = "$re1.{0,45}(?!\\1)$re1.{0,45}(?!\\1)(?!\\2)$re1";

        // prefer places where three different terms are close together,
        // then two, then a single term
        $regexes = ['/' . $re3 . '/iu', '/' . $re2 . '/iu', '/' . $re1 . '/iu'];

        $snippets = [];
        $utf8_offset = $offset = $end = 0;
        $len = Utf8\PhpString::strlen($text);

        for ($round = 0; $round < 4; $round++) {
            $match = [];
            $found = false;
            foreach ($regexes as $regex) {
                if (preg_match($regex, $text, $match, PREG_OFFSET_CAPTURE, $offset)) {
                    $found = true;
                    break;
                }
            }
            if (!$found) {
                break;
            }

            [$str, $idx] = $match[0];

            // convert $idx (a byte offset) into a utf8 character offset
            $utf8_idx = Utf8\PhpString::strlen(substr($text, 0, $idx));
            $utf8_len = Utf8\PhpString::strlen($str);

            // establish context, 100 characters surrounding the match string
            // first look to see if we can go 100 either side,
            // then drop to 50 adding any excess if the other side can't go to 50,
            $pre = min($utf8_idx - $utf8_offset, 100);
            $post = min($len - $utf8_idx - $utf8_len, 100);

            if ($pre > 50 && $post > 50) {
                $pre = $post = 50;
            } elseif ($pre > 50) {
                $pre = min($pre, 100 - $post);
            } elseif ($post > 50) {
                $post = min($post, 100 - $pre);
            } elseif ($offset == 0) {
                // both are less than 50, means the context is the whole string
                // make it so and break out of this loop - there is no need for the
                // complex snippet calculations
                $snippets = [$text];
                break;
            }

            // establish context start and end points, try to append to previous
            // context if possible
            $start = $utf8_idx - $pre;
            $append = ($start < $end) ? $end : false; // still the end of the previous context snippet
            $end = $utf8_idx + $utf8_len + $post; // now set it to the end of this context

            if ($append) {
                $snippets[count($snippets) - 1] .= Utf8\PhpString::substr($text, $append, $end - $append);
            } else {
                $snippets[] = Utf8\PhpString::substr($text, $start, $end - $start);
            }

            // continue matching right after the current match; $str holds exactly
            // the matched bytes, so its byte length moves the byte offset forward
            $utf8_offset = $utf8_idx + $utf8_len;
            $offset = Utf8\Clean::correctIdx($text, $idx + strlen($str));
        }

        // wrap every hit in a control character first, so that the markers
        // pass through hsc() and can be swapped for real markup afterwards
        $m = "\x01";
        $snippets = preg_replace('/' . $re1 . '/iu', $m . '$1' . $m, $snippets);

        return preg_replace(
            '/' . $m . '([^' . $m . ']*?)' . $m . '/iu',
            '<strong class="search_hit">$1</strong>',
            hsc(implode('... ', $snippets))
        );
    }

    /**
     * Wraps a search term in regex boundary checks.
     *
     * A quoted leading or trailing asterisk (\*) is removed and no boundary
     * check is added on that side. Terms that are empty once the wildcards
     * are removed result in an empty string.
     *
     * @param string $term a search term that is already regexp quoted
     * @return string
     */
    public function snippetRePreprocess($term)
    {
        // do not process asian terms where word boundaries are not explicit
        if (Utf8\Asian::isAsianWords($term)) {
            return $term;
        }

        if (UTF8_PROPERTYSUPPORT) {
            // unicode word boundaries
            // see http://stackoverflow.com/a/2449017/172068
            $BL = '(?<!\pL)';
            $BR = '(?!\pL)';
        } else {
            // not as correct as above, but at least won't break
            $BL = '\b';
            $BR = '\b';
        }

        // leading wildcard: drop it, otherwise require a boundary on the left
        if (substr($term, 0, 2) === '\\*') {
            $term = substr($term, 2);
        } else {
            $term = $BL . $term;
        }

        // trailing wildcard: drop it, otherwise require a boundary on the right
        if (substr($term, -2, 2) === '\\*') {
            $term = substr($term, 0, -2);
        } else {
            $term = $term . $BR;
        }

        // nothing but boundary checks left, the term itself was empty
        if ($term === $BL || $term === $BR || $term === $BL . $BR) {
            $term = '';
        }
        return $term;
    }
}