<?php
/**
 * Modernized helper functions for SphinxSearch plugin
 *
 * Functions such as wikiFN(), io_readFile(), p_get_instructions(), p_render(),
 * search(), resolve_pageid(), wl() and hsc() are not defined in this file and
 * are expected to be supplied by the host application (DokuWiki).
 */

/**
 * Format one document for Sphinx XMLpipe2.
 *
 * Reads the keys `id`, `title_to_index`, `body`, `namespace`, `pagename`,
 * `level` and `modified` from $data. Missing keys are treated as empty / 0
 * instead of raising "undefined index" warnings.
 *
 * @param array $data document fields
 * @return string a single <sphinx:document> element with no leading whitespace
 */
function formatXml(array $data): string
{
    // NO whitespace at the beginning
    $xmlFormat = '<sphinx:document id="%s">'
        . '<title><![CDATA[%s]]></title>'
        . '<body><![CDATA[%s]]></body>'
        . '<namespace><![CDATA[%s]]></namespace>'
        . '<pagename><![CDATA[%s]]></pagename>'
        . '<level>%d</level>'
        . '<modified>%d</modified>'
        . '</sphinx:document>';

    return sprintf(
        $xmlFormat,
        // The id lands inside an attribute, so it needs attribute escaping
        // (a no-op for purely numeric ids).
        htmlspecialchars((string)($data['id'] ?? ''), ENT_QUOTES | ENT_XML1, 'UTF-8'),
        escapeTextValue($data['title_to_index'] ?? ''),
        escapeTextValue($data['body'] ?? ''),
        escapeTextValue($data['namespace'] ?? ''),
        escapeTextValue($data['pagename'] ?? ''),
        (int)($data['level'] ?? 0),
        (int)($data['modified'] ?? 0)
    );
}

/**
 * Clean text for inclusion inside a CDATA section.
 *
 * @param string|null $value raw text
 * @return string text that is safe to place between <![CDATA[ and ]]>
 */
function escapeTextValue($value): string
{
    if ($value === '' || $value === null) {
        return '';
    }

    // Strip first: removing a control character can join "]]" and ">" into a
    // fresh CDATA terminator, so the terminator must be handled afterwards.
    $value = stripInvalidXml((string)$value);

    // "]]>" cannot appear inside CDATA. Split it across two adjacent CDATA
    // sections so the parser reassembles the original text.
    return str_replace(']]>', ']]]]><![CDATA[>', $value);
}

/**
 * Remove characters that are invalid in XML 1.0.
 *
 * @param string $value
 * @return string valid UTF-8 containing only XML 1.0 characters
 */
function stripInvalidXml(string $value): string
{
    // Strict comparison: empty() would also discard the valid string "0".
    if ($value === '') {
        return '';
    }

    // 1. Repair invalid UTF-8 first; a /u regex returns null on malformed
    //    input, which would otherwise wipe the whole text.
    $value = (string)mb_convert_encoding($value, 'UTF-8', 'UTF-8');

    // 2. Remove control characters except tab, newline, carriage return.
    //    These are single bytes that never occur inside a multi-byte UTF-8
    //    sequence, so a byte-wise match is sufficient.
    $value = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/', '', $value) ?? '';

    // 3. Replace anything else outside the XML 1.0 character range by a space
    return preg_replace(
        '/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]/u',
        ' ',
        $value
    ) ?? '';
}

/**
 * Extract sections based on heading metadata.
 *
 * Iterates $metadata['description']['tableofcontents'] and builds one entry
 * per row, keyed by the row's `hid`.
 *
 * @param string $id       page id
 * @param array  $metadata page metadata
 * @return array|false map of hid => [section, level, title, title_text,
 *                     title_to_index], or false when there is no table of contents
 */
function getDocumentsByHeadings($id, $metadata)
{
    if (empty($metadata) || empty($metadata['description']['tableofcontents'])) {
        return false;
    }

    // Rows deeper than this level get the last remembered title as a prefix.
    // NOTE(review): this threshold is never updated while iterating — confirm
    // that a fixed top level of 1 is intended.
    $level = 1;
    $previousTitle = '';
    $sections = [];

    foreach ($metadata['description']['tableofcontents'] as $row) {
        $title = $row['title'];
        $isNested = $row['level'] > $level && !empty($previousTitle);

        if ($isNested) {
            $titleText = $previousTitle . " » " . $title;
        } else {
            $titleText = $title;
            $previousTitle = $title;
        }

        $sections[$row['hid']] = [
            'section' => getSectionByTitleLevel($id, $title),
            'level' => $row['level'],
            'title' => $title,
            'title_text' => $titleText,
            'title_to_index' => $title,
        ];
    }

    return $sections;
}

/**
 * Locate the next section-ending heading (4 to 6 "=" on each side) at or
 * after a byte offset.
 *
 * @param string $doc    raw page text
 * @param int    $offset byte offset to start searching from
 * @return array|null ['pos' => byte position, 'len' => byte length] or null
 */
function sphinxFindSectionEnd(string $doc, int $offset): ?array
{
    if (preg_match('/(={4,6})(.*?)(={4,6})/i', $doc, $match, PREG_OFFSET_CAPTURE, $offset)) {
        return ['pos' => $match[0][1], 'len' => strlen($match[0][0])];
    }

    return null;
}

/**
 * Regex based section extraction.
 *
 * Finds the first heading whose text matches $header (case-insensitive) and
 * returns the text between it and the next heading with 4-6 "=" signs, or the
 * rest of the document when no such heading follows.
 *
 * @param string $id       page id
 * @param string $header   heading text to look for
 * @param bool   $extended when true and the section is empty, return the
 *                         content of the following section instead
 * @return string trimmed section text, '' if the file or heading is missing
 */
function getSectionByTitleLevel($id, $header, $extended = false)
{
    $file = wikiFN($id);
    if (!file_exists($file)) {
        return '';
    }

    $doc = (string)io_readFile($file);
    $headerPattern = '/(={1,6})\s*(' . preg_quote((string)$header, '/') . ')\s*(={1,6})/i';

    if (!preg_match($headerPattern, $doc, $match, PREG_OFFSET_CAPTURE)) {
        return '';
    }

    $contentStart = $match[0][1] + strlen($match[0][0]);

    // Search forward from the end of the start heading. Looking the end
    // heading up from the top of the document would hit an identical heading
    // that appears earlier and produce a wrong or negative-length slice.
    $end = sphinxFindSectionEnd($doc, $contentStart);
    $section = $end === null
        ? substr($doc, $contentStart)
        : substr($doc, $contentStart, $end['pos'] - $contentStart);
    $section = trim($section);

    if ($extended && $section === '' && $end !== null) {
        // Nothing between the two headings: fall through to the next section.
        $contentStart = $end['pos'] + $end['len'];
        $next = sphinxFindSectionEnd($doc, $contentStart);
        $section = $next === null
            ? substr($doc, $contentStart)
            : substr($doc, $contentStart, $next['pos'] - $contentStart);
    }

    return trim($section);
}

/**
 * Modernized section extractor using DokuWiki internal parser helpers.
 *
 * Walks the parser instructions for the page, starts at the first `header`
 * instruction whose trimmed text equals $header and stops at the next
 * `section_close` or `header` instruction. Parsed pages are cached per id for
 * the lifetime of the request.
 *
 * @param string $id     page id
 * @param string $header heading text
 * @return string the slice of the page between the two instruction positions,
 *                '' when the file or the heading does not exist
 */
function getSection($id, $header)
{
    static $cacheInstructions = [];
    static $cacheDoc = [];

    // isset() rather than empty(): an empty document is still a cache hit.
    if (!isset($cacheDoc[$id])) {
        $file = wikiFN($id);
        if (!file_exists($file)) {
            return '';
        }

        $doc = (string)io_readFile($file);
        // Use standard DokuWiki helper to get instructions
        $cacheInstructions[$id] = p_get_instructions($doc) ?? [];
        $cacheDoc[$id] = $doc;
    }

    $instructions = $cacheInstructions[$id];
    $doc = $cacheDoc[$id];

    $startPos = null;
    $endPos = null;

    foreach ($instructions as $instruction) {
        $name = $instruction[0];

        if ($startPos === null) {
            if ($name === 'header' && trim((string)$instruction[1][0]) === (string)$header) {
                $startPos = $instruction[2];
            }
            continue;
        }

        if ($name === 'section_close' || $name === 'header') {
            $endPos = $instruction[2];
            break;
        }
    }

    if ($startPos === null) {
        return '';
    }

    // NOTE(review): presumably mirrors the normalisation the parser applies
    // before computing instruction positions, so the offsets line up — confirm.
    $doc = "\n" . str_replace("\r\n", "\n", $doc) . "\n";

    // No closing instruction after the header: take everything up to the end
    // instead of passing a negative length to substr().
    if ($endPos === null) {
        return substr($doc, $startPos);
    }

    return substr($doc, $startPos, $endPos - $startPos);
}

/**
 * List every ancestor namespace of a page id, deepest first.
 *
 * Example: "a:b:c" yields "a:b a".
 *
 * @param string $id page id
 * @return string space separated namespaces, '' if the id has no namespace
 */
function getCategories($id)
{
    $id = (string)$id;
    if (!str_contains($id, ':')) {
        return '';
    }

    $segments = explode(':', $id);
    array_pop($segments); // Remove the pagename

    $namespaces = [];
    for ($depth = count($segments); $depth > 0; $depth--) {
        $namespaces[] = implode(':', array_slice($segments, 0, $depth));
    }

    return implode(' ', $namespaces);
}

/**
 * Return the last colon-separated segment of a page id.
 *
 * @param string $id page id
 * @return string page name, '' for an empty id
 */
function getPagename($id)
{
    $id = (string)$id;
    // Strict comparison so the page id "0" is not mistaken for "empty".
    if ($id === '') {
        return '';
    }

    $parts = explode(':', $id);

    return end($parts);
}

/**
 * Collect all pages below the configured data directory, sorted.
 *
 * @return array result entries as produced by search() with the
 *               'search_allpages' callback
 */
function getPagesList()
{
    global $conf;

    $pages = [];
    search($pages, $conf['datadir'], 'search_allpages', ['skipacl' => 1], '');
    sort($pages);

    return $pages;
}

/**
 * Generate namespace breadcrumbs for search results.
 *
 * Builds one entry per segment of $id. Each entry links to a search for
 * $keywords restricted (via "@ns") to the namespace up to that segment, and is
 * titled with the excerpt returned by $search->getExcerpt(), falling back to
 * the plain segment name.
 *
 * @param string $id       page id
 * @param string $keywords search keywords
 * @param object $search   object providing getExcerpt() and starQuery()
 * @return array list of ['link' => string, 'title' => string]
 */
function getNsLinks($id, $keywords, $search)
{
    $segments = explode(':', $id);

    $path = '';
    $crumbs = [];
    $titles = [];

    foreach ($segments as $index => $segment) {
        $path .= ($index > 0 ? ':' : '') . $segment;

        $page = $path;
        $exists = null;
        resolve_pageid('', $page, $exists);

        // Drop a trailing ":start" (6 characters) so the namespace itself is used
        if (str_ends_with($page, ':start')) {
            $page = substr($page, 0, -6);
        }

        $titles[] = $segment;
        $crumbs[] = ['link' => "?do=search&id=" . urlencode($keywords . " @ns $page")];
    }

    // Sphinx excerpt for the breadcrumbs
    $titleExcerpts = $search->getExcerpt($titles, $search->starQuery($keywords));

    foreach ($crumbs as $key => $crumb) {
        $crumbs[$key]['title'] = $titleExcerpts[$key] ?? $titles[$key];
    }

    return $crumbs;
}

/**
 * Print an HTML list linking to at most 10 of the given pages.
 *
 * Only the array keys (page ids) are used; the values are ignored.
 *
 * @param array $pageNames map of page id => header
 * @return false|null false when $pageNames is empty, otherwise nothing
 */
function printNamespacesNew($pageNames)
{
    if (empty($pageNames)) {
        return false;
    }

    $limit = 10;

    echo '<h2>Matching pagenames</h2>';
    echo '<ul>';
    foreach (array_slice(array_keys($pageNames), 0, $limit) as $id) {
        echo '<li>';
        echo '<a href="' . wl($id) . '" class="wikilink1">' . hsc($id) . '</a>';
        echo '</li>';
    }
    echo '</ul>';
}

if (!function_exists('shorten')) {
    /**
     * Shorten $short by cutting out its middle so that $keep . $short fits
     * into $max characters.
     *
     * @param string $keep  prefix that is always kept in full
     * @param string $short text that may be shortened
     * @param int    $max   maximum total length in characters
     * @param int    $min   minimum room required for $short; below that only
     *                      $keep is returned
     * @param string $char  placeholder inserted where text was removed
     * @return string
     */
    function shorten($keep, $short, $max, $min = 9, $char = '…')
    {
        $keep = (string)$keep;
        $short = (string)$short;

        $max = $max - mb_strlen($keep, 'UTF-8');
        if ($max < $min) {
            return $keep;
        }

        $len = mb_strlen($short, 'UTF-8');
        if ($len <= $max) {
            return $keep . $short;
        }

        $half = (int)floor($max / 2);
        // Clamp so a tiny $max cannot turn the head length negative, which
        // substr-style functions interpret as "all but the last N characters".
        $head = mb_substr($short, 0, max(0, $half - 1), 'UTF-8');
        $tail = mb_substr($short, $len - $half, null, 'UTF-8');

        return $keep . $head . $char . $tail;
    }
}

/**
 * Robust way to get searchable plain text by using XHTML and stripping noise.
 * Added for SphinxSearch indexing compatibility.
 *
 * @param string $wikitext raw wiki markup
 * @return string plain text with tags removed and basic entities decoded
 */
function get_clean_text($wikitext)
{
    $info = [];
    // The @ suppresses warnings from incompatible DokuWiki plugins during render
    $html = (string)@p_render('xhtml', p_get_instructions($wikitext), $info);

    $noisePatterns = [
        // Remove code block line numbers
        '/<span class="ln">.*?<\/span>/',
        // Remove Table of Contents
        '/<div class="tableofcontents">.*?<\/div>/s',
        // Remove "Download" buttons and extra code block labels
        '/<div class="xtra">.*?<\/div>/s',
    ];

    foreach ($noisePatterns as $pattern) {
        // Keep the previous text if the regex engine reports an error (null).
        $html = preg_replace($pattern, '', $html) ?? $html;
    }

    $text = strip_tags($html);

    // Explicit flags: the default changed between PHP 8.0 and 8.1, and
    // ENT_QUOTES is needed to decode &#039; as well.
    return htmlspecialchars_decode($text, ENT_QUOTES);
}