%d%d';

    return sprintf(
        $xmlFormat,
        (string)$data['id'],
        escapeTextValue($data['title_to_index']),
        escapeTextValue($data['body']),
        escapeTextValue($data['namespace']),
        escapeTextValue($data['pagename']),
        (int)$data['level'],
        (int)$data['modified']
    );
}

/**
 * Clean text for CDATA inclusion.
 *
 * Invalid XML characters are stripped first and the CDATA terminator is
 * escaped afterwards, so that removing a control character can never
 * re-create a raw "]]>" sequence in the output.
 *
 * @param string|null $value Raw text; null and "" both yield "".
 * @return string Text that is safe to place inside a CDATA section.
 */
function escapeTextValue($value)
{
    if ($value === "" || $value === null) {
        return "";
    }

    // Remove illegal XML characters before escaping (see docblock for why the order matters).
    $clean = stripInvalidXml((string)$value);

    // "]]>" would end the CDATA section early. The standard XML workaround is to
    // close the section after "]]" and immediately reopen it before ">".
    return str_replace("]]>", "]]]]><![CDATA[>", $clean);
}

/**
 * Remove characters that are invalid in XML 1.0.
 *
 * @param string $value Text in (possibly malformed) UTF-8.
 * @return string Valid UTF-8 containing only characters allowed by XML 1.0.
 */
function stripInvalidXml(string $value): string
{
    // Strict comparison: empty() would also discard the legitimate text "0".
    if ($value === '') {
        return '';
    }

    // 1. Fix/reject invalid UTF-8 first. The /u patterns below return null on
    //    malformed input, which previously wiped out the whole text.
    $value = mb_convert_encoding($value, 'UTF-8', 'UTF-8');
    if (!is_string($value)) {
        return '';
    }

    // 2. Delete control characters except tab, newline, carriage return (DEL is deleted too).
    $value = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/u', '', $value) ?? '';

    // 3. Replace anything still outside the XML 1.0 character range with a space.
    return preg_replace(
        '/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]/u',
        ' ',
        $value
    ) ?? '';
}

/**
 * Extract sections based on Heading metadata.
 *
 * Builds one entry per row of $metadata['description']['tableofcontents'],
 * keyed by the row's 'hid'. Each entry carries the section text, the heading
 * level, the title, a display title ('title_text') and 'title_to_index'.
 *
 * @param string $id       Page id whose source file is read for the section text.
 * @param array  $metadata Page metadata; must contain description.tableofcontents.
 * @return array|false Map of hid => section data, or false when there is no TOC.
 */
function getDocumentsByHeadings($id, $metadata)
{
    if (empty($metadata) || empty($metadata['description']['tableofcontents'])) {
        return false;
    }

    $sections = [];
    // Reference level is never updated, so every heading deeper than level 1
    // is prefixed with the most recent non-nested title.
    $level = 1;
    $previous_title = '';

    foreach ($metadata['description']['tableofcontents'] as $row) {
        $isNested = $row['level'] > $level && !empty($previous_title);

        if ($isNested) {
            $titleText = $previous_title . " » " . $row['title'];
        } else {
            $titleText = $row['title'];
            $previous_title = $row['title'];
        }

        $sections[$row['hid']] = [
            'section' => getSectionByTitleLevel($id, $row['title']),
            'level' => $row['level'],
            'title' => $row['title'],
            'title_text' => $titleText,
            'title_to_index' => $row['title'],
        ];
    }

    return $sections;
}

/**
 * Regex based section extraction.
 *
 * Finds the first heading whose text matches $header (case-insensitive) and
 * returns the text between it and the next heading written with 4-6 "=" signs,
 * or the rest of the page when no such heading follows.
 *
 * @param string $id       Page id; resolved to a file with wikiFN().
 * @param string $header   Heading text to look for.
 * @param bool   $extended When true and the section is empty, return the text
 *                         of the section that follows the terminating heading.
 * @return string Trimmed section text, or '' when the page or heading is missing.
 */
function getSectionByTitleLevel($id, $header, $extended = false)
{
    $file = wikiFN($id);
    if (!file_exists($file)) {
        return '';
    }
    $doc = io_readFile($file);

    $headerReg = preg_quote($header, '/');
    $startRegex = "/(={1,6})\s*({$headerReg})\s*(={1,6})/i";
    $endRegex = '/(={4,6})(.*?)(={4,6})/i';

    if (!preg_match($startRegex, $doc, $matches, PREG_OFFSET_CAPTURE)) {
        return '';
    }
    $startHeaderPos = $matches[0][1] + strlen($matches[0][0]);

    // Search for the terminating heading strictly AFTER the start heading.
    // Looking it up from the top of the page could hit an identical heading
    // earlier in the document and yield a negative length.
    $endHeader = null;
    $endHeaderPos = null;
    if (preg_match($endRegex, $doc, $matches, PREG_OFFSET_CAPTURE, $startHeaderPos)) {
        $endHeader = $matches[0][0];
        $endHeaderPos = $matches[0][1];
        $section = substr($doc, $startHeaderPos, $endHeaderPos - $startHeaderPos);
    } else {
        $section = substr($doc, $startHeaderPos);
    }
    $section = trim($section);

    // Strict check: a section whose whole body is "0" is not empty.
    if ($extended && $section === '' && $endHeader !== null) {
        $startHeaderPos = $endHeaderPos + strlen($endHeader);
        if (preg_match($endRegex, $doc, $matches, PREG_OFFSET_CAPTURE, $startHeaderPos)) {
            $section = substr($doc, $startHeaderPos, $matches[0][1] - $startHeaderPos);
        } else {
            $section = substr($doc, $startHeaderPos);
        }
    }

    return trim($section);
}

/**
 * Modernized section extractor using DokuWiki internal parser helpers.
 *
 * Parser instructions and raw text are cached per page id for the duration
 * of the request (static locals).
 *
 * @param string $id     Page id; resolved to a file with wikiFN().
 * @param string $header Heading text, compared against the trimmed header instruction text.
 * @return string Raw text from the header position to the next section_close/header
 *                instruction, the rest of the page when none follows, or ''
 *                when the page or heading is missing.
 */
function getSection($id, $header)
{
    static $cacheInstructions = [];
    static $cacheDoc = [];

    // isset() rather than empty(): an empty page is still a valid cache entry.
    if (!isset($cacheDoc[$id])) {
        $file = wikiFN($id);
        if (!file_exists($file)) {
            return '';
        }
        $doc = io_readFile($file);
        // Use standard DokuWiki helper to get instructions
        $cacheInstructions[$id] = p_get_instructions($doc);
        $cacheDoc[$id] = $doc;
    }
    $instructions = $cacheInstructions[$id];
    $doc = $cacheDoc[$id];

    $startPos = null;
    $endPos = null;
    foreach ($instructions as $instruction) {
        if ($startPos === null) {
            if ($instruction[0] === 'header' && trim($instruction[1][0]) === (string)$header) {
                $startPos = $instruction[2];
            }
            continue;
        }
        if ($instruction[0] === 'section_close' || $instruction[0] === 'header') {
            $endPos = $instruction[2];
            break;
        }
    }

    if ($startPos === null) {
        return '';
    }

    // Same normalisation as before: pad with newlines and unify line endings
    // before applying the instruction positions.
    $doc = "\n" . str_replace("\r\n", "\n", $doc) . "\n";

    // No closing instruction found: take everything to the end instead of
    // passing a negative length to substr().
    if ($endPos === null) {
        return substr($doc, $startPos);
    }

    return substr($doc, $startPos, $endPos - $startPos);
}

/**
 * List every ancestor namespace of a page id, deepest first.
 *
 * Example: "a:b:c" yields "a:b a".
 *
 * @param string $id Page id.
 * @return string Space-separated namespaces, or '' when the id has no namespace.
 */
function getCategories($id)
{
    if (empty($id)) {
        return '';
    }
    if (!str_contains($id, ':')) {
        return '';
    }

    $ns = explode(':', $id);
    array_pop($ns); // Remove the pagename

    $result = [];
    while ($ns !== []) {
        $result[] = implode(':', $ns);
        array_pop($ns);
    }

    return implode(' ', $result);
}

/**
 * Return the last colon-separated segment of a page id.
 *
 * @param string $id Page id.
 * @return string Page name without its namespace, or '' for an empty id.
 */
function getPagename($id)
{
    // Compare as string: empty() would wrongly reject a page literally named "0".
    if ((string)$id === '') {
        return '';
    }

    $parts = explode(':', $id);

    return end($parts);
}

/**
 * Collect all pages below $conf['datadir'], ignoring ACLs, in sorted order.
 *
 * @return array Result rows filled in by DokuWiki's search() with the
 *               'search_allpages' callback.
 */
function getPagesList()
{
    global $conf;

    $data = [];
    search($data, $conf['datadir'], 'search_allpages', ['skipacl' => 1], '');
    sort($data);

    return $data;
}

/**
 * Generate namespace breadcrumbs for search results.
 *
 * For every prefix of the page id a search link restricted to that namespace
 * is built; the title of each crumb is the excerpt returned by $search, or
 * the plain id segment when no excerpt is available.
 *
 * @param string $id       Page id.
 * @param string $keywords Search keywords used in the links and for highlighting.
 * @param object $search   NOTE(review): presumably the plugin's Sphinx search
 *                         wrapper; must provide getExcerpt() and starQuery().
 * @return array List of ['link' => string, 'title' => string].
 */
function getNsLinks($id, $keywords, $search)
{
    $parts = explode(':', $id);
    $count = count($parts);
    $part = '';
    $data = [];
    $titles = [];

    for ($i = 0; $i < $count; $i++) {
        $part .= ($i > 0 ? ':' : '') . $parts[$i];
        $page = $part;
        resolve_pageid('', $page, $exists);
        // Drop a trailing ":start" (6 characters) so the link targets the namespace itself.
        if (str_ends_with($page, ':start')) {
            $page = substr($page, 0, -6);
        }
        $titles[] = $parts[$i];
        $data[] = ['link' => "?do=search&id=" . urlencode($keywords . " @ns $page")];
    }

    // Sphinx excerpt for the breadcrumbs
    $titleExcerpts = $search->getExcerpt($titles, $search->starQuery($keywords));
    foreach ($data as $key => $val) {
        $data[$key]['title'] = $titleExcerpts[$key] ?? $titles[$key];
    }

    return $data;
}

/**
 * Print the "Matching pagenames" block.
 *
 * NOTE(review): the echoed strings contain no markup and $limit is not used in
 * the code visible here; the original HTML/loop may be missing from this copy.
 * Confirm against the upstream file before relying on this function.
 *
 * @param array $pageNames Matching page names.
 * @return false|void False when $pageNames is empty; otherwise nothing.
 */
function printNamespacesNew($pageNames)
{
    if (empty($pageNames)) {
        return false;
    }

    $limit = 10;
    echo '

Matching pagenames

';
    echo '';
}

if (!function_exists('shorten')) {
    /**
     * Join $keep and $short, shortening $short in the middle when the result
     * would exceed $max characters.
     *
     * @param string $keep  Part that is always kept in full.
     * @param string $short Part that may be shortened.
     * @param int    $max   Maximum total length.
     * @param int    $min   Minimum room that must remain for $short; with less
     *                      room only $keep is returned.
     * @param string $char  Placeholder inserted where text was cut.
     * @return string
     */
    function shorten($keep, $short, $max, $min = 9, $char = '…')
    {
        $keepLen = utf8_strlen($keep);
        $max = $max - $keepLen;
        if ($max < $min) {
            return $keep;
        }

        $len = utf8_strlen($short);
        if ($len <= $max) {
            return $keep . $short;
        }

        // Keep roughly half of the remaining room from each end of $short.
        $half = (int)floor($max / 2);

        return $keep . utf8_substr($short, 0, $half - 1) . $char . utf8_substr($short, $len - $half);
    }
}

/**
 * Robust way to get searchable plain text by using XHTML and stripping noise.
 * Added for SphinxSearch indexing compatibility.
 *
 * NOTE(review): the three patterns below have no opening tag in this copy, so
 * the first one removes everything up to each closing span on a line. They
 * look truncated; confirm against the upstream file.
 *
 * @param string $wikitext Raw wiki markup.
 * @return string Plain text with tags removed and HTML entities decoded.
 */
function get_clean_text($wikitext)
{
    $info = [];
    // The @ suppresses warnings from incompatible DokuWiki plugins during render
    $html = @p_render('xhtml', p_get_instructions($wikitext), $info);

    // Remove code block line numbers
    $html = preg_replace('/.*?<\/span>/', '', $html);
    // Remove Table of Contents
    $html = preg_replace('/
.*?<\/div>/s', '', $html);
    // Remove "Download" buttons and extra code block labels
    $html = preg_replace('/
.*?<\/div>/s', '', $html);

    $text = strip_tags($html);

    return htmlspecialchars_decode($text);
}