1<?php
2/**
3 * Modernized helper functions for SphinxSearch plugin
4 */
5
/**
 * Render one document as a Sphinx xmlpipe2 `<sphinx:document>` element.
 *
 * The text fields (title_to_index, body, namespace, pagename) are passed
 * through escapeTextValue() and wrapped in CDATA; level and modified are
 * cast to integers; id is cast to string and used as the element's id
 * attribute.
 *
 * @param array $data Document fields: id, title_to_index, body, namespace,
 *                    pagename, level, modified
 * @return string The element on a single line, with no leading whitespace
 */
function formatXml(array $data): string
{
    // Wraps an escaped value in a CDATA section.
    $cdata = static fn ($raw): string => '<![CDATA[' . escapeTextValue($raw) . ']]>';

    // The output must not start with whitespace, so the pieces are joined directly.
    return '<sphinx:document id="' . (string)$data['id'] . '">'
        . '<title>' . $cdata($data['title_to_index']) . '</title>'
        . '<body>' . $cdata($data['body']) . '</body>'
        . '<namespace>' . $cdata($data['namespace']) . '</namespace>'
        . '<pagename>' . $cdata($data['pagename']) . '</pagename>'
        . '<level>' . (int)$data['level'] . '</level>'
        . '<modified>' . (int)$data['modified'] . '</modified>'
        . '</sphinx:document>';
}
28
29
/**
 * Clean text for CDATA inclusion
 *
 * Strips characters that are not allowed in XML and then neutralises the
 * CDATA end marker. The order matters: stripping removes control characters
 * without leaving a placeholder, so an input such as "]]\x01>" only turns
 * into "]]>" after stripping. Escaping beforehand would miss it and the
 * resulting marker would terminate the surrounding CDATA section early.
 *
 * @param string|null $value Raw text; null and "" yield ""
 * @return string Text that is safe to place inside <![CDATA[ ... ]]>
 */
function escapeTextValue($value)
{
    if ($value === "" || $value === null) {
        return "";
    }

    // Remove illegal XML characters first (see the ordering note above)
    $value = stripInvalidXml($value);

    // Escape CDATA end marker (Sphinx requirement)
    return str_replace("]]>", "]]&gt;", $value);
}
46
/**
 * Remove characters that are invalid in XML 1.0
 *
 * Invalid UTF-8 is repaired before any pattern runs: the patterns use the
 * /u modifier, and preg_replace() returns null for a malformed subject,
 * which would otherwise discard the entire value.
 *
 * @param string $value Text to clean
 * @return string The cleaned text; "" when the input is ""
 */
function stripInvalidXml(string $value): string
{
    // Compare against "" explicitly; empty() would also throw away the string "0".
    if ($value === '') {
        return '';
    }

    // 1. Fix/Reject invalid UTF-8 (must happen before the /u patterns below)
    $value = mb_convert_encoding($value, 'UTF-8', 'UTF-8');
    if (!is_string($value)) {
        return '';
    }

    // 2. Remove control characters except tab, newline, carriage return
    $value = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/u', '', $value) ?? '';

    // 3. Replace characters outside the standard XML 1.0 range with a space
    $value = preg_replace(
        '/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]/u',
        ' ',
        $value
    );

    // preg_replace() yields null on failure; keep the declared string return type.
    return $value ?? '';
}
70
/**
 * Extract sections based on Heading metadata
 *
 * Walks $metadata['description']['tableofcontents'] and builds one entry per
 * heading, keyed by the heading's 'hid'. Each entry holds the section text
 * (via getSectionByTitleLevel()), the heading level, the title, a display
 * title ('title_text') and the title to index ('title_to_index').
 *
 * A heading whose level is greater than 1 gets the most recently remembered
 * title prepended to its display title, separated by " » ". Any other
 * heading keeps its plain title and becomes the remembered title.
 *
 * @param mixed $id       Page id, passed through to getSectionByTitleLevel()
 * @param mixed $metadata Page metadata array
 * @return array|false Entries keyed by hid, or false when there is no table of contents
 */
function getDocumentsByHeadings($id, $metadata)
{
    $toc = $metadata['description']['tableofcontents'] ?? null;
    if (empty($toc)) {
        return false;
    }

    $topLevel = 1;
    $parentTitle = '';
    $documents = [];

    foreach ($toc as $heading) {
        $title = $heading['title'];
        $isNested = $heading['level'] > $topLevel && !empty($parentTitle);

        if ($isNested) {
            $titleText = $parentTitle . " » " . $title;
        } else {
            $titleText = $title;
            $parentTitle = $title;
        }

        $documents[$heading['hid']] = [
            'section' => getSectionByTitleLevel($id, $title),
            'level' => $heading['level'],
            'title' => $title,
            'title_text' => $titleText,
            'title_to_index' => $title,
        ];
    }

    return $documents;
}
101
/**
 * Regex based section extraction
 *
 * Finds the first heading line whose text matches $header (case-insensitive,
 * surrounded by one to six '=' on each side) in the page's raw wiki text and
 * returns the text between it and the next heading written with four to six
 * '=' characters. If no such heading follows, everything up to the end of
 * the page is returned.
 *
 * All positions are taken from the regex matches themselves
 * (PREG_OFFSET_CAPTURE) and the end heading is searched only after the start
 * heading, so an identical heading earlier in the page cannot produce a
 * wrong or negative slice length.
 *
 * @param mixed  $id       Page id, resolved to a file with wikiFN()
 * @param string $header   Heading text to look for
 * @param bool   $extended When true and the section is empty, return the
 *                         text of the section that follows the end heading
 * @return string Trimmed section text, or '' if the file or heading is missing
 */
function getSectionByTitleLevel($id, $header, $extended = false)
{
    $file = wikiFN($id);
    if (!file_exists($file)) {
        return '';
    }

    $doc = io_readFile($file);
    $headerReg = preg_quote($header, '/');
    $startRegex = "/(={1,6})\s*({$headerReg})\s*(={1,6})/i";
    $endRegex = '/(={4,6})(.*?)(={4,6})/i';

    if (!preg_match($startRegex, $doc, $matches, PREG_OFFSET_CAPTURE)) {
        return '';
    }
    // Section content starts right after the matched heading.
    $startPos = $matches[0][1] + strlen($matches[0][0]);

    // Look for the closing heading only in the text after the start heading.
    if (!preg_match($endRegex, $doc, $matches, PREG_OFFSET_CAPTURE, $startPos)) {
        return trim(substr($doc, $startPos));
    }
    $endHeader = $matches[0][0];
    $endHeaderPos = $matches[0][1];

    $section = trim(substr($doc, $startPos, $endHeaderPos - $startPos));

    // Compare against '' so that a section consisting of "0" counts as content.
    if ($extended && $section === '') {
        $nextStartPos = $endHeaderPos + strlen($endHeader);
        if (preg_match($endRegex, $doc, $matches, PREG_OFFSET_CAPTURE, $nextStartPos)) {
            $section = substr($doc, $nextStartPos, $matches[0][1] - $nextStartPos);
        } else {
            $section = substr($doc, $nextStartPos);
        }
    }

    return trim($section);
}
150
/**
 * Modernized section extractor using DokuWiki internal parser helpers
 *
 * Parses the page into instructions with p_get_instructions() and returns
 * the slice of the (newline-normalised) page text between the 'header'
 * instruction whose trimmed title equals $header and the next
 * 'section_close' or 'header' instruction. The raw text and the instructions
 * are cached per page id in static variables for later calls.
 *
 * NOTE(review): the third element of an instruction is used as a byte offset
 * into the normalised text — confirm against the DokuWiki parser.
 *
 * @param mixed  $id     Page id, resolved to a file with wikiFN()
 * @param string $header Heading text to look for (compared after trim())
 * @return string Section text; '' if the file or the heading is not found
 */
function getSection($id, $header)
{
    static $cacheInstructions = [];
    static $cacheDoc = [];

    if (!isset($cacheDoc[$id])) {
        $file = wikiFN($id);
        if (!file_exists($file)) {
            return '';
        }

        $doc = io_readFile($file);
        // Use standard DokuWiki helper to get instructions
        $instructions = p_get_instructions($doc);

        $cacheInstructions[$id] = $instructions;
        $cacheDoc[$id] = $doc;
    } else {
        $instructions = $cacheInstructions[$id];
        $doc = $cacheDoc[$id];
    }

    $wanted = (string)$header;
    $startPos = null;
    $endPos = null;

    foreach ($instructions as $instruction) {
        if ($startPos === null) {
            // Strict comparison: with ==, numeric-looking headings such as
            // "1.0" and "1" would be treated as the same heading.
            if ($instruction[0] === 'header' && trim($instruction[1][0]) === $wanted) {
                $startPos = $instruction[2];
            }
            continue;
        }

        if ($instruction[0] === 'section_close' || $instruction[0] === 'header') {
            $endPos = $instruction[2];
            break;
        }
    }

    if ($startPos === null) {
        return '';
    }

    $doc = "\n" . str_replace("\r\n", "\n", $doc) . "\n";

    // No closing instruction after the heading: take the rest of the page
    // instead of passing a negative length to substr().
    if ($endPos === null) {
        return substr($doc, $startPos);
    }

    return substr($doc, $startPos, $endPos - $startPos);
}
195
/**
 * List every namespace that contains the given page id.
 *
 * The last ':'-separated segment (the page name) is dropped; the remaining
 * namespace paths are returned from the deepest to the shallowest, joined
 * by single spaces. Example: "a:b:page" gives "a:b a".
 *
 * @param string $id Page id
 * @return string Space-separated namespace paths, or '' when $id is empty
 *                or contains no ':'
 */
function getCategories($id)
{
    if (empty($id) || strpos($id, ':') === false) {
        return '';
    }

    $segments = explode(':', $id);
    $namespaces = [];

    // $depth is the number of leading segments kept; the page name is never included.
    for ($depth = count($segments) - 1; $depth > 0; $depth--) {
        $namespaces[] = implode(':', array_slice($segments, 0, $depth));
    }

    return implode(' ', $namespaces);
}
211
/**
 * Return the page name, i.e. the last ':'-separated segment of a page id.
 *
 * @param string $id Page id
 * @return string The last segment, or '' for an empty id
 */
function getPagename($id)
{
    // empty() alone would also reject the id "0", although "ns:0" yields "0".
    if (empty($id) && $id !== '0' && $id !== 0) {
        return '';
    }

    $parts = explode(':', $id);
    return end($parts);
}
218
/**
 * Collect the pages found under the configured data directory.
 *
 * Calls search() with the 'search_allpages' callback and the 'skipacl'
 * option set, starting at $conf['datadir'], then sorts the collected
 * entries with sort().
 *
 * @return array Sorted list of entries filled in by search()
 */
function getPagesList()
{
    global $conf;

    $pages = [];
    $options = ['skipacl' => 1];

    search($pages, $conf['datadir'], 'search_allpages', $options, '');
    sort($pages);

    return $pages;
}
227
/**
 * Generate namespace breadcrumbs for search results
 *
 * Produces one entry per ':'-separated segment of $id. Each entry has a
 * 'link' (a search URL restricted with "@ns" to the path built so far) and
 * a 'title' (the excerpt returned by $search->getExcerpt() for that segment,
 * falling back to the raw segment).
 *
 * @param string $id       Page id
 * @param string $keywords Search keywords
 * @param object $search   Object providing getExcerpt() and starQuery()
 * @return array List of ['link' => string, 'title' => string]
 */
function getNsLinks($id, $keywords, $search)
{
    $crumbs = [];
    $labels = [];
    $path = '';

    foreach (explode(':', $id) as $index => $segment) {
        $path = ($index === 0) ? $path . $segment : $path . ':' . $segment;

        // resolve_pageid() receives $target by reference and may rewrite it.
        $target = $path;
        resolve_pageid('', $target, $exists);

        // Drop a trailing ":start" so the link points at the namespace itself.
        if (str_ends_with($target, ':start')) {
            $target = substr($target, 0, -6);
        }

        $labels[] = $segment;
        $crumbs[] = ['link' => "?do=search&id=" . urlencode($keywords . " @ns $target")];
    }

    // Sphinx excerpt for the breadcrumbs
    $excerpts = $search->getExcerpt($labels, $search->starQuery($keywords));

    foreach ($labels as $index => $label) {
        $crumbs[$index]['title'] = $excerpts[$index] ?? $label;
    }

    return $crumbs;
}
261
/**
 * Echo an HTML list of links to the matching page names.
 *
 * Only the keys of $pageNames (the page ids) are used; output stops after
 * ten entries.
 *
 * @param array $pageNames Map of page id => header
 * @return false|void false when $pageNames is empty, otherwise nothing
 */
function printNamespacesNew($pageNames)
{
    if (empty($pageNames)) {
        return false;
    }

    $remaining = 10;

    echo '<h2>Matching pagenames</h2>';
    echo '<ul>';
    foreach ($pageNames as $id => $header) {
        echo '<li>' . '<a href="' . wl($id) . '" class="wikilink1">' . hsc($id) . '</a>' . '</li>';

        $remaining--;
        if ($remaining <= 0) {
            break;
        }
    }
    echo '</ul>';
}
278
if (!function_exists('shorten')) {
    /**
     * Append $short to $keep, shortening $short in the middle when needed.
     *
     * The space left for $short is $max minus the length of $keep. If that
     * is below $min, $keep is returned alone. If $short fits, it is appended
     * unchanged. Otherwise the beginning and the end of $short are kept and
     * joined with $char. Lengths are measured with utf8_strlen().
     *
     * @param string $keep  Part that is always kept in full
     * @param string $short Part that may be shortened
     * @param int    $max   Maximum length of the combined string
     * @param int    $min   Minimum space required to append anything
     * @param string $char  Marker inserted where text was removed
     * @return string
     */
    function shorten($keep, $short, $max, $min = 9, $char = '…') {
        $budget = $max - utf8_strlen($keep);
        if ($budget < $min) {
            return $keep;
        }

        $shortLen = utf8_strlen($short);
        if ($shortLen <= $budget) {
            return $keep . $short;
        }

        $half = (int)floor($budget / 2);
        $head = utf8_substr($short, 0, $half - 1);
        $tail = utf8_substr($short, $shortLen - $half);

        return $keep . $head . $char . $tail;
    }
}
292
/**
 * Turn wiki text into searchable plain text for SphinxSearch indexing.
 *
 * The text is rendered to XHTML, known noise blocks are removed, the
 * remaining tags are stripped and HTML special characters are decoded.
 *
 * @param string $wikitext Raw wiki markup
 * @return string Plain text
 */
function get_clean_text($wikitext) {
    $info = [];
    $instructions = p_get_instructions($wikitext);
    // The @ suppresses warnings from incompatible DokuWiki plugins during render
    $html = @p_render('xhtml', $instructions, $info);

    // Applied one after another, in this order.
    $noisePatterns = [
        // Code block line numbers
        '/<span class="ln">.*?<\/span>/',
        // Table of Contents
        '/<div class="tableofcontents">.*?<\/div>/s',
        // "Download" buttons and extra code block labels
        '/<div class="xtra">.*?<\/div>/s',
    ];
    $html = preg_replace($noisePatterns, '', $html);

    return htmlspecialchars_decode(strip_tags($html));
}
312