plugin/sphinxsearchwas/functions.php

<?php
/**
 * Modernized helper functions for SphinxSearch plugin
 */

/**
 * Render one document as a single-line <sphinx:document> element.
 *
 * The title, body, namespace and pagename values are passed through
 * escapeTextValue() and wrapped in CDATA sections; level and modified are
 * cast to integers. The id is cast to string and written into the id
 * attribute unchanged.
 *
 * @param array $data Must provide the keys id, title_to_index, body,
 *                    namespace, pagename, level and modified.
 * @return string The XML element, starting directly with the opening tag.
 */
function formatXml(array $data): string
{
    // Single line on purpose: the output must not begin with whitespace.
    $template = '<sphinx:document id="%s"><title><![CDATA[%s]]></title><body><![CDATA[%s]]></body><namespace><![CDATA[%s]]></namespace><pagename><![CDATA[%s]]></pagename><level>%d</level><modified>%d</modified></sphinx:document>';

    // Values in placeholder order.
    $values = [
        (string)$data['id'],
        escapeTextValue($data['title_to_index']),
        escapeTextValue($data['body']),
        escapeTextValue($data['namespace']),
        escapeTextValue($data['pagename']),
        (int)$data['level'],
        (int)$data['modified'],
    ];

    return vsprintf($template, $values);
}


/**
 * Prepare a text value for inclusion in a CDATA section.
 *
 * Invalid XML characters are stripped first and the CDATA end marker is
 * neutralised afterwards. The order matters: stripInvalidXml() deletes
 * control characters, so an input such as "]]\x01>" only becomes "]]>"
 * after stripping and would slip through if the marker were escaped first.
 *
 * @param string|null $value Raw text; null and "" yield "".
 * @return string Cleaned text that contains no "]]>" sequence.
 */
function escapeTextValue($value)
{
    if ($value === "" || $value === null) {
        return "";
    }

    // Remove illegal XML characters before looking for the end marker.
    $value = stripInvalidXml($value);

    // Escape CDATA end marker (Sphinx requirement)
    return str_replace("]]>", "]]&gt;", $value);
}

/**
 * Remove characters that are invalid in XML 1.0.
 *
 * Processing order:
 *  1. Malformed UTF-8 is repaired first, because the Unicode-aware pattern
 *     in step 3 returns null for a subject that is not valid UTF-8.
 *  2. ASCII control characters other than tab, newline and carriage return
 *     (plus DEL) are deleted.
 *  3. Every remaining code point outside the ranges listed in the pattern
 *     is replaced by a single space.
 *
 * @param string $value Text to clean; "0" is treated as ordinary text.
 * @return string Cleaned text; "" for empty input or if the final
 *                replacement fails.
 */
function stripInvalidXml(string $value): string
{
    if ($value === '') {
        return '';
    }

    // 1. Fix/reject invalid UTF-8 before any /u pattern sees the string.
    $value = mb_convert_encoding($value, 'UTF-8', 'UTF-8');

    // 2. Remove control characters except tab, newline, carriage return.
    //    All targets are single ASCII bytes, so no /u modifier is needed.
    $value = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]/', '', $value) ?? $value;

    // 3. Replace characters outside the standard XML 1.0 range.
    $cleaned = preg_replace(
        '/[^\x{0009}\x{000a}\x{000d}\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]/u',
        ' ',
        $value
    );

    // preg_replace() yields null on failure; honour the string return type.
    return $cleaned ?? '';
}

/**
 * Build one searchable entry per table-of-contents row of a page.
 *
 * Entries are keyed by the row's hid and contain:
 *  - section:        text returned by getSectionByTitleLevel() for the title
 *  - level, title:   copied from the row
 *  - title_text:     "<previous title> » <title>" for rows deeper than
 *                    level 1 once a previous title is known, else the title
 *  - title_to_index: the row title
 *
 * @param mixed $id       Page id handed on to getSectionByTitleLevel().
 * @param mixed $metadata Page metadata; the rows are read from
 *                        $metadata['description']['tableofcontents'].
 * @return array|false The entries, or false when there is no table of
 *                     contents.
 */
function getDocumentsByHeadings($id, $metadata)
{
    $toc = $metadata['description']['tableofcontents'] ?? null;
    if (empty($toc)) {
        return false;
    }

    $topLevel = 1;
    $parentTitle = '';
    $documents = [];

    foreach ($toc as $entry) {
        $heading = $entry['title'];

        if ($entry['level'] > $topLevel && !empty($parentTitle)) {
            $displayTitle = $parentTitle . " » " . $heading;
        } else {
            // This heading becomes the prefix for the nested rows after it.
            $displayTitle = $heading;
            $parentTitle = $heading;
        }

        $documents[$entry['hid']] = [
            'section' => getSectionByTitleLevel($id, $heading),
            'level' => $entry['level'],
            'title' => $heading,
            'title_text' => $displayTitle,
            'title_to_index' => $heading,
        ];
    }

    return $documents;
}

/**
 * Regex based section extraction.
 *
 * Finds the first heading whose text matches $header (case-insensitive,
 * delimited by runs of one to six "=") in the page file and returns the
 * text between it and the next heading delimited by runs of four to six
 * "=", or the rest of the file when no such heading follows.
 *
 * Both headings are located by their match offsets, and the search for the
 * closing heading starts right after the opening one. A heading text that
 * also occurs earlier in the page therefore cannot be mistaken for the end
 * of the section.
 *
 * @param mixed  $id       Page id, resolved to a file path through wikiFN().
 * @param string $header   Heading text to look for.
 * @param bool   $extended When true and the section turns out empty, return
 *                         the text following the closing heading instead
 *                         (up to the next four-to-six "=" heading).
 * @return string Trimmed section text; '' when the file or the heading is
 *                missing.
 */
function getSectionByTitleLevel($id, $header, $extended = false)
{
    $file = wikiFN($id);
    if (!file_exists($file)) {
        return '';
    }

    $doc = io_readFile($file);

    $startRegex = '/(={1,6})\s*(' . preg_quote($header, '/') . ')\s*(={1,6})/i';
    $endRegex = '/(={4,6})(.*?)(={4,6})/i';

    if (!preg_match($startRegex, $doc, $matches, PREG_OFFSET_CAPTURE)) {
        return '';
    }
    $startPos = $matches[0][1] + strlen($matches[0][0]);

    // Neither pattern uses anchors or look-behind, so matching with an
    // offset is equivalent to matching against substr($doc, $startPos).
    if (!preg_match($endRegex, $doc, $matches, PREG_OFFSET_CAPTURE, $startPos)) {
        return trim(substr($doc, $startPos));
    }
    $endPos = $matches[0][1];
    $afterEndPos = $endPos + strlen($matches[0][0]);

    $section = trim(substr($doc, $startPos, $endPos - $startPos));

    // Compare against '' rather than using empty(), which would also
    // discard a section whose only content is "0".
    if ($extended && $section === '') {
        if (preg_match($endRegex, $doc, $matches, PREG_OFFSET_CAPTURE, $afterEndPos)) {
            $section = substr($doc, $afterEndPos, $matches[0][1] - $afterEndPos);
        } else {
            $section = substr($doc, $afterEndPos);
        }
        $section = trim($section);
    }

    return $section;
}

/**
 * Section extractor driven by parser instructions.
 *
 * Reads the page file, obtains its instruction list from
 * p_get_instructions() and returns the slice of source text that starts at
 * the first 'header' instruction whose trimmed title equals $header and
 * ends at the next 'section_close' or 'header' instruction. When no such
 * instruction follows, the slice runs to the end of the text.
 *
 * File contents and instructions are cached per page id for the lifetime
 * of the request, including for pages whose content is empty.
 *
 * @param mixed $id     Page id, resolved to a file path through wikiFN().
 * @param mixed $header Heading title; compared as a string, so "1.0" and
 *                      "1" are different headings.
 * @return string The section text, or '' when the file or the heading is
 *                missing.
 */
function getSection($id, $header)
{
    static $cacheInstructions = [];
    static $cacheDoc = [];

    if (!isset($cacheDoc[$id])) {
        $file = wikiFN($id);
        if (!file_exists($file)) {
            return '';
        }

        $doc = io_readFile($file);
        // Use standard DokuWiki helper to get instructions
        $cacheInstructions[$id] = p_get_instructions($doc);
        $cacheDoc[$id] = $doc;
    }

    $instructions = $cacheInstructions[$id];
    $doc = $cacheDoc[$id];

    $wanted = (string) $header;
    $startPos = null;
    $endPos = null;

    foreach ($instructions as $instruction) {
        if ($startPos === null) {
            // Strict comparison: loose == treats numeric-looking titles
            // such as "1.0" and "1" as equal.
            if ($instruction[0] === 'header' && trim($instruction[1][0]) === $wanted) {
                $startPos = $instruction[2];
            }
            continue;
        }

        if ($instruction[0] === 'section_close' || $instruction[0] === 'header') {
            $endPos = $instruction[2];
            break;
        }
    }

    if ($startPos === null) {
        return '';
    }

    // NOTE(review): the instruction positions appear to refer to the text
    // with a newline added at both ends and CRLF normalised - confirm
    // against the parser.
    $doc = "\n" . str_replace("\r\n", "\n", $doc) . "\n";

    if ($endPos === null) {
        // No closing instruction: take everything after the header instead
        // of handing substr() a negative length.
        return substr($doc, $startPos);
    }

    return substr($doc, $startPos, $endPos - $startPos);
}

/**
 * List every ancestor namespace of a page id, deepest first.
 *
 * For "a:b:page" the result is "a:b a". Ids without a ":" have no
 * namespace and yield ''.
 *
 * @param string $id Page id.
 * @return string Space-separated namespaces, or ''.
 */
function getCategories($id)
{
    if (empty($id) || !str_contains($id, ':')) {
        return '';
    }

    $segments = explode(':', $id);
    // The last segment is the pagename, so it never belongs to a namespace.
    $depth = count($segments) - 1;

    $namespaces = [];
    for ($length = $depth; $length >= 1; $length--) {
        $namespaces[] = implode(':', array_slice($segments, 0, $length));
    }

    return implode(' ', $namespaces);
}

/**
 * Return the last ":"-separated segment of a page id.
 *
 * The emptiness check compares against '' instead of using empty(), so the
 * id "0" is returned as "0" rather than being treated as missing.
 *
 * @param string|null $id Page id.
 * @return string The pagename; '' for null, '' or array input.
 */
function getPagename($id)
{
    if ($id === null || is_array($id)) {
        return '';
    }

    $id = (string) $id;
    if ($id === '') {
        return '';
    }

    $parts = explode(':', $id);
    return end($parts);
}

/**
 * Collect the entries found below the configured data directory.
 *
 * Delegates to search() with the 'search_allpages' callback and the
 * skipacl option set, then sorts the collected entries.
 *
 * @return array Sorted entries as filled in by search().
 */
function getPagesList()
{
    global $conf;

    $pages = [];
    $options = ['skipacl' => 1];
    search($pages, $conf['datadir'], 'search_allpages', $options, '');

    sort($pages);

    return $pages;
}

/**
 * Generate namespace breadcrumbs for search results.
 *
 * Produces one entry per ":"-separated segment of $id. Each entry holds a
 * 'link' that repeats the search restricted to the path up to that segment
 * (a trailing ":start" is dropped from the resolved path) and a 'title'
 * taken from the excerpts returned by $search, falling back to the plain
 * segment when no excerpt exists for it.
 *
 * @param string $id       Page id.
 * @param string $keywords Search keywords.
 * @param object $search   Must provide getExcerpt() and starQuery().
 * @return array List of ['link' => string, 'title' => string].
 */
function getNsLinks($id, $keywords, $search)
{
    $titles = explode(':', $id);
    $links = [];
    $path = '';

    foreach ($titles as $index => $segment) {
        $path = $index === 0 ? $segment : $path . ':' . $segment;

        // resolve_pageid() works on its argument by reference, so hand it a
        // copy and keep $path intact for the next segment.
        $target = $path;
        resolve_pageid('', $target, $exists);

        if (str_ends_with($target, ':start')) {
            $target = substr($target, 0, -6);
        }

        $links[] = ['link' => "?do=search&id=" . urlencode($keywords . " @ns $target")];
    }

    // Sphinx excerpt for the breadcrumbs
    $excerpts = $search->getExcerpt($titles, $search->starQuery($keywords));

    foreach ($titles as $index => $segment) {
        $links[$index]['title'] = $excerpts[$index] ?? $segment;
    }

    return $links;
}

/**
 * Echo an HTML list linking to at most ten of the given pages.
 *
 * Only the keys (page ids) are used; the values are ignored.
 *
 * @param array $pageNames Map of page id => value.
 * @return false|null false when $pageNames is empty (nothing is printed),
 *                    otherwise nothing is returned.
 */
function printNamespacesNew($pageNames)
{
    if (empty($pageNames)) {
        return false;
    }

    // Number of list items that may still be printed.
    $remaining = 10;

    echo '<h2>Matching pagenames</h2>', '<ul>';
    foreach ($pageNames as $pageId => $unused) {
        echo '<li>';
        echo '<a href="' . wl($pageId) . '" class="wikilink1">' . hsc($pageId) . '</a>';
        echo '</li>';

        $remaining--;
        if ($remaining <= 0) {
            break;
        }
    }
    echo '</ul>';
}

if (!function_exists('shorten')) {
    /**
     * Append $short to $keep, abbreviating $short in the middle when the
     * combined length would exceed $max characters.
     *
     * $keep is never shortened. When fewer than $min characters remain for
     * $short, $keep is returned alone.
     *
     * @param string $keep  Prefix that is always kept in full.
     * @param string $short Text that may be abbreviated.
     * @param int    $max   Maximum length of the combined string.
     * @param int    $min   Minimum space that must remain for $short.
     * @param string $char  Marker inserted where text was cut out.
     * @return string
     */
    function shorten($keep, $short, $max, $min = 9, $char = '…') {
        // Space left for $short once $keep has been accounted for.
        $budget = $max - utf8_strlen($keep);
        if ($budget < $min) {
            return $keep;
        }

        $shortLen = utf8_strlen($short);
        if ($shortLen <= $budget) {
            return $keep . $short;
        }

        $half = (int)floor($budget / 2);
        // The head gives up one character to make room for the marker.
        $head = utf8_substr($short, 0, $half - 1);
        $tail = utf8_substr($short, $shortLen - $half);

        return $keep . $head . $char . $tail;
    }
}

/**
 * Turn wiki text into searchable plain text for SphinxSearch indexing.
 *
 * The text is rendered to XHTML, three kinds of markup noise are removed
 * (code block line numbers, the table of contents, and "xtra" blocks such
 * as download buttons and code block labels), the remaining tags are
 * stripped and HTML special characters are decoded.
 *
 * @param string $wikitext Raw wiki markup.
 * @return string Plain text.
 */
function get_clean_text($wikitext)
{
    $info = [];
    // The @ suppresses warnings from incompatible DokuWiki plugins during render
    $html = @p_render('xhtml', p_get_instructions($wikitext), $info);

    // Applied in order: line numbers, table of contents, "xtra" blocks.
    $noisePatterns = [
        '/<span class="ln">.*?<\/span>/',
        '/<div class="tableofcontents">.*?<\/div>/s',
        '/<div class="xtra">.*?<\/div>/s',
    ];
    $html = preg_replace($noisePatterns, '', $html);

    return htmlspecialchars_decode(strip_tags($html));
}