xref: /dokuwiki/inc/Search/Tokenizer.php (revision 06053dca2fac9a1da4eb1accf8c2488942da5d2a)
1<?php
2
3namespace dokuwiki\Search;
4
5use dokuwiki\Utf8\Asian;
6use dokuwiki\Utf8\Clean;
7use dokuwiki\Utf8\PhpString;
8use dokuwiki\Extension\Event;
9use dokuwiki\Utf8;
10
// Default minimum token length to use in the index; Tokenizer::getMinWordLength()
// falls back to it when the global IDX_MINWORDLENGTH constant is not defined
// (note, this doesn't apply to numeric tokens)
const MINWORDLENGTH = 2;
14
/**
 * DokuWiki Tokenizer class
 *
 * Splits plain text into the lower-cased word tokens used for fulltext
 * search, dropping stop words and non-numeric tokens shorter than the
 * minimum word length.
 */
class Tokenizer
{
    /** @var string[] $Stopwords Words that tokenizer ignores (loaded on first use) */
    protected static array $Stopwords;

    /** @var int $MinWordLength minimum token length (resolved on first use) */
    protected static int $MinWordLength;

    /**
     * Returns words that will be ignored
     *
     * The list is read once from inc/lang/<lang>/stopwords.txt for the
     * configured language and then kept in a static property. A missing or
     * unreadable file results in an empty list.
     *
     * @return string[]  list of stop words
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public static function getStopwords(): array
    {
        if (!isset(static::$Stopwords)) {
            global $conf;
            $swFile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';

            // file() returns false when the file exists but cannot be opened;
            // assigning false to the typed array property would raise a TypeError
            $words = file_exists($swFile) ? file($swFile, FILE_IGNORE_NEW_LINES) : false;
            static::$Stopwords = ($words === false) ? [] : $words;
        }
        return static::$Stopwords;
    }

    /**
     * Returns minimum word length to be used in the index
     *
     * Uses the IDX_MINWORDLENGTH constant when it is defined and falls back
     * to MINWORDLENGTH otherwise. Numeric tokens are not subject to this limit.
     *
     * @return int
     */
    public static function getMinWordLength(): int
    {
        if (!isset(static::$MinWordLength)) {
            static::$MinWordLength = defined('IDX_MINWORDLENGTH')
                ? IDX_MINWORDLENGTH
                : MINWORDLENGTH;
        }
        return static::$MinWordLength;
    }

    /**
     * Split the text into words for fulltext search
     *
     * The returned words are lower-cased. Stop words and non-numeric words
     * shorter than the minimum word length are removed.
     *
     * @triggers INDEXER_TEXT_PREPARE
     * This event allows plugins to modify the text before it gets tokenized.
     * Plugins intercepting this event should also intercept INDEX_VERSION_GET
     *
     * @param string $text plain text
     * @param bool $wc are wildcards allowed?
     * @return string[]  list of words in the text
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function getWords(string $text, bool $wc = false): array
    {
        // '*' is only added to the characters handed to stripspecials when wildcards are not allowed
        $wildcard = $wc ? '' : '\*';

        // prepare the text to be tokenized
        $event = new Event('INDEXER_TEXT_PREPARE', $text);
        if ($event->advise_before()) {
            // default action: only texts containing something other than
            // ASCII letters, digits and spaces are passed to the Asian word separator
            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
                $text = Asian::separateAsianWords($text);
            }
        }
        $event->advise_after();
        unset($event);

        // normalize whitespace to plain spaces and drop soft hyphens
        $text = strtr($text, [
            "\r" => ' ',
            "\n" => ' ',
            "\t" => ' ',
            "\xC2\xAD" => '', //soft-hyphen
        ]);
        if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
            $text = Clean::stripspecials($text, ' ', '\._\-:' . $wildcard);
        }

        $minLength = static::getMinWordLength();
        // stop word lookup table, built on first use so that texts without any
        // sufficiently long word never trigger loading the stop word file
        $stopwords = null;

        $words = [];
        foreach (explode(' ', $text) as $word) {
            // native strtolower() for pure ASCII alphanumeric words,
            // the UTF-8 aware variant for everything else
            $word = preg_match('/[^0-9A-Za-z]/u', $word)
                ? PhpString::strtolower($word)
                : strtolower($word);

            // numeric tokens are exempt from the minimum length
            if (!is_numeric($word) && strlen($word) < $minLength) {
                continue;
            }

            // keyed lookup instead of scanning the whole stop word list for every word;
            // key matching is equivalent to the strict string comparison used before
            $stopwords ??= array_flip(static::getStopwords());
            if (isset($stopwords[$word])) {
                continue;
            }

            $words[] = $word;
        }
        return $words;
    }

    /**
     * Check if a search term meets the minimum length requirement
     *
     * Strips wildcard characters, then checks the base against the minimum
     * word length. Numeric terms are always accepted.
     *
     * @param string $term the search term, may include * wildcards
     * @return bool true if the term is valid for searching
     */
    public static function isValidSearchTerm(string $term): bool
    {
        $base = trim($term, '*');
        if ($base === '') {
            // nothing left but wildcards
            return false;
        }
        if (is_numeric($base)) {
            return true;
        }
        return static::tokenLength($base) >= static::getMinWordLength();
    }

    /**
     * Measure the length of a string
     *
     * Differs from strlen in handling of asian characters, otherwise byte lengths are used.
     * Every byte in the range 0xE2-0xEF adds (byte value - 0xE1) on top of the byte length.
     *
     * @param string $token
     * @return int
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     */
    public static function tokenLength(string $token): int
    {
        $length = strlen($token);
        // If left alone, all chinese "words" will have the same length of 3, so the "length" of a "word" is faked
        // (the pattern has no /u modifier on purpose: it matches single bytes)
        if (preg_match_all('/[\xE2-\xEF]/', $token, $leadbytes)) {
            foreach ($leadbytes[0] as $byte) {
                $length += ord($byte) - 0xE1;
            }
        }
        return $length;
    }
}
158