xref: /dokuwiki/inc/Search/Tokenizer.php (revision 9369b4a991666bc911474806b106d8958e79f4c1)
1094ebf29SSatoshi Sahara<?php
2094ebf29SSatoshi Sahara
3094ebf29SSatoshi Saharanamespace dokuwiki\Search;
4094ebf29SSatoshi Sahara
5*9369b4a9SAndreas Gohruse dokuwiki\Utf8\Asian;
6*9369b4a9SAndreas Gohruse dokuwiki\Utf8\Clean;
7*9369b4a9SAndreas Gohruse dokuwiki\Utf8\PhpString;
8094ebf29SSatoshi Saharause dokuwiki\Extension\Event;
9094ebf29SSatoshi Saharause dokuwiki\Utf8;
10094ebf29SSatoshi Sahara
/**
 * Default minimum token length to use in the index
 *
 * Numeric tokens are exempt from this limit.
 */
const MINWORDLENGTH = 2;
14094ebf29SSatoshi Sahara
/**
 * DokuWiki Tokenizer class
 *
 * Splits plain text into lowercased word tokens for the fulltext index and
 * holds the length rules applied to tokens and search terms.
 */
class Tokenizer
{
    /** @var string[] $Stopwords Words that tokenizer ignores (filled on first use) */
    protected static array $Stopwords;

    /** @var int $MinWordLength minimum token length (resolved on first use) */
    protected static int $MinWordLength;

    /**
     * Returns words that will be ignored
     *
     * The list is read from the stopwords.txt of the configured language on
     * the first call and kept in a static property afterwards. A missing or
     * unreadable file results in an empty list.
     *
     * @return array  list of stop words
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public static function getStopwords(): array
    {
        if (!isset(static::$Stopwords)) {
            global $conf;
            $swFile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';

            // file() returns false when reading fails; the typed property only
            // accepts arrays, so fall back to an empty list in that case
            $lines = file_exists($swFile) ? file($swFile, FILE_IGNORE_NEW_LINES) : false;
            static::$Stopwords = ($lines === false) ? [] : $lines;
        }
        return static::$Stopwords;
    }

    /**
     * Returns minimum word length to be used in the index
     *
     * Uses the IDX_MINWORDLENGTH constant when it is defined and falls back to
     * MINWORDLENGTH otherwise. The limit does not apply to numeric tokens.
     *
     * @return int
     */
    public static function getMinWordLength(): int
    {
        if (!isset(static::$MinWordLength)) {
            static::$MinWordLength = defined('IDX_MINWORDLENGTH')
                ? IDX_MINWORDLENGTH
                : MINWORDLENGTH;
        }
        return static::$MinWordLength;
    }

    /**
     * Split the text into words for fulltext search
     *
     * The returned words are lowercased. Non-numeric words shorter (in bytes)
     * than the minimum word length and words found in the stopword list are
     * dropped.
     *
     * @triggers INDEXER_TEXT_PREPARE
     * This event allows plugins to modify the text before it gets tokenized.
     * Plugins intercepting this event should also intercept INDEX_VERSION_GET
     *
     * @param string $text plain text
     * @param bool $wc are wildcards allowed?
     * @return array  list of words in the text
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function getWords(string $text, bool $wc = false): array
    {
        // when wildcards are not allowed, the asterisk is added to the
        // character list handed to Clean::stripspecials() below
        $extraChars = $wc ? '' : '\*';

        // prepare the text to be tokenized
        // NOTE(review): the event presumably receives $text by reference so
        // handlers can modify it - confirm against the Event class
        $event = new Event('INDEXER_TEXT_PREPARE', $text);
        if ($event->advise_before()) {
            // pure ASCII alphanumeric text needs no asian word separation
            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
                $text = Asian::separateAsianWords($text);
            }
        }
        $event->advise_after();
        unset($event);

        $text = strtr($text, [
            "\r" => ' ',
            "\n" => ' ',
            "\t" => ' ',
            "\xC2\xAD" => '', // soft-hyphen
        ]);
        if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
            $text = Clean::stripspecials($text, ' ', '\._\-:' . $extraChars);
        }

        // both values are the same for every word, resolve them once
        $minLength = static::getMinWordLength();
        $stopwords = static::getStopwords();

        $words = [];
        foreach (explode(' ', $text) as $word) {
            // the UTF-8 aware lowercasing is only needed for non-ASCII words
            $word = preg_match('/[^0-9A-Za-z]/u', $word)
                ? PhpString::strtolower($word)
                : strtolower($word);

            // the length limit is checked against the byte length and never
            // applies to numeric tokens; this also removes the empty strings
            // produced by consecutive spaces
            if (!is_numeric($word) && strlen($word) < $minLength) {
                continue;
            }
            if (in_array($word, $stopwords, true)) {
                continue;
            }
            $words[] = $word;
        }
        return $words;
    }

    /**
     * Check if a search term meets the minimum length requirement
     *
     * Strips wildcard characters, then checks the base against the minimum
     * word length. Numeric terms are always accepted.
     *
     * The length is measured with tokenLength(), whereas getWords() filters
     * by plain byte length.
     *
     * @param string $term the search term, may include * wildcards
     * @return bool true if the term is valid for searching
     */
    public static function isValidSearchTerm(string $term): bool
    {
        $base = trim($term, '*');
        if ($base === '') {
            return false;
        }
        if (is_numeric($base)) {
            return true;
        }
        return static::tokenLength($base) >= static::getMinWordLength();
    }

    /**
     * Measure the length of a string
     *
     * Differs from strlen in handling of asian characters, otherwise byte lengths are used.
     * Every byte in the range 0xE2-0xEF adds its value minus 0xE1 on top of the byte length.
     *
     * @param string $token
     * @return int
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     */
    public static function tokenLength(string $token): int
    {
        $length = strlen($token);
        // If left alone, all chinese "words" will have the same length of 3, so the "length" of a "word" is faked
        if (preg_match_all('/[\xE2-\xEF]/', $token, $leadbytes)) {
            foreach ($leadbytes[0] as $byte) {
                $length += ord($byte) - 0xE1;
            }
        }
        return $length;
    }
}
158