xref: /dokuwiki/inc/Search/Tokenizer.php (revision 1148921de6af6909f19cb5b30b698d0f27d7751e)
1<?php
2
3namespace dokuwiki\Search;
4
5use dokuwiki\Extension\Event;
6use dokuwiki\Utf8;
7
// Default minimum token length to use in the index. Tokenizer::getMinWordLength()
// falls back to this value when the IDX_MINWORDLENGTH constant is not defined.
// (note, this doesn't apply to numeric tokens)
const MINWORDLENGTH = 2;
11
/**
 * DokuWiki Tokenizer class
 *
 * Splits plain text into lower-cased word tokens for the fulltext index and
 * search, dropping stop words and non-numeric tokens that are too short.
 */
class Tokenizer
{
    /** @var array $Stopwords Words that tokenizer ignores (loaded on first use) */
    protected static $Stopwords;

    /** @var int $MinWordLength minimum token length (resolved on first use) */
    protected static $MinWordLength;

    /**
     * Returns words that will be ignored
     *
     * The list is read from inc/lang/<lang>/stopwords.txt of the configured
     * language on the first call and cached in a static property afterwards.
     * A missing or unreadable file results in an empty list.
     *
     * @return array  list of stop words
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public static function getStopwords()
    {
        if (!isset(static::$Stopwords)) {
            global $conf;
            $swFile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';

            $words = (is_file($swFile) && is_readable($swFile))
                ? file($swFile, FILE_IGNORE_NEW_LINES)
                : false;

            // file() returns false on failure; caching that value would break
            // every later lookup, so fall back to an empty list instead
            static::$Stopwords = is_array($words) ? $words : [];
        }
        return static::$Stopwords;
    }

    /**
     * Returns minimum word length to be used in the index
     *
     * Uses the IDX_MINWORDLENGTH constant when it is defined and the
     * MINWORDLENGTH default otherwise. The value is resolved once and cached.
     * (note, the minimum length doesn't apply to numeric tokens)
     *
     * @return int
     */
    public static function getMinWordLength()
    {
        if (!isset(static::$MinWordLength)) {
            static::$MinWordLength = (defined('IDX_MINWORDLENGTH'))
                ? IDX_MINWORDLENGTH
                : MINWORDLENGTH;
        }
        return static::$MinWordLength;
    }

    /**
     * Split the text into words for fulltext search
     *
     * The returned words are lower-cased. Stop words and non-numeric words
     * shorter than the minimum word length (in bytes) are left out.
     *
     * @triggers INDEXER_TEXT_PREPARE
     * This event allows plugins to modify the text before it gets tokenized.
     * Plugins intercepting this event should also intercept INDEX_VERSION_GET
     *
     * @param string $text plain text
     * @param bool $wc are wildcards allowed?
     * @return array  list of words in the text
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function getWords($text, $wc = false)
    {
        // when wildcards are not allowed, the asterisk is appended to the extra
        // character list handed to stripspecials() below
        // NOTE(review): presumably this makes stripspecials() remove asterisks - confirm in Utf8\Clean
        $wc = ($wc) ? '' : '\*';

        // prepare the text to be tokenized
        $event = new Event('INDEXER_TEXT_PREPARE', $text);
        if ($event->advise_before(true)) {
            // only text with something else than ASCII letters, digits and spaces is processed
            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
                $text = Utf8\Asian::separateAsianWords($text);
            }
        }
        $event->advise_after();
        unset($event);

        // line breaks and tabs become word separators, soft hyphens vanish
        $text = strtr($text, [
            "\r" => ' ',
            "\n" => ' ',
            "\t" => ' ',
            "\xC2\xAD" => '', //soft-hyphen
        ]);
        if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
            $text = Utf8\Clean::stripspecials($text, ' ', '\._\-:' . $wc);
        }

        // resolve the loop invariants once; the flipped list gives constant time
        // lookups and matches exactly the words a strict search would match
        $minLength = static::getMinWordLength();
        $stopwords = array_flip(static::getStopwords());

        $words = [];
        foreach (explode(' ', $text) as $word) {
            // plain ASCII words can use the cheaper native strtolower()
            $word = (preg_match('/[^0-9A-Za-z]/u', $word))
                ? Utf8\PhpString::strtolower($word)
                : strtolower($word);

            // too short (this also drops the empty strings left by repeated spaces)
            if (!is_numeric($word) && strlen($word) < $minLength) continue;
            if (isset($stopwords[$word])) continue;

            $words[] = $word;
        }
        return $words;
    }

    /**
     * Check if a search term meets the minimum length requirement
     *
     * Strips wildcard characters from both ends, then checks the remaining base
     * against the minimum word length. Numeric terms are always accepted, terms
     * consisting of wildcards only are always rejected.
     *
     * @param string $term the search term, may include * wildcards
     * @return bool true if the term is valid for searching
     */
    public static function isValidSearchTerm(string $term): bool
    {
        $base = trim($term, '*');
        if ($base === '') return false;
        if (is_numeric($base)) return true;
        return static::tokenLength($base) >= static::getMinWordLength();
    }

    /**
     * Measure the length of a string
     *
     * Differs from strlen in handling of asian characters, otherwise byte lengths are used.
     * Every byte in the range 0xE2-0xEF (lead bytes of three-byte UTF-8 sequences)
     * adds its distance from 0xE1 on top of the byte length.
     *
     * @param string $token
     * @return int
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public static function tokenLength($token)
    {
        $length = strlen($token);
        // If left alone, all chinese "words" will have the same length of 3, so the "length" of a "word" is faked
        if (preg_match_all('/[\xE2-\xEF]/', $token, $leadbytes)) {
            foreach ($leadbytes[0] as $byte) {
                $length += ord($byte) - 0xE1;
            }
        }
        return $length;
    }
}
154