xref: /dokuwiki/inc/Search/Tokenizer.php (revision ede4646658cf51245060332d97a319a39c788ea1)
1<?php
2
3namespace dokuwiki\Search;
4
5use dokuwiki\Extension\Event;
6use dokuwiki\Utf8;
7
8// Default minimum token length for the index, compared against strlen() of a token;
9// used only when IDX_MINWORDLENGTH is not defined (note, this doesn't apply to numeric tokens)
10const MINWORDLENGTH = 2;
11
/**
 * DokuWiki Tokenizer class
 *
 * Splits plain text into lowercase word tokens for the fulltext index.
 * The stop word list and the minimum token length are resolved lazily and
 * cached in static properties.
 */
class Tokenizer
{
    /** @var array $Stopwords Words that tokenizer ignores */
    protected static $Stopwords;

    /** @var int $MinWordLength minimum token length */
    protected static $MinWordLength;

    /**
     * Returns words that will be ignored
     *
     * The list is read once from inc/lang/<lang>/stopwords.txt (one word per
     * line, language taken from $conf['lang']) and cached afterwards. When the
     * file is missing, is not a regular file, or cannot be read, an empty
     * list is used.
     *
     * @return array  list of stop words
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public static function getStopwords()
    {
        if (!isset(static::$Stopwords)) {
            global $conf;
            $swFile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';

            $lines = false;
            if (is_file($swFile) && is_readable($swFile)) {
                $lines = file($swFile, FILE_IGNORE_NEW_LINES);
            }

            // file() returns false on failure; never cache a non-array,
            // callers search this list and expect an array
            static::$Stopwords = is_array($lines) ? $lines : array();
        }
        return static::$Stopwords;
    }

    /**
     * Returns minimum word length to be used in the index
     *
     * Uses the IDX_MINWORDLENGTH constant when it is defined, otherwise the
     * MINWORDLENGTH default. The value is cached after the first call.
     * (note, the minimum length doesn't apply to numeric tokens)
     *
     * @return int
     */
    public static function getMinWordLength()
    {
        if (!isset(static::$MinWordLength)) {
            if (defined('IDX_MINWORDLENGTH')) {
                static::$MinWordLength = IDX_MINWORDLENGTH;
            } else {
                static::$MinWordLength = MINWORDLENGTH;
            }
        }
        return static::$MinWordLength;
    }

    /**
     * Split the text into words for fulltext search
     *
     * Line breaks and tabs become spaces, soft hyphens are dropped, special
     * characters are stripped, and each remaining word is lowercased. Words
     * that are stop words, or that are non-numeric and shorter (in bytes)
     * than the minimum word length, are removed.
     *
     * @triggers INDEXER_TEXT_PREPARE
     * This event allows plugins to modify the text before it gets tokenized.
     * Plugins intercepting this event should also intercept INDEX_VERSION_GET
     *
     * @param string $text plain text
     * @param bool $wc are wildcards allowed?
     * @return array  list of words in the text
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function getWords($text, $wc = false)
    {
        // when wildcards are not allowed, '*' is appended to the character
        // list handed to stripspecials() below
        $wc = ($wc) ? '' : '\*';

        // prepare the text to be tokenized; the default preparation only
        // runs when advise_before() returns true
        $event = new Event('INDEXER_TEXT_PREPARE', $text);
        if ($event->advise_before(true)) {
            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
                $text = Utf8\Asian::separateAsianWords($text);
            }
        }
        $event->advise_after();
        unset($event);

        $text = strtr($text, array(
                "\r" => ' ',
                "\n" => ' ',
                "\t" => ' ',
                "\xC2\xAD" => '', //soft-hyphen
        ));
        // pure ASCII alphanumeric text has nothing to strip
        if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
            $text = Utf8\Clean::stripspecials($text, ' ', '\._\-:'.$wc);
        }

        // resolve the loop invariants once; flipping the stop word list
        // turns the per-word linear search into a constant time key lookup
        $minLength = static::getMinWordLength();
        $stopwords = array_flip(static::getStopwords());

        $words = array();
        foreach (explode(' ', $text) as $word) {
            // only pay for the UTF-8 aware lowercasing when it is needed
            $word = (preg_match('/[^0-9A-Za-z]/u', $word))
                ? Utf8\PhpString::strtolower($word)
                : strtolower($word);

            // too short (numeric tokens are exempt from the length limit)
            if (!is_numeric($word) && strlen($word) < $minLength) {
                continue;
            }
            if (isset($stopwords[$word])) {
                continue;
            }
            $words[] = $word;
        }
        return $words;
    }

    /**
     * Measure the length of a string
     *
     * Differs from strlen in handling of asian characters, otherwise byte lengths are used.
     * For every byte in the range 0xE2-0xEF (lead bytes of three-byte UTF-8
     * sequences) the byte value minus 0xE1 is added to the byte length.
     *
     * @param string $token
     * @return int
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     */
    public static function tokenLength($token)
    {
        $length = strlen($token);
        // If left alone, all chinese "words" will have the same length of 3, so the "length" of a "word" is faked
        if (preg_match_all('/[\xE2-\xEF]/', $token, $leadbytes)) {
            foreach ($leadbytes[0] as $byte) {
                $length += ord($byte) - 0xE1;
            }
        }
        return $length;
    }

}
137