xref: /dokuwiki/inc/Search/Tokenizer.php (revision 094ebf29c8846c56a4fe657a639ccf9ded62d429)
1<?php
2
3namespace dokuwiki\Search;
4
5use dokuwiki\Extension\Event;
6use dokuwiki\Utf8;
7
// Default minimum token length (in bytes, as measured by strlen) to use in the index.
// The Tokenizer falls back to this value when the global IDX_MINWORDLENGTH constant
// is not defined. (note, this doesn't apply to numeric tokens)
const MINWORDLENGTH = 2;
11
12/**
 * DokuWiki Tokenizer class (Singleton)
14 */
class Tokenizer
{
    /** @var Tokenizer|null the shared instance, created lazily by getInstance() */
    protected static $instance = null;

    /** @var array $Stopwords Words that tokenizer ignores */
    protected $Stopwords;

    /** @var int $MinWordLength  minimum token length */
    protected $MinWordLength;

    /**
     * Tokenizer constructor. Singleton, thus protected!
     *
     * Picks the minimum token length (IDX_MINWORDLENGTH when defined, the
     * MINWORDLENGTH default otherwise) and loads the stop word list.
     */
    protected function __construct()
    {
        // set the minimum token length to use in the index
        // (note, this doesn't apply to numeric tokens)
        $this->MinWordLength = (defined('IDX_MINWORDLENGTH'))
            ? IDX_MINWORDLENGTH
            : MINWORDLENGTH;

        $this->getStopwords();
    }

    /**
     * Get new or existing singleton instance of the Tokenizer
     *
     * @return static
     */
    public static function getInstance()
    {
        if (is_null(static::$instance)) {
            static::$instance = new static();
        }
        return static::$instance;
    }

    /**
     * Returns words that will be ignored
     *
     * The list is read once from inc/lang/<lang>/stopwords.txt (one entry per
     * line) and cached on the instance. A missing or unreadable file results
     * in an empty list.
     *
     * @return array                list of stop words
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function getStopwords()
    {
        if (!is_array($this->Stopwords)) {
            global $conf;
            $swFile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';

            // file() returns false when the file cannot be read; never cache
            // that, because callers expect an array
            $lines = file_exists($swFile)
                ? file($swFile, FILE_IGNORE_NEW_LINES)
                : false;
            $this->Stopwords = is_array($lines) ? $lines : array();
        }
        return $this->Stopwords;
    }

    /**
     * Returns minimum word length to be used in the index
     *
     * @return int
     */
    public function getMinWordLength()
    {
        return $this->MinWordLength;
    }

    /**
     * Split the text into words for fulltext search
     *
     * The text is split on spaces after line breaks and tabs were turned into
     * spaces, soft hyphens were removed and special characters were stripped.
     * Every word is lower-cased. Non-numeric words shorter than the minimum
     * word length and words found in the stop word list are dropped.
     *
     * @triggers INDEXER_TEXT_PREPARE
     * This event allows plugins to modify the text before it gets tokenized.
     * Plugins intercepting this event should also intercept INDEX_VERSION_GET
     *
     * @param string    $text   plain text
     * @param bool      $wc     are wildcards allowed?
     * @return array            list of words in the text
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function getWords($text, $wc=false)
    {
        // the asterisk is stripped as well unless wildcards are allowed
        $wc = ($wc) ? '' : '\*';

        // prepare the text to be tokenized
        $event = new Event('INDEXER_TEXT_PREPARE', $text);
        if ($event->advise_before(true)) {
            // pure ASCII alphanumeric text needs no Asian word separation
            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
                $text = Utf8\Asian::separateAsianWords($text);
            }
        }
        $event->advise_after();
        unset($event);

        $text = strtr(
            $text,
            array(
                "\r" => ' ',
                "\n" => ' ',
                "\t" => ' ',
                "\xC2\xAD" => '', //soft-hyphen
            )
        );
        if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
            $text = Utf8\Clean::stripspecials($text, ' ', '\._\-:'.$wc);
        }

        // build the stop word lookup once instead of searching the list per word
        $stopwords = array_flip($this->getStopwords());

        $words = array();
        foreach (explode(' ', $text) as $word) {
            // the UTF-8 aware lower-casing is only needed for non-ASCII words
            $word = (preg_match('/[^0-9A-Za-z]/u', $word))
                ? Utf8\PhpString::strtolower($word)
                : strtolower($word);

            // too short (byte length); numeric tokens are exempt from the limit
            if (!is_numeric($word) && strlen($word) < $this->MinWordLength) {
                continue;
            }
            if (isset($stopwords[$word])) {
                continue;
            }
            $words[] = $word;
        }
        return $words;
    }
}
137