xref: /dokuwiki/inc/Search/Tokenizer.php (revision 1755450b32c7398d40e13297b3285732f4296005)
1094ebf29SSatoshi Sahara<?php
2094ebf29SSatoshi Sahara
3094ebf29SSatoshi Saharanamespace dokuwiki\Search;
4094ebf29SSatoshi Sahara
5094ebf29SSatoshi Saharause dokuwiki\Extension\Event;
6094ebf29SSatoshi Saharause dokuwiki\Utf8;
7094ebf29SSatoshi Sahara
// Default minimum token length to use in the index; Tokenizer::getMinWordLength()
// falls back to this value when the IDX_MINWORDLENGTH constant is not defined.
// Length is compared with strlen(), i.e. it is counted in bytes.
// (note, this doesn't apply to numeric tokens)
const MINWORDLENGTH = 2;
11094ebf29SSatoshi Sahara
/**
 * DokuWiki Tokenizer class
 *
 * Splits plain text into lower-cased tokens for the fulltext index and
 * drops tokens that are too short or that appear in the stop word list
 * of the configured language.
 *
 * The stop word list and the minimum word length are resolved lazily and
 * cached in static properties for the rest of the request.
 */
class Tokenizer
{
    /** @var array $Stopwords Words that tokenizer ignores (null until first requested) */
    protected static $Stopwords;

    /** @var int $MinWordLength minimum token length (null until first requested) */
    protected static $MinWordLength;

    /**
     * Returns words that will be ignored
     *
     * The list is read once from inc/lang/<lang>/stopwords.txt (one word per
     * line) below DOKU_INC, where <lang> is $conf['lang']. When that path is
     * not a readable regular file, or reading it fails, an empty list is used
     * so that callers can always rely on getting an array back.
     *
     * @return array  list of stop words
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public static function getStopwords()
    {
        if (!isset(static::$Stopwords)) {
            global $conf;
            $swFile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';

            $words = false;
            if (is_file($swFile) && is_readable($swFile)) {
                $words = file($swFile, FILE_IGNORE_NEW_LINES);
            }
            // file() returns false on failure; never cache that, because a
            // non-array haystack breaks the stop word lookup in getWords()
            static::$Stopwords = ($words === false) ? array() : $words;
        }
        return static::$Stopwords;
    }

    /**
     * Returns minimum word length to be used in the index
     *
     * Uses the IDX_MINWORDLENGTH constant when it is defined and falls back
     * to MINWORDLENGTH otherwise. The limit is not applied to numeric tokens.
     *
     * @return int
     */
    public static function getMinWordLength()
    {
        if (!isset(static::$MinWordLength)) {
            static::$MinWordLength = (defined('IDX_MINWORDLENGTH'))
                ? IDX_MINWORDLENGTH
                : MINWORDLENGTH;
        }
        return static::$MinWordLength;
    }

    /**
     * Split the text into words for fulltext search
     *
     * The text is passed through the INDEXER_TEXT_PREPARE event, whitespace
     * control characters are turned into spaces, soft hyphens are removed,
     * special characters are stripped and the result is split on spaces.
     * Each word is lower-cased; non-numeric words shorter than the minimum
     * word length and stop words are dropped.
     *
     * @triggers INDEXER_TEXT_PREPARE
     * This event allows plugins to modify the text before it gets tokenized.
     * Plugins intercepting this event should also intercept INDEX_VERSION_GET
     *
     * @param string $text plain text
     * @param bool $wc are wildcards allowed?
     * @return array  list of words in the text
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function getWords($text, $wc = false)
    {
        // unless wildcards are allowed, '*' is added to the extra characters
        // handed to stripspecials() below
        $wc = ($wc) ? '' : '\*';

        // prepare the text to be tokenized
        $event = new Event('INDEXER_TEXT_PREPARE', $text);
        if ($event->advise_before(true)) {
            // pure ASCII alphanumeric text needs no Asian word separation
            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
                $text = Utf8\Asian::separateAsianWords($text);
            }
        }
        $event->advise_after();
        unset($event);

        $text = strtr($text, array(
                "\r" => ' ',
                "\n" => ' ',
                "\t" => ' ',
                "\xC2\xAD" => '', //soft-hyphen
        ));
        if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
            $text = Utf8\Clean::stripspecials($text, ' ', '\._\-:'.$wc);
        }

        // both values are constant for the whole word list, resolve them once
        $minLength = static::getMinWordLength();
        $stopwords = static::getStopwords();

        $words = array();
        foreach (explode(' ', $text) as $word) {
            // the multibyte aware lower-casing is only needed for non-ASCII words
            $word = (preg_match('/[^0-9A-Za-z]/u', $word))
                ? Utf8\PhpString::strtolower($word)
                : strtolower($word);

            // too short (numeric tokens are exempt from the length limit)
            if (!is_numeric($word) && strlen($word) < $minLength) {
                continue;
            }
            if (in_array($word, $stopwords, true)) {
                continue;
            }
            $words[] = $word;
        }
        return $words;
    }
}
114