xref: /dokuwiki/inc/Search/Tokenizer.php (revision 596d5287d7a816d606ef4153ef9e0f4704bf8f73)
1094ebf29SSatoshi Sahara<?php
2094ebf29SSatoshi Sahara
3094ebf29SSatoshi Saharanamespace dokuwiki\Search;
4094ebf29SSatoshi Sahara
5094ebf29SSatoshi Saharause dokuwiki\Extension\Event;
6094ebf29SSatoshi Saharause dokuwiki\Utf8;
7094ebf29SSatoshi Sahara
// Default minimum token length (in bytes) for words kept in the index.
// Numeric tokens are exempt from this limit. Defining IDX_MINWORDLENGTH
// overrides this default (see Tokenizer::getMinWordLength()).
const MINWORDLENGTH = 2;

/**
 * DokuWiki Tokenizer class
 *
 * Splits plain text into lower-cased search tokens and drops stop words as
 * well as non-numeric tokens that are shorter than the minimum word length.
 */
class Tokenizer
{
    /** @var array $Stopwords Words that tokenizer ignores (lazily loaded, cached per request) */
    protected static $Stopwords;

    /** @var int $MinWordLength minimum token length (lazily resolved, cached per request) */
    protected static $MinWordLength;

    /**
     * Returns words that will be ignored
     *
     * The list is read once from the stopwords.txt file of the configured
     * language and cached in a static property. A missing or unreadable file
     * results in an empty list.
     *
     * @return array  list of stop words
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public static function getStopwords()
    {
        if (!isset(static::$Stopwords)) {
            global $conf;
            $swFile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';

            $words = false;
            if (is_readable($swFile)) {
                $words = file($swFile, FILE_IGNORE_NEW_LINES);
            }

            // file() returns false on failure; never cache that, because the
            // callers expect an array
            static::$Stopwords = is_array($words) ? $words : array();
        }
        return static::$Stopwords;
    }

    /**
     * Returns minimum word length to be used in the index
     *
     * Uses the IDX_MINWORDLENGTH constant when it is defined and falls back to
     * MINWORDLENGTH otherwise. Note that the limit does not apply to numeric
     * tokens.
     *
     * @return int
     */
    public static function getMinWordLength()
    {
        if (!isset(static::$MinWordLength)) {
            static::$MinWordLength = (defined('IDX_MINWORDLENGTH'))
                ? IDX_MINWORDLENGTH
                : MINWORDLENGTH;
        }
        return static::$MinWordLength;
    }

    /**
     * Split the text into words for fulltext search
     *
     * The text is passed through the INDEXER_TEXT_PREPARE event, whitespace is
     * normalized, special characters are stripped and every word is
     * lower-cased. Stop words and non-numeric words shorter than the minimum
     * word length are removed from the result.
     *
     * @triggers INDEXER_TEXT_PREPARE
     * This event allows plugins to modify the text before it gets tokenized.
     * Plugins intercepting this event should also intercept INDEX_VERSION_GET
     *
     * @param string $text plain text
     * @param bool $wc are wildcards allowed?
     * @return array  list of words in the text
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public static function getWords($text, $wc = false)
    {
        // the asterisk is only added to the strip list when wildcards are not allowed
        $wildcard = ($wc) ? '' : '\*';

        // prepare the text to be tokenized
        $event = new Event('INDEXER_TEXT_PREPARE', $text);
        if ($event->advise_before(true)) {
            // pure ASCII alphanumeric text needs no Asian word separation
            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
                $text = Utf8\Asian::separateAsianWords($text);
            }
        }
        $event->advise_after();
        unset($event);

        $text = strtr($text, array(
                "\r" => ' ',
                "\n" => ' ',
                "\t" => ' ',
                "\xC2\xAD" => '', //soft-hyphen
        ));
        if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
            $text = Utf8\Clean::stripspecials($text, ' ', '\._\-:'.$wildcard);
        }

        // resolve the loop invariants once; the flipped list allows constant
        // time stop word lookups instead of a linear search per word
        $minLength = static::getMinWordLength();
        $stopwords = array_flip(static::getStopwords());

        $words = array();
        foreach (explode(' ', $text) as $word) {
            // the multibyte aware lower-casing is only needed for non-ASCII words
            $word = (preg_match('/[^0-9A-Za-z]/u', $word))
                ? Utf8\PhpString::strtolower($word)
                : strtolower($word);

            // numeric tokens are kept regardless of their length
            if (!is_numeric($word) && strlen($word) < $minLength) {
                continue;
            }
            if (isset($stopwords[$word])) {
                continue;
            }
            $words[] = $word;
        }
        return $words;
    }

    /**
     * Measure the length of a string
     *
     * Differs from strlen in handling of asian characters, otherwise byte lengths are used.
     * Every byte in the range 0xE2-0xEF found in the token adds its distance
     * from 0xE1 (1 to 14) on top of the byte length.
     *
     * @param string $token
     * @return int
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     */
    public static function tokenLength($token)
    {
        $length = strlen($token);
        // If left alone, all chinese "words" will have the same length of 3, so the "length" of a "word" is faked
        if (preg_match_all('/[\xE2-\xEF]/', $token, $leadbytes)) {
            foreach ($leadbytes[0] as $byte) {
                $length += ord($byte) - 0xE1;
            }
        }
        return $length;
    }
}
137