xref: /dokuwiki/inc/Search/Tokenizer.php (revision 094ebf29c8846c56a4fe657a639ccf9ded62d429)
1*094ebf29SSatoshi Sahara<?php
2*094ebf29SSatoshi Sahara
3*094ebf29SSatoshi Saharanamespace dokuwiki\Search;
4*094ebf29SSatoshi Sahara
5*094ebf29SSatoshi Saharause dokuwiki\Extension\Event;
6*094ebf29SSatoshi Saharause dokuwiki\Utf8;
7*094ebf29SSatoshi Sahara
// Default minimum token length to use in the index. The Tokenizer constructor
// uses the IDX_MINWORDLENGTH constant instead when that one is defined.
// (note, this doesn't apply to numeric tokens)
const MINWORDLENGTH = 2;

/**
 * DokuWiki Tokenizer class (Singleton)
 *
 * Splits plain text into lowercased words for the fulltext index, dropping
 * stop words and non-numeric words shorter than the minimum word length.
 */
class Tokenizer
{
    /** @var Tokenizer|null the shared instance, created by the first getInstance() call */
    protected static $instance = null;

    /** @var array|null $Stopwords Words that tokenizer ignores (unset until first loaded) */
    protected $Stopwords;

    /** @var int $MinWordLength  minimum token length */
    protected $MinWordLength;

    /**
     * Tokenizer constructor. Singleton, thus protected!
     *
     * Picks the minimum word length and loads the stop word list.
     */
    protected function __construct()
    {
        // set the minimum token length to use in the index
        // (note, this doesn't apply to numeric tokens)
        $this->MinWordLength = (defined('IDX_MINWORDLENGTH'))
            ? IDX_MINWORDLENGTH
            : MINWORDLENGTH;

        $this->getStopwords();
    }

    /**
     * Get new or existing singleton instance of the Tokenizer
     *
     * @return static
     */
    public static function getInstance()
    {
        if (is_null(static::$instance)) {
            static::$instance = new static();
        }
        return static::$instance;
    }

    /**
     * Returns words that will be ignored
     *
     * The list is read once from inc/lang/<$conf['lang']>/stopwords.txt below
     * DOKU_INC (one word per line) and cached on the instance. A missing or
     * unreadable file results in an empty list.
     *
     * @return array                list of stop words
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function getStopwords()
    {
        if (!isset($this->Stopwords)) {
            global $conf;
            $swFile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
            $words = file_exists($swFile)
                ? file($swFile, FILE_IGNORE_NEW_LINES)
                : false;
            // file() returns false when reading fails; never cache that, callers expect an array
            $this->Stopwords = is_array($words) ? $words : array();
        }
        return $this->Stopwords;
    }

    /**
     * Returns minimum word length to be used in the index
     *
     * @return int
     */
    public function getMinWordLength()
    {
        return $this->MinWordLength;
    }

    /**
     * Split the text into words for fulltext search
     *
     * The returned words are lowercased. Stop words and non-numeric words
     * shorter than the minimum word length are left out.
     *
     * @triggers INDEXER_TEXT_PREPARE
     * This event allows plugins to modify the text before it gets tokenized.
     * Plugins intercepting this event should also intercept INDEX_VERSION_GET
     *
     * @param string    $text   plain text
     * @param bool      $wc     are wildcards allowed?
     * @return array            list of words in the text
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function getWords($text, $wc = false)
    {
        // unless wildcards are allowed, '*' is added to the character list handed to stripspecials()
        $wildcardChars = ($wc) ? '' : '\*';

        // prepare the text to be tokenized
        $event = new Event('INDEXER_TEXT_PREPARE', $text);
        // NOTE(review): presumably advise_before() returns false when a handler
        // prevented the default action - confirm against the Event class
        if ($event->advise_before(true)) {
            // plain ASCII alphanumeric text is not passed to the Asian word separation
            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
                $text = Utf8\Asian::separateAsianWords($text);
            }
        }
        $event->advise_after();
        unset($event);

        $text = strtr(
            $text,
            array(
                "\r" => ' ',
                "\n" => ' ',
                "\t" => ' ',
                "\xC2\xAD" => '', //soft-hyphen
            )
        );
        if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
            // NOTE(review): the third argument presumably lists additional characters
            // to replace - confirm against Utf8\Clean::stripspecials()
            $text = Utf8\Clean::stripspecials($text, ' ', '\._\-:'.$wildcardChars);
        }

        // build the stop word lookup once per call instead of searching the list for every word
        $stopwords = array_flip($this->getStopwords());
        $minLength = $this->MinWordLength;

        $words = array();
        foreach (explode(' ', $text) as $word) {
            // only words containing something else than ASCII alphanumerics need the UTF-8 aware lowercasing
            $word = (preg_match('/[^0-9A-Za-z]/u', $word))
                ? Utf8\PhpString::strtolower($word)
                : strtolower($word);

            // strlen() counts bytes, so a single multibyte character can pass a minimum length of 2.
            // NOTE(review): presumably intended for separated Asian characters - confirm
            // before switching to a character count
            if (!is_numeric($word) && strlen($word) < $minLength) {
                continue;
            }
            if (isset($stopwords[$word])) {
                continue;
            }
            $words[] = $word;
        }
        return $words;
    }
}
137