<?php

namespace dokuwiki\Search;

use dokuwiki\Extension\Event;
use dokuwiki\Utf8;

// Fallback minimum token length to use in the index; only used when the
// IDX_MINWORDLENGTH constant is not defined
// (note, this doesn't apply to numeric tokens)
const MINWORDLENGTH = 2;

/**
 * DokuWiki Tokenizer class (Singleton)
 *
 * Splits plain text into lower-cased words, dropping stop words and words
 * that are shorter than the configured minimum length.
 */
class Tokenizer
{
    /** @var Tokenizer|null the shared instance, created lazily by getInstance() */
    protected static $instance = null;

    /** @var array $Stopwords Words that tokenizer ignores */
    protected $Stopwords;

    /** @var int $MinWordLength minimum token length */
    protected $MinWordLength;

    /**
     * Tokenizer constructor. Singleton, thus protected!
     *
     * Picks the minimum token length (IDX_MINWORDLENGTH when that constant is
     * defined, otherwise MINWORDLENGTH from this namespace) and loads the
     * stop word list.
     */
    protected function __construct()
    {
        // set the minimum token length to use in the index
        // (note, this doesn't apply to numeric tokens)
        $this->MinWordLength = defined('IDX_MINWORDLENGTH')
            ? IDX_MINWORDLENGTH
            : MINWORDLENGTH;

        $this->getStopwords();
    }

    /**
     * Get new or existing singleton instance of the Tokenizer
     *
     * @return Tokenizer
     */
    public static function getInstance()
    {
        if (is_null(static::$instance)) {
            static::$instance = new static();
        }
        return static::$instance;
    }

    /**
     * Returns words that will be ignored
     *
     * The list is read once from inc/lang/<$conf['lang']>/stopwords.txt below
     * DOKU_INC (one word per line) and cached on the instance. A missing or
     * unreadable file results in an empty list.
     *
     * @return array list of stop words
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function getStopwords()
    {
        if (!isset($this->Stopwords)) {
            global $conf;
            $swFile = DOKU_INC . 'inc/lang/' . $conf['lang'] . '/stopwords.txt';

            $lines = file_exists($swFile)
                ? file($swFile, FILE_IGNORE_NEW_LINES)
                : false;

            // file() returns false when the file cannot be read; never cache
            // that, callers rely on getting an array
            $this->Stopwords = ($lines === false) ? array() : $lines;
        }
        return $this->Stopwords;
    }

    /**
     * Returns minimum word length to be used in the index
     *
     * @return int
     */
    public function getMinWordLength()
    {
        return $this->MinWordLength;
    }

    /**
     * Split the text into words for fulltext search
     *
     * @triggers INDEXER_TEXT_PREPARE
     * This event allows plugins to modify the text before it gets tokenized.
     * Plugins intercepting this event should also intercept INDEX_VERSION_GET
     *
     * @param string $text plain text
     * @param bool $wc are wildcards allowed?
     * @return array list of words in the text
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function getWords($text, $wc = false)
    {
        // extra characters handed to stripspecials() below: the asterisk is
        // only added to that list when wildcards are not allowed
        $wc = ($wc) ? '' : '\*';

        // prepare the text to be tokenized; the built-in preparation (Asian
        // word separation) only runs when advise_before() returns true
        $event = new Event('INDEXER_TEXT_PREPARE', $text);
        if ($event->advise_before(true) && preg_match('/[^0-9A-Za-z ]/u', $text)) {
            $text = Utf8\Asian::separateAsianWords($text);
        }
        $event->advise_after();
        unset($event);

        // line breaks and tabs become word separators, soft hyphens vanish
        $text = strtr(
            $text,
            array(
                "\r" => ' ',
                "\n" => ' ',
                "\t" => ' ',
                "\xC2\xAD" => '', //soft-hyphen
            )
        );
        // pure ASCII alphanumeric text needs no special character handling
        if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
            $text = Utf8\Clean::stripspecials($text, ' ', '\._\-:' . $wc);
        }

        // flip once so every stop word check is a hash lookup instead of a
        // linear search through the whole list
        $stopwords = array_flip($this->getStopwords());
        $minLength = $this->MinWordLength;

        $words = array();
        foreach (explode(' ', $text) as $word) {
            // the UTF-8 aware lower-casing is only needed for non-ASCII words
            $word = preg_match('/[^0-9A-Za-z]/u', $word)
                ? Utf8\PhpString::strtolower($word)
                : strtolower($word);

            // strlen() counts bytes, not characters; kept that way on purpose
            // so the set of indexed tokens does not change. Numeric tokens
            // are exempt from the minimum length.
            if (!is_numeric($word) && strlen($word) < $minLength) {
                continue;
            }
            if (isset($stopwords[$word])) {
                continue;
            }
            $words[] = $word;
        }
        return $words;
    }
}