MinWordLength = (defined('IDX_MINWORDLENGTH')) ? IDX_MINWORDLENGTH : MINWORDLENGTH; $this->getStopwords(); } /** * Get new or existing singleton instance of the Tokenizer * * @return PagewordIndex */ public static function getInstance() { if (is_null(static::$instance)) { static::$instance = new static(); } return static::$instance; } /** * Returns words that will be ignored * * @return array list of stop words * * @author Tom N Harris */ public function getStopwords() { if (!isset($this->Stopwords)) { global $conf; $swFile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt'; if (file_exists($swFile)) { $this->Stopwords = file($swFile, FILE_IGNORE_NEW_LINES); } else { $this->Stopwords = array(); } } return $this->Stopwords; } /** * Returns minimum word length to be used in the index */ public function getMinWordLength() { return $this->MinWordLength; } /** * Split the text into words for fulltext search * * @triggers INDEXER_TEXT_PREPARE * This event allows plugins to modify the text before it gets tokenized. * Plugins intercepting this event should also intercept INDEX_VERSION_GET * * @param string $text plain text * @param bool $wc are wildcards allowed? * @return array list of words in the text * * @author Tom N Harris * @author Andreas Gohr */ public function getWords($text, $wc=false) { $wc = ($wc) ? '' : '\*'; // prepare the text to be tokenized $event = new Event('INDEXER_TEXT_PREPARE', $text); if ($event->advise_before(true)) { if (preg_match('/[^0-9A-Za-z ]/u', $text)) { $text = Utf8\Asian::separateAsianWords($text); } } $event->advise_after(); unset($event); $text = strtr($text, array( "\r" => ' ', "\n" => ' ', "\t" => ' ', "\xC2\xAD" => '', //soft-hyphen ) ); if (preg_match('/[^0-9A-Za-z ]/u', $text)) { $text = Utf8\Clean::stripspecials($text, ' ', '\._\-:'.$wc); } $wordlist = explode(' ', $text); foreach ($wordlist as $i => $word) { $wordlist[$i] = (preg_match('/[^0-9A-Za-z]/u', $word)) ? Utf8\PhpString::strtolower($word) : strtolower($word); } foreach ($wordlist as $i => $word) { if ((!is_numeric($word) && strlen($word) < $this->MinWordLength) || array_search($word, $this->getStopwords(), true) !== false) { unset($wordlist[$i]); } } return array_values($wordlist); } }