<?php

namespace dokuwiki\Search;

use dokuwiki\Extension\Event;
use dokuwiki\Utf8;

// Default minimum token length to use in the index
// (note, this doesn't apply to numeric tokens).
// A global IDX_MINWORDLENGTH constant, when defined, takes precedence.
const MINWORDLENGTH = 2;

/**
 * DokuWiki Tokenizer class (Singleton)
 *
 * Splits plain text into the list of lower-cased words used by the fulltext
 * index, dropping stop words and tokens shorter than the minimum word length.
 */
class Tokenizer
{
    /** @var Tokenizer|null the shared instance, created on first use */
    protected static $instance = null;

    /** @var array $Stopwords Words that tokenizer ignores */
    protected $Stopwords;

    /** @var int $MinWordLength minimum token length */
    protected $MinWordLength;

    /**
     * Tokenizer constructor. Singleton, thus protected!
     *
     * Resolves the minimum word length and loads the stop word list.
     */
    protected function __construct()
    {
        // set the minimum token length to use in the index
        // (note, this doesn't apply to numeric tokens)
        $this->MinWordLength = (defined('IDX_MINWORDLENGTH'))
            ? IDX_MINWORDLENGTH
            : MINWORDLENGTH;

        // called for its side effect: populates $this->Stopwords
        $this->getStopwords();
    }

    /**
     * Get new or existing singleton instance of the Tokenizer
     *
     * @return Tokenizer
     */
    public static function getInstance()
    {
        if (is_null(static::$instance)) {
            static::$instance = new static();
        }
        return static::$instance;
    }

    /**
     * Returns words that will be ignored
     *
     * The list is read once from inc/lang/<lang>/stopwords.txt (one word per
     * line) for the language set in $conf['lang'] and cached on the instance.
     * A missing or unreadable file results in an empty list.
     *
     * @return array list of stop words
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     */
    public function getStopwords()
    {
        if (!isset($this->Stopwords)) {
            global $conf;
            $swFile = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';

            $lines = file_exists($swFile)
                ? file($swFile, FILE_IGNORE_NEW_LINES)
                : false;

            // file() returns false when the file cannot be read; never store
            // that, because getWords() needs an array to look words up in
            $this->Stopwords = ($lines === false) ? array() : $lines;
        }
        return $this->Stopwords;
    }

    /**
     * Returns minimum word length to be used in the index
     *
     * @return int minimum token length (not applied to numeric tokens)
     */
    public function getMinWordLength()
    {
        return $this->MinWordLength;
    }

    /**
     * Split the text into words for fulltext search
     *
     * @triggers INDEXER_TEXT_PREPARE
     * This event allows plugins to modify the text before it gets tokenized.
     * Plugins intercepting this event should also intercept INDEX_VERSION_GET
     *
     * @param string $text plain text
     * @param bool $wc are wildcards allowed?
     * @return array list of words in the text
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author Andreas Gohr <andi@splitbrain.org>
     */
    public function getWords($text, $wc = false)
    {
        // unless wildcards are allowed, '*' joins the characters handed to
        // stripspecials() below
        $wc = ($wc) ? '' : '\*';

        // prepare the text to be tokenized
        $event = new Event('INDEXER_TEXT_PREPARE', $text);
        if ($event->advise_before(true)) {
            // default action, skipped for text made only of ASCII
            // alphanumerics and spaces
            if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
                $text = Utf8\Asian::separateAsianWords($text);
            }
        }
        $event->advise_after();
        unset($event);

        $text = strtr(
            $text,
            array(
                "\r" => ' ',
                "\n" => ' ',
                "\t" => ' ',
                "\xC2\xAD" => '', //soft-hyphen
            )
        );
        if (preg_match('/[^0-9A-Za-z ]/u', $text)) {
            $text = Utf8\Clean::stripspecials($text, ' ', '\._\-:'.$wc);
        }

        // Build the stop word lookup once instead of searching the whole list
        // for every word. Array keys are normalised the same way on insert and
        // on lookup, so this matches exactly what a strict search would.
        $stopwords = $this->getStopwords();
        $isStopword = $stopwords ? array_flip($stopwords) : array();

        $words = array();
        foreach (explode(' ', $text) as $word) {
            // plain ASCII words take the cheap native strtolower()
            $word = (preg_match('/[^0-9A-Za-z]/u', $word))
                ? Utf8\PhpString::strtolower($word)
                : strtolower($word);

            // the minimum length is measured in bytes and never applies to
            // numeric tokens
            if (!is_numeric($word) && strlen($word) < $this->MinWordLength) {
                continue;
            }
            if (isset($isStopword[$word])) {
                continue;
            }
            $words[] = $word;
        }
        return $words;
    }
}