<?php

namespace Vanderlee\Sentence;

/**
 * Segments sentences.
 * Clipping may not be perfect.
 * Sentence count should be VERY close to the truth.
 *
 * Multibyte safe (at least for UTF-8), but the rules are based on Germanic
 * language structure (English, Dutch, German). Should work for most
 * Latin-alphabet languages.
 *
 * @author Martijn van der Lee (@vanderlee)
 * @author @marktaw
 */
class Sentence
{

    /**
     * Specify this flag with the split method to trim whitespace.
     */
    const SPLIT_TRIM = 0x1;

    /**
     * List of characters used to terminate sentences.
     *
     * @var string[]
     */
    private $terminals = ['.', '!', '?'];

    /**
     * List of characters used for abbreviations.
     *
     * @var string[]
     */
    private $abbreviators = ['.'];

    /**
     * Numbers removed from the line currently being processed, keyed by
     * the index used to build their in-text replacement code.
     *
     * @var string[]
     */
    private $replacements = [];

    /**
     * Generate an in-text replacement code for the specified index.
     *
     * The index is wrapped in the STX (0x02) and ETX (0x03) control
     * characters. They must be string escapes: concatenating the integer
     * literals 0x02 and 0x03 yields the digits "2" and "3", which produces
     * codes such as "203" that collide with real numbers in the text and
     * with each other ("2203" contains "203").
     *
     * @param int $index
     *
     * @return string
     */
    private function getReplaceCode(int $index): string
    {
        return "\x02" . $index . "\x03";
    }

    /**
     * Hide integer and decimal numbers by replacing each of them with an
     * in-text code, so that a decimal point is not mistaken for a terminal.
     *
     * The originals are stored in $this->replacements (reset on every call)
     * and are put back by restoreReplacements().
     *
     * @param string $text
     *
     * @return string
     */
    private function replaceFloatNumbers(string $text): string
    {
        preg_match_all('!\d+(?:\.\d+)?!', $text, $matches, PREG_OFFSET_CAPTURE);

        $this->replacements = [];
        $index = 0;

        // Work from the last match to the first so that the byte offsets of
        // the matches still to be processed stay valid.
        foreach (array_reverse($matches[0]) as $match) {
            $number = $match[0];
            $offset = $match[1];

            $this->replacements[$index] = $number;

            // PREG_OFFSET_CAPTURE and substr_replace() both work in bytes,
            // so the length must be a byte length as well.
            $text = (string)substr_replace($text, $this->getReplaceCode($index), $offset, strlen($number));

            ++$index;
        }

        return $text;
    }

    /**
     * Restore any stored replacements.
     *
     * @param string[] $text
     *
     * @return string[]
     */
    private function restoreReplacements($text): array
    {
        $map = [];
        foreach ($this->replacements as $index => $number) {
            $map[$this->getReplaceCode($index)] = $number;
        }

        // strtr() substitutes in a single pass and never rescans text it
        // has already replaced, so a restored number cannot be mistaken
        // for another code.
        return array_map(static function ($value) use ($map) {
            return strtr($value, $map);
        }, $text);
    }

    /**
     * Breaks a piece of text into lines by linebreak.
     * Any run of consecutive linebreak characters is treated as a single
     * break and stays attached to the end of the line before it.
     *
     * Multibyte safe
     *
     * @param string $text
     *
     * @return string[]
     */
    private static function linebreakSplit($text): array
    {
        $lines = [];
        $line = '';

        foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
            $line .= $part;

            // A part that trims down to nothing closes the current line.
            if (Multibyte::trim($part) === '') {
                $lines[] = $line;
                $line = '';
            }
        }
        $lines[] = $line;

        return $lines;
    }

    /**
     * Splits a line by (consecutive sequences of) terminals, keeping the
     * terminals.
     *
     * Multibyte safe (at least for UTF-8)
     *
     * For example:
     *  "There ... is. More!"
     *      ... becomes ...
     *  [ "There ", "...", " is", ".", " More", "!" ]
     *
     * @param string $line
     *
     * @return string[]
     */
    private function punctuationSplit($line): array
    {
        // The /u modifier makes this split on UTF-8 characters, not bytes.
        $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY);

        // Nothing to split: an empty line, or a failed split.
        if ($chars === false || $chars === []) {
            return [];
        }

        $parts = [];
        $part = '';
        $is_terminal = in_array($chars[0], $this->terminals, true);

        foreach ($chars as $char) {
            // Start a new part whenever we cross between terminal and
            // non-terminal characters.
            if (in_array($char, $this->terminals, true) !== $is_terminal) {
                $parts[] = $part;
                $part = '';
                $is_terminal = !$is_terminal;
            }
            $part .= $char;
        }

        if ($part !== '') {
            $parts[] = $part;
        }

        return $parts;
    }

    /**
     * Appends each terminal item after its preceding non-terminals.
     *
     * A merge is closed by a single terminal character, or by a run that
     * contains a terminal which is not also an abbreviator.
     *
     * Multibyte safe (at least for UTF-8)
     *
     * For example:
     *  [ "There ", "...", " is", ".", " More", "!" ]
     *      ... becomes ...
     *  [ "There ... is.", " More!" ]
     *
     * @param string[] $punctuations
     *
     * @return string[]
     */
    private function punctuationMerge($punctuations): array
    {
        $definite_terminals = array_diff($this->terminals, $this->abbreviators);

        $merges = [];
        $merge = '';

        $filtered = array_filter($punctuations, static function ($p) {
            return $p !== '';
        });

        foreach ($filtered as $punctuation) {
            $merge .= $punctuation;

            $closes = mb_strlen($punctuation) === 1
                && in_array($punctuation, $this->terminals, true);

            if (!$closes) {
                foreach ($definite_terminals as $terminal) {
                    if (mb_strpos($punctuation, $terminal) !== false) {
                        $closes = true;
                        break;
                    }
                }
            }

            if ($closes) {
                $merges[] = $merge;
                $merge = '';
            }
        }

        if ($merge !== '') {
            $merges[] = $merge;
        }

        return $merges;
    }

    /**
     * Looks for capitalized abbreviations & includes them with the following fragment.
     *
     * For example:
     *  [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
     *      ... becomes ...
     *  [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
     *  [ "Mr. Comey was not available for comment." ]
     *
     * @param string[] $fragments
     *
     * @return string[]
     */
    private function abbreviationMerge($fragments): array
    {
        $merged = [];
        $previous_is_abbreviation = false;

        foreach ($fragments as $fragment) {
            // Judge the fragment on its own, before anything is glued to it.
            $is_abbreviation = self::isAbreviation($fragment);

            if ($previous_is_abbreviation && !empty($merged)) {
                // The previous fragment ended in an abbreviation, so this
                // fragment continues it.
                $merged[count($merged) - 1] .= $fragment;
            } else {
                $merged[] = $fragment;
            }

            $previous_is_abbreviation = $is_abbreviation;
        }

        return $merged;
    }

    /**
     * Check if the fragment ends in "." and its last word starts with a
     * capital and is at most 3 characters long (trailing period included).
     *
     * @param string $fragment
     *
     * @return bool
     */
    private static function isAbreviation($fragment): bool
    {
        $trimmed = Multibyte::trim($fragment);
        $words = mb_split('\s+', $trimmed);

        $last_word = Multibyte::trim($words[count($words) - 1]);

        return preg_match('#^\p{Lu}#u', $last_word) === 1
            && mb_substr($trimmed, -1) === '.'
            && mb_strlen($last_word) <= 3;
    }

    /**
     * Merges any part starting with a closing parenthesis ')' to the previous
     * part.
     *
     * @param string[] $parts
     *
     * @return string[]
     */
    private function parenthesesMerge($parts): array
    {
        $subsentences = [];

        foreach ($parts as $part) {
            // Check for an empty string first; reading offset 0 of ''
            // raises a warning.
            if ($part !== '' && $part[0] === ')' && !empty($subsentences)) {
                $subsentences[count($subsentences) - 1] .= $part;
            } else {
                $subsentences[] = $part;
            }
        }

        return $subsentences;
    }

    /**
     * Looks for closing quotes to include them with the previous statement.
     *  "That was very interesting," he said.
     *  "That was very interesting."
     *
     * @param string[] $statements
     *
     * @return string[]
     */
    private function closeQuotesMerge($statements): array
    {
        $merged = [];

        foreach ($statements as $statement) {
            if (self::isEndQuote($statement) && !empty($merged)) {
                $merged[count($merged) - 1] .= $statement;
            } else {
                $merged[] = $statement;
            }
        }

        return $merged;
    }

    /**
     * Check if the statement is nothing but a quotation mark, or starts with
     * a quotation mark followed by one space and a lowercase letter.
     *
     * @param string $statement
     *
     * @return bool
     */
    private static function isEndQuote($statement): bool
    {
        $quotes = ['"', '\''];

        if (in_array(Multibyte::trim($statement), $quotes, true)) {
            return true;
        }

        // \p{Ll} recognises lowercase letters in any script; ctype_lower()
        // is byte based and locale dependent, so it rejects multibyte
        // letters such as "é".
        return in_array(mb_substr($statement, 0, 1), $quotes, true)
            && mb_substr($statement, 1, 1) === ' '
            && preg_match('#^\p{Ll}#u', mb_substr($statement, 2, 1)) === 1;
    }

    /**
     * Merges items into larger sentences.
     *
     * A new sentence is started when the previous item ended with a terminal
     * that is not an abbreviator, or when the sentence being built already
     * holds a multi-word item and the current item is multi-word too.
     *
     * Multibyte safe
     *
     * @param string[] $shorts
     *
     * @return string[]
     */
    private function sentenceMerge($shorts): array
    {
        $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);

        $sentences = [];

        $sentence = '';
        $has_words = false;
        $previous_word_ending = null;

        foreach ($shorts as $short) {
            $word_count = count(mb_split('\s+', Multibyte::trim($short)));
            $after_non_abbreviating_terminal = in_array(
                $previous_word_ending,
                $non_abbreviating_terminals,
                true
            );

            if ($after_non_abbreviating_terminal
                || ($has_words && $word_count > 1)) {
                $sentences[] = $sentence;

                $sentence = '';
                $has_words = false;
            }

            $has_words = $has_words || $word_count > 1;

            $sentence .= $short;
            $previous_word_ending = mb_substr($short, -1);
        }

        if ($sentence !== '') {
            $sentences[] = $sentence;
        }

        return $sentences;
    }

    /**
     * Return the sentences detected in the provided text.
     * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
     *
     * @param string  $text
     * @param integer $flags
     *
     * @return string[]
     */
    public function split($text, $flags = 0)
    {
        // Each step receives the result of the previous one. The first step
        // takes a string and the second turns it into an array of pieces.
        static $pipeline = [
            'replaceFloatNumbers',
            'punctuationSplit',
            'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
            'punctuationMerge',
            'abbreviationMerge',
            'closeQuotesMerge',
            'sentenceMerge',
            'restoreReplacements',
        ];

        // clean funny quotes
        $text = Multibyte::cleanUnicode($text);

        // Split
        $sentences = [];
        foreach (self::linebreakSplit($text) as $input) {
            // Lines that are blank after trimming are skipped.
            if (Multibyte::trim($input) !== '') {
                foreach ($pipeline as $method) {
                    $input = $this->$method($input);
                }
                $sentences = array_merge($sentences, $input);
            }
        }

        // Post process
        if ($flags & self::SPLIT_TRIM) {
            return self::trimSentences($sentences);
        }

        return $sentences;
    }

    /**
     * Multibyte trim each string in an array.
     *
     * @param string[] $sentences
     *
     * @return string[]
     */
    private static function trimSentences($sentences): array
    {
        return array_map(static function ($sentence) {
            return Multibyte::trim($sentence);
        }, $sentences);
    }

    /**
     * Return the number of sentences detected in the provided text.
     *
     * @param string $text
     *
     * @return integer
     */
    public function count($text)
    {
        return count($this->split($text));
    }

}