<?php

namespace Vanderlee\Sentence;

/**
 * Segments sentences.
 * Clipping may not be perfect.
 * Sentence count should be VERY close to the truth.
 *
 * Multibyte safe (at least for UTF-8), but the rules are based on Germanic
 * language structure (English, Dutch, German). Should work for most
 * latin-alphabet languages.
 *
 * @author Martijn van der Lee (@vanderlee)
 * @author @marktaw
 */
class Sentence
{

    /**
     * Specify this flag with the split method to trim whitespace.
     */
    const SPLIT_TRIM = 0x1;

    /**
     * List of characters used to terminate sentences.
     *
     * @var string[]
     */
    private $terminals = ['.', '!', '?'];

    /**
     * List of characters used for abbreviations.
     *
     * @var string[]
     */
    private $abbreviators = ['.'];

    /**
     * Numbers removed from the line currently being processed, keyed by the
     * index that is embedded in their in-text replacement code.
     *
     * @var string[]
     */
    private $replacements = [];

    /**
     * Generate an in-text replacement code for the specified index.
     *
     * The index is wrapped in the STX (0x02) and ETX (0x03) control
     * characters. The delimiters make every code unique and keep it from
     * colliding with digits of the restored numbers (a purely numeric code
     * such as "213" could be mistaken for a real number or for part of
     * another code, e.g. "2213").
     *
     * @param int $index
     *
     * @return string
     */
    private function getReplaceCode(int $index): string
    {
        return "\x02" . $index . "\x03";
    }

    /**
     * Replace every (floating point) number in the text by an in-text
     * replacement code, so that decimal points are not seen as terminals.
     *
     * The removed numbers are stored in $this->replacements; any previously
     * stored replacements are discarded.
     *
     * @param string $text
     *
     * @return string
     */
    private function replaceFloatNumbers(string $text): string
    {
        preg_match_all('!\d+(?:\.\d+)?!', $text, $matches, PREG_OFFSET_CAPTURE);

        $this->replacements = [];
        $index = 0;

        // Work back-to-front so the offsets of earlier matches stay valid.
        foreach (array_reverse($matches[0]) as $match) {
            list($number, $offset) = $match;

            $this->replacements[$index] = $number;

            // PREG_OFFSET_CAPTURE offsets and substr_replace() are both
            // byte based, so the length has to be in bytes as well.
            $text = (string)substr_replace($text, $this->getReplaceCode($index), $offset, strlen($number));

            ++$index;
        }

        return $text;
    }

    /**
     * Restore the numbers that were replaced by replaceFloatNumbers().
     *
     * @param string[] $text
     *
     * @return string[]
     */
    private function restoreReplacements($text): array
    {
        $codes = [];
        $numbers = [];
        foreach ($this->replacements as $index => $number) {
            $codes[] = $this->getReplaceCode($index);
            $numbers[] = $number;
        }

        return array_map(function ($value) use ($codes, $numbers) {
            return str_replace($codes, $numbers, $value);
        }, $text);
    }

    /**
     * Breaks a piece of text into lines by linebreak.
     * Eats up any linebreak characters as if one.
     *
     * Every part returned by the splitter is appended to the current line;
     * a part that trims to an empty string closes that line. The remaining
     * (possibly empty) line is always appended at the end.
     *
     * NOTE(review): relies on Multibyte::split() accepting preg_split()-style
     * limit/flags arguments; that helper is not visible here.
     *
     * @param string $text
     *
     * @return string[]
     */
    private static function linebreakSplit($text): array
    {
        $lines = [];
        $line = '';

        foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
            $line .= $part;
            if (Multibyte::trim($part) === '') {
                $lines[] = $line;
                $line = '';
            }
        }
        $lines[] = $line;

        return $lines;
    }

    /**
     * Splits a line by (consecutive sequences of) terminals, keeping the
     * terminals.
     *
     * Multibyte safe (at least for UTF-8)
     *
     * For example:
     *  "There ... is. More!"
     *      ... becomes ...
     *  [ "There ", "...", " is", ".", " More", "!" ]
     *
     * @param string $line
     *
     * @return string[]
     */
    private function punctuationSplit($line): array
    {
        // The empty pattern with the "u" modifier splits per UTF-8 character.
        $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY);
        if (empty($chars)) {
            // Empty line, or preg_split() failed (e.g. on invalid UTF-8).
            return [];
        }

        $parts = [];
        $part = '';
        $is_terminal = in_array($chars[0], $this->terminals, true);

        foreach ($chars as $char) {
            // Start a new part whenever we switch between terminal and
            // non-terminal characters.
            if (in_array($char, $this->terminals, true) !== $is_terminal) {
                $parts[] = $part;
                $part = '';
                $is_terminal = !$is_terminal;
            }
            $part .= $char;
        }

        // Explicit comparison: empty() would also discard the string "0".
        if ($part !== '') {
            $parts[] = $part;
        }

        return $parts;
    }

    /**
     * Appends each terminal item after its preceding non-terminals.
     *
     * Multibyte safe (at least for UTF-8)
     *
     * For example:
     *  [ "There ", "...", " is", ".", " More", "!" ]
     *      ... becomes ...
     *  [ "There ... is.", " More!" ]
     *
     * @param string[] $punctuations
     *
     * @return string[]
     */
    private function punctuationMerge($punctuations): array
    {
        // Terminals that are never used for abbreviations ('!' and '?').
        $definite_terminals = array_diff($this->terminals, $this->abbreviators);

        $merges = [];
        $merge = '';

        foreach ($punctuations as $punctuation) {
            if ($punctuation === '') {
                continue;
            }

            $merge .= $punctuation;

            // A single terminal character always closes the fragment; a
            // longer item only does so if it contains a definite terminal
            // (so "..." does not, but "?!" does).
            $closes = mb_strlen($punctuation) === 1
                && in_array($punctuation, $this->terminals, true);

            if (!$closes) {
                foreach ($definite_terminals as $terminal) {
                    if (mb_strpos($punctuation, $terminal) !== false) {
                        $closes = true;
                        break;
                    }
                }
            }

            if ($closes) {
                $merges[] = $merge;
                $merge = '';
            }
        }

        // Explicit comparison: empty() would also discard the string "0".
        if ($merge !== '') {
            $merges[] = $merge;
        }

        return $merges;
    }

    /**
     * Looks for capitalized abbreviations & includes them with the following fragment.
     *
     * For example:
     *  [ "Last week, former director of the F.B.I. James B. Comey was fired. Mr. Comey was not available for comment." ]
     *      ... becomes ...
     *  [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
     *  [ "Mr. Comey was not available for comment." ]
     *
     * @param string[] $fragments
     *
     * @return string[]
     */
    private function abbreviationMerge($fragments): array
    {
        $return_fragment = [];

        $previous_fragment = '';
        $previous_is_abbreviation = false;
        $i = 0;
        foreach ($fragments as $fragment) {
            $is_abbreviation = self::isAbreviation($fragment);

            // merge previous fragment with this
            if ($previous_is_abbreviation) {
                $fragment = $previous_fragment . $fragment;
            }
            // While $i is not incremented this overwrites the slot that
            // holds the (growing) fragment ending in an abbreviation.
            $return_fragment[$i] = $fragment;

            $previous_is_abbreviation = $is_abbreviation;
            $previous_fragment = $fragment;

            // only increment if this isn't an abbreviation
            if (!$is_abbreviation) {
                $i++;
            }
        }

        return $return_fragment;
    }

    /**
     * Check if the fragment ends in "." and its last word starts with a
     * capital and is at most 3 characters long (including the period).
     *
     * @param string $fragment
     *
     * @return bool
     */
    private static function isAbreviation($fragment): bool
    {
        $trimmed = Multibyte::trim($fragment);
        $words = mb_split('\s+', $trimmed);

        $last_word = Multibyte::trim($words[count($words) - 1]);

        return preg_match('#^\p{Lu}#u', $last_word) === 1
            && mb_substr($trimmed, -1) === '.'
            && mb_strlen($last_word) <= 3;
    }

    /**
     * Merges any part starting with a closing parenthesis ')' to the previous
     * part.
     *
     * @param string[] $parts
     *
     * @return string[]
     */
    private function parenthesesMerge($parts): array
    {
        $subsentences = [];

        foreach ($parts as $part) {
            // The emptiness check avoids an invalid string offset on ''.
            $starts_with_close = $part !== '' && $part[0] === ')';

            if ($starts_with_close && !empty($subsentences)) {
                $subsentences[count($subsentences) - 1] .= $part;
            } else {
                $subsentences[] = $part;
            }
        }

        return $subsentences;
    }

    /**
     * Looks for closing quotes to include them with the previous statement.
     * "That was very interesting," he said.
     * "That was very interesting."
     *
     * The returned array is not necessarily zero-based; only the order of
     * its values is meaningful.
     *
     * @param string[] $statements
     *
     * @return string[]
     */
    private function closeQuotesMerge($statements): array
    {
        $i = 0;
        $previous_statement = '';
        $return = [];
        foreach ($statements as $statement) {
            if (self::isEndQuote($statement)) {
                // Keep the index, so the merged statement replaces the
                // previous one.
                $statement = $previous_statement . $statement;
            } else {
                $i++;
            }

            $return[$i] = $statement;
            $previous_statement = $statement;
        }

        return $return;
    }

    /**
     * Check if the entire string is a quotation mark, or starts with a
     * quotation mark, then a space, then a lowercase letter.
     *
     * The lowercase test uses the Unicode "Ll" property, so non-ASCII
     * lowercase letters are recognised as well (ctype_lower() only handles
     * single-byte characters).
     *
     * @param string $statement
     *
     * @return bool
     */
    private static function isEndQuote($statement): bool
    {
        $quotes = ['"', '\''];

        if (in_array(Multibyte::trim($statement), $quotes, true)) {
            return true;
        }

        return in_array(mb_substr($statement, 0, 1), $quotes, true)
            && mb_substr($statement, 1, 1) === ' '
            && preg_match('#^\p{Ll}#u', mb_substr($statement, 2, 1)) === 1;
    }

    /**
     * Merges items into larger sentences.
     * Multibyte safe
     *
     * A new sentence is started when the previous item ended in a terminal
     * that is not an abbreviator, or when the sentence being built already
     * holds a multi-word item and the current item has multiple words too.
     *
     * @param string[] $shorts
     *
     * @return string[]
     */
    private function sentenceMerge($shorts): array
    {
        $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);

        $sentences = [];

        $sentence = '';
        $has_words = false;
        $previous_word_ending = null;
        foreach ($shorts as $short) {
            $word_count = count(mb_split('\s+', Multibyte::trim($short)));
            $after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals, true);

            if ($after_non_abbreviating_terminal
                || ($has_words && $word_count > 1)) {

                $sentences[] = $sentence;

                $sentence = '';
                $has_words = false;
            }

            $has_words = $has_words
                || $word_count > 1;

            $sentence .= $short;
            $previous_word_ending = mb_substr($short, -1);
        }

        // Explicit comparison: empty() would also discard the string "0".
        if ($sentence !== '') {
            $sentences[] = $sentence;
        }

        return $sentences;
    }

    /**
     * Return the sentences detected in the provided text.
     * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
     *
     * @param string  $text
     * @param integer $flags
     *
     * @return string[]
     */
    public function split($text, $flags = 0)
    {
        // Names of the methods applied, in order, to every non-blank line.
        // The first takes a string, the others pass string lists along.
        static $pipeline = [
            'replaceFloatNumbers',
            'punctuationSplit',
            'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
            'punctuationMerge',
            'abbreviationMerge',
            'closeQuotesMerge',
            'sentenceMerge',
            'restoreReplacements',
        ];

        // clean funny quotes
        $text = Multibyte::cleanUnicode($text);

        // Split
        $sentences = [];
        foreach (self::linebreakSplit($text) as $input) {
            if (Multibyte::trim($input) !== '') {
                foreach ($pipeline as $method) {
                    $input = $this->$method($input);
                }
                $sentences = array_merge($sentences, $input);
            }
        }

        // Post process
        if ($flags & self::SPLIT_TRIM) {
            return self::trimSentences($sentences);
        }

        return $sentences;
    }

    /**
     * Multibyte trim each string in an array.
     *
     * @param string[] $sentences
     *
     * @return string[]
     */
    private static function trimSentences($sentences): array
    {
        return array_map(static function ($sentence) {
            return Multibyte::trim($sentence);
        }, $sentences);
    }

    /**
     * Return the number of sentences detected in the provided text.
     *
     * @param string $text
     *
     * @return integer
     */
    public function count($text)
    {
        return count($this->split($text));
    }

}