xref: /plugin/aichat/vendor/vanderlee/php-sentence/src/Sentence.php (revision 8817535b0c67f8b10e9b8c05dcdf58fc17827423)
1*8817535bSAndreas Gohr<?php
2*8817535bSAndreas Gohr
3*8817535bSAndreas Gohrnamespace Vanderlee\Sentence;
4*8817535bSAndreas Gohr
/**
 * Segments sentences.
 * Clipping may not be perfect.
 * Sentence count should be VERY close to the truth.
 *
 * Multibyte safe (at least for UTF-8), but the rules are based on Germanic
 * language structure (English, Dutch, German). Should work for most
 * Latin-alphabet languages.
 *
 * @author Martijn van der Lee (@vanderlee)
 * @author @marktaw
 */
class Sentence
{

    /**
     * Specify this flag with the split method to trim whitespace.
     */
    const SPLIT_TRIM = 0x1;

    /**
     * List of characters used to terminate sentences.
     *
     * @var string[]
     */
    private $terminals = ['.', '!', '?'];

    /**
     * List of characters used for abbreviations.
     *
     * @var string[]
     */
    private $abbreviators = ['.'];

    /**
     * Numbers masked out of the line currently being processed, keyed by
     * the index that is embedded in their in-text replacement code.
     *
     * @var string[]
     */
    private $replacements = [];

    /**
     * Generate an in-text replacement code for the specified index.
     *
     * The code is the index wrapped in the STX (0x02) and ETX (0x03) control
     * characters, so it contains no terminal and cannot be confused with
     * ordinary digits in the text.
     *
     * @param int $index
     *
     * @return string
     */
    private function getReplaceCode(int $index): string
    {
        // These must be string escapes: the integer literals 0x02 / 0x03 would
        // be cast to the digits "2" and "3" when concatenated, producing codes
        // such as "203" that collide with real numbers on restore.
        return "\x02" . $index . "\x03";
    }

    /**
     * Mask numbers (integers and decimals such as "3.14") by replacing each
     * of them with an in-text replacement code, so that a decimal point is
     * not mistaken for a sentence terminal.
     *
     * Resets and refills $this->replacements for the given text.
     *
     * @param string $text
     *
     * @return string The text with every number replaced by its code.
     */
    private function replaceFloatNumbers(string $text): string
    {
        $this->replacements = [];

        if (!preg_match_all('!\d+(?:\.\d+)?!', $text, $matches, PREG_OFFSET_CAPTURE)) {
            return $text;
        }

        // Work back to front so the byte offsets of earlier matches stay valid
        // while the text changes length.
        $index = 0;
        foreach (array_reverse($matches[0]) as $match) {
            list($number, $offset) = $match;

            $this->replacements[$index] = $number;

            // Offsets from PREG_OFFSET_CAPTURE are byte offsets, so the length
            // has to be measured in bytes as well.
            $text = (string)substr_replace($text, $this->getReplaceCode($index), $offset, strlen($number));

            ++$index;
        }

        return $text;
    }

    /**
     * Restore any stored replacements.
     *
     * @param string[] $text Sentences that may contain replacement codes.
     *
     * @return string[] The same sentences with the original numbers put back.
     */
    private function restoreReplacements($text): array
    {
        // strtr() with an empty map returns false before PHP 8, so bail early.
        if (empty($this->replacements)) {
            return $text;
        }

        $pairs = [];
        foreach ($this->replacements as $index => $number) {
            $pairs[$this->getReplaceCode($index)] = $number;
        }

        // strtr() substitutes all codes in a single pass and never re-scans
        // text it has already restored.
        return array_map(function ($value) use ($pairs) {
            return strtr($value, $pairs);
        }, $text);
    }

    /**
     * Breaks a piece of text into lines by linebreak.
     * Eats up any linebreak characters as if one.
     *
     * Each returned line keeps its trailing linebreak characters; the last
     * entry holds whatever follows the final linebreak (possibly nothing).
     *
     * @param string $text
     *
     * @return string[]
     */
    private static function linebreakSplit($text): array
    {
        $lines = [];
        $line = '';

        foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
            $line .= $part;

            // A part that trims to nothing closes the current line
            // (the captured linebreak delimiters are such parts).
            if (Multibyte::trim($part) === '') {
                $lines[] = $line;
                $line = '';
            }
        }
        $lines[] = $line;

        return $lines;
    }

    /**
     * Splits a line by (consecutive sequences of) terminals, keeping
     * the terminals.
     *
     * Multibyte safe (at least for UTF-8)
     *
     * For example:
     *    "There ... is. More!"
     *        ... becomes ...
     *    [ "There ", "...", " is", ".", " More", "!" ]
     *
     * @param string $line
     *
     * @return string[]
     */
    private function punctuationSplit($line): array
    {
        // Splitting on the empty pattern with the "u" modifier yields one
        // element per UTF-8 character.
        $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars === false || $chars === []) {
            // Empty input, or input that is not valid UTF-8: nothing to split.
            return (string)$line === '' ? [] : [$line];
        }

        $parts = [];
        $part = '';
        $is_terminal = in_array($chars[0], $this->terminals, true);

        foreach ($chars as $char) {
            // Start a new part whenever we cross between terminal and
            // non-terminal characters.
            if (in_array($char, $this->terminals, true) !== $is_terminal) {
                $parts[] = $part;
                $part = '';
                $is_terminal = !$is_terminal;
            }
            $part .= $char;
        }

        if ($part !== '') {
            $parts[] = $part;
        }

        return $parts;
    }

    /**
     * Appends each terminal item after its preceding non-terminals.
     *
     * A single terminal character always closes the current item. A longer
     * run only closes it when the run contains a terminal that is not also
     * an abbreviator (by default "!" or "?"), so "..." does not end an item.
     *
     * Multibyte safe (at least for UTF-8)
     *
     * For example:
     *    [ "There ", "...", " is", ".", " More", "!" ]
     *        ... becomes ...
     *    [ "There ... is.", " More!" ]
     *
     * @param string[] $punctuations
     *
     * @return string[]
     */
    private function punctuationMerge($punctuations): array
    {
        $definite_terminals = array_diff($this->terminals, $this->abbreviators);

        $merges = [];
        $merge = '';

        foreach ($punctuations as $punctuation) {
            if ($punctuation === '') {
                continue;
            }

            $merge .= $punctuation;

            $closes = mb_strlen($punctuation) === 1
                && in_array($punctuation, $this->terminals, true);

            if (!$closes) {
                foreach ($definite_terminals as $terminal) {
                    if (mb_strpos($punctuation, $terminal) !== false) {
                        $closes = true;
                        break;
                    }
                }
            }

            if ($closes) {
                $merges[] = $merge;
                $merge = '';
            }
        }

        if ($merge !== '') {
            $merges[] = $merge;
        }

        return $merges;
    }

    /**
     * Looks for capitalized abbreviations & includes them with the following fragment.
     *
     * For example:
     *    [ "Last week, former director of the F.", "B.", "I.", " James B.",
     *      " Comey was fired.", " Mr.", " Comey was not available for comment." ]
     *        ... becomes ...
     *    [ "Last week, former director of the F.B.I. James B. Comey was fired.",
     *      " Mr. Comey was not available for comment." ]
     *
     * @param string[] $fragments
     *
     * @return string[]
     */
    private function abbreviationMerge($fragments): array
    {
        $return_fragment = [];

        $previous_fragment = '';
        $previous_is_abbreviation = false;
        $i = 0;
        foreach ($fragments as $fragment) {
            // Judge the fragment on its own, before anything is glued in front.
            $is_abbreviation = self::isAbreviation($fragment);

            // The previous fragment ended in an abbreviation: continue it.
            if ($previous_is_abbreviation) {
                $fragment = $previous_fragment . $fragment;
            }

            // Slot $i keeps being overwritten until a fragment that does not
            // end in an abbreviation completes it.
            $return_fragment[$i] = $fragment;

            $previous_is_abbreviation = $is_abbreviation;
            $previous_fragment = $fragment;

            if (!$is_abbreviation) {
                $i++;
            }
        }

        return $return_fragment;
    }

    /**
     * Check if the last word of the fragment starts with a capital, the
     * fragment ends in "." and that last word is at most 3 characters long
     * (the period included).
     *
     * @param string $fragment
     *
     * @return bool
     */
    private static function isAbreviation($fragment): bool
    {
        $trimmed = Multibyte::trim($fragment);

        $words = mb_split('\s+', $trimmed);
        if ($words === false || $words === []) {
            return false;
        }

        $last_word = Multibyte::trim($words[count($words) - 1]);

        return preg_match('#^\p{Lu}#u', $last_word) === 1
            && mb_substr($trimmed, -1) === '.'
            && mb_strlen($last_word) <= 3;
    }

    /**
     * Merges any part starting with a closing parenthesis ')' to the previous
     * part.
     *
     * @param string[] $parts
     *
     * @return string[]
     */
    private function parenthesesMerge($parts): array
    {
        $subsentences = [];

        foreach ($parts as $part) {
            // substr() instead of $part[0]: safe for an empty part.
            $starts_with_close = substr($part, 0, 1) === ')';

            if ($starts_with_close && !empty($subsentences)) {
                $subsentences[count($subsentences) - 1] .= $part;
            } else {
                $subsentences[] = $part;
            }
        }

        return $subsentences;
    }

    /**
     * Looks for closing quotes to include them with the previous statement.
     * "That was very interesting," he said.
     * "That was very interesting."
     *
     * The keys of the returned array are not guaranteed to start at 0.
     *
     * @param string[] $statements
     *
     * @return string[]
     */
    private function closeQuotesMerge($statements): array
    {
        $i = 0;
        $previous_statement = '';
        $return = [];
        foreach ($statements as $statement) {
            if (self::isEndQuote($statement)) {
                // Glue onto the previous statement and overwrite its slot.
                $statement = $previous_statement . $statement;
            } else {
                $i++;
            }

            $return[$i] = $statement;
            $previous_statement = $statement;
        }

        return $return;
    }

    /**
     * Check if the entire string is a quotation mark, or starts with a
     * quotation mark, then a space, then a lowercase letter.
     *
     * @param string $statement
     *
     * @return bool
     */
    private static function isEndQuote($statement): bool
    {
        $quotes = ['"', '\''];

        if (in_array(Multibyte::trim($statement), $quotes, true)) {
            return true;
        }

        // \p{Ll} rather than ctype_lower(): the latter is byte based and does
        // not recognise multibyte lowercase letters.
        return in_array(mb_substr($statement, 0, 1), $quotes, true)
            && mb_substr($statement, 1, 1) === ' '
            && preg_match('#^\p{Ll}#u', mb_substr($statement, 2, 1)) === 1;
    }

    /**
     * Merges items into larger sentences.
     *
     * A new sentence is started when the previous item ended in a terminal
     * that is not an abbreviator, or when the sentence being built already
     * has a multi-word item and the next item is multi-word too.
     *
     * @param string[] $shorts
     *
     * @return string[]
     */
    private function sentenceMerge($shorts): array
    {
        $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);

        $sentences = [];

        $sentence = '';
        $has_words = false;
        $previous_word_ending = null;
        foreach ($shorts as $short) {
            $word_count = count(mb_split('\s+', Multibyte::trim($short)));
            $after_non_abbreviating_terminal = in_array(
                $previous_word_ending,
                $non_abbreviating_terminals,
                true
            );

            if ($after_non_abbreviating_terminal
                || ($has_words && $word_count > 1)) {
                $sentences[] = $sentence;

                $sentence = '';
                $has_words = false;
            }

            $has_words = $has_words || $word_count > 1;

            $sentence .= $short;
            $previous_word_ending = mb_substr($short, -1);
        }

        if ($sentence !== '') {
            $sentences[] = $sentence;
        }

        return $sentences;
    }

    /**
     * Return the sentences detected in the provided text.
     * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
     *
     * @param string  $text
     * @param integer $flags
     *
     * @return string[]
     */
    public function split($text, $flags = 0)
    {
        // Each stage receives the output of the previous one: the first takes
        // and returns a string, the second turns it into an array of parts,
        // the remaining ones transform that array.
        static $pipeline = [
            'replaceFloatNumbers',
            'punctuationSplit',
            'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
            'punctuationMerge',
            'abbreviationMerge',
            'closeQuotesMerge',
            'sentenceMerge',
            'restoreReplacements',
        ];

        // clean funny quotes
        $text = Multibyte::cleanUnicode($text);

        // Split, one line at a time
        $sentences = [];
        foreach (self::linebreakSplit($text) as $input) {
            if (Multibyte::trim($input) === '') {
                continue;
            }

            foreach ($pipeline as $method) {
                $input = $this->$method($input);
            }

            // Append as a plain list; the stage output keys are irrelevant.
            foreach ($input as $sentence) {
                $sentences[] = $sentence;
            }
        }

        // Post process
        if ($flags & self::SPLIT_TRIM) {
            return self::trimSentences($sentences);
        }

        return $sentences;
    }

    /**
     * Multibyte trim each string in an array.
     *
     * @param string[] $sentences
     *
     * @return string[]
     */
    private static function trimSentences($sentences): array
    {
        return array_map(function ($sentence) {
            return Multibyte::trim($sentence);
        }, $sentences);
    }

    /**
     * Return the number of sentences detected in the provided text.
     *
     * @param string $text
     *
     * @return integer
     */
    public function count($text)
    {
        return count($this->split($text));
    }

}
469