1<?php
2
3namespace Vanderlee\Sentence;
4
/**
 * Segments sentences.
 * Clipping may not be perfect.
 * Sentence count should be VERY close to the truth.
 *
 * Multibyte safe (at least for UTF-8), but the rules are based on Germanic
 * language structure (English, Dutch, German). Should work for most
 * Latin-alphabet languages.
 *
 * @author Martijn van der Lee (@vanderlee)
 * @author @marktaw
 */
class Sentence
{

    /**
     * Specify this flag with the split method to trim whitespace.
     */
    const SPLIT_TRIM = 0x1;

    /**
     * List of characters used to terminate sentences.
     *
     * @var string[]
     */
    private $terminals = ['.', '!', '?'];

    /**
     * List of characters used for abbreviations.
     *
     * @var string[]
     */
    private $abbreviators = ['.'];

    /**
     * Numbers removed from the line currently being processed, keyed by the
     * index embedded in their placeholder code.
     *
     * @var string[]
     */
    private $replacements = [];

    /**
     * Generate the in-text placeholder code for the specified index.
     *
     * The code is the index wrapped in the STX (0x02) and ETX (0x03) control
     * characters. Real control characters are used (not the integers 2 and 3)
     * so that a placeholder can never be confused with digits in the text or
     * with a part of another placeholder.
     *
     * @param int $index
     *
     * @return string
     */
    private function getReplaceCode(int $index)
    {
        return "\x02" . $index . "\x03";
    }

    /**
     * Replace every integer or decimal number in the text by a placeholder
     * code, so the decimal point is not mistaken for a sentence terminal.
     *
     * The removed numbers are stored in $this->replacements (which is reset
     * on every call) and can be put back with restoreReplacements().
     *
     * @param string $text
     *
     * @return string The text with all numbers replaced by placeholder codes.
     */
    private function replaceFloatNumbers(string $text)
    {
        $this->replacements = [];

        // preg_match_all() yields 0 when nothing matched and false on failure;
        // in both cases there is nothing to replace.
        if (!preg_match_all('!\d+(?:\.\d+)?!', $text, $matches, PREG_OFFSET_CAPTURE)) {
            return $text;
        }

        // Work from the last match to the first, so the byte offsets of the
        // matches that still have to be processed remain valid.
        $index = 0;
        foreach (array_reverse($matches[0]) as $match) {
            list($number, $offset) = $match;

            $this->replacements[$index] = $number;

            // Offsets reported by PCRE are byte offsets, so the length must be
            // measured in bytes as well.
            $text = (string)substr_replace($text, $this->getReplaceCode($index), $offset, strlen($number));

            ++$index;
        }

        return $text;
    }

    /**
     * Put the numbers stored by replaceFloatNumbers() back into the text.
     *
     * All placeholders are substituted in a single pass, so a number that has
     * just been restored is never scanned again for other placeholders.
     *
     * @param string[] $text
     *
     * @return string[]
     */
    private function restoreReplacements($text)
    {
        if (empty($this->replacements)) {
            return $text;
        }

        $codes = [];
        foreach ($this->replacements as $index => $number) {
            $codes[$this->getReplaceCode($index)] = $number;
        }

        return array_map(static function ($value) use ($codes) {
            return strtr($value, $codes);
        }, $text);
    }

    /**
     * Breaks a piece of text into lines by linebreak.
     * Any run of consecutive linebreak characters is treated as one break and
     * is kept at the end of the line it terminates.
     *
     * @param string $text
     *
     * @return string[]
     */
    private static function linebreakSplit($text)
    {
        $lines = [];
        $line = '';

        // The capture group makes the split return the linebreak runs as
        // separate parts, in between the text parts.
        foreach (Multibyte::split('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
            $line .= $part;

            // A part that trims down to nothing (such as a linebreak run)
            // closes the current line.
            if (Multibyte::trim($part) === '') {
                $lines[] = $line;
                $line = '';
            }
        }
        $lines[] = $line;

        return $lines;
    }

    /**
     * Splits a line into alternating runs of non-terminal and terminal
     * characters, keeping the terminals.
     *
     * Multibyte safe (at least for UTF-8)
     *
     * For example:
     *    "There ... is. More!"
     *        ... becomes ...
     *    [ "There ", "...", " is", ".", " More", "!" ]
     *
     * @param string $line
     *
     * @return string[] Empty when the line is empty or cannot be split into
     *                  UTF-8 characters.
     */
    private function punctuationSplit($line)
    {
        $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!

        // preg_split() returns false for invalid UTF-8 and an empty list for
        // an empty line; there is no first character to inspect in either case.
        if (empty($chars)) {
            return [];
        }

        $parts = [];
        $part = '';
        $is_terminal = in_array($chars[0], $this->terminals, true);

        foreach ($chars as $char) {
            // Start a new part whenever the kind of character flips.
            if (in_array($char, $this->terminals, true) !== $is_terminal) {
                $parts[] = $part;
                $part = '';
                $is_terminal = !$is_terminal;
            }
            $part .= $char;
        }

        // Compare against '' explicitly; empty() would also reject "0".
        if ($part !== '') {
            $parts[] = $part;
        }

        return $parts;
    }

    /**
     * Appends each terminal item after its preceding non-terminals.
     *
     * A fragment is closed after a single terminal character, or after any
     * part that contains a terminal which is not also an abbreviator
     * ("!" and "?" with the default settings).
     *
     * Multibyte safe (at least for UTF-8)
     *
     * For example:
     *    [ "There ", "...", " is", ".", " More", "!" ]
     *        ... becomes ...
     *    [ "There ... is.", " More!" ]
     *
     * @param string[] $punctuations
     *
     * @return string[]
     */
    private function punctuationMerge($punctuations)
    {
        $definite_terminals = array_diff($this->terminals, $this->abbreviators);

        $merges = [];
        $merge = '';

        foreach ($punctuations as $punctuation) {
            if ($punctuation === '') {
                continue;
            }

            $merge .= $punctuation;

            $closes = mb_strlen($punctuation) === 1
                && in_array($punctuation, $this->terminals, true);

            if (!$closes) {
                foreach ($definite_terminals as $terminal) {
                    if (mb_strpos($punctuation, $terminal) !== false) {
                        $closes = true;
                        break;
                    }
                }
            }

            if ($closes) {
                $merges[] = $merge;
                $merge = '';
            }
        }

        // Compare against '' explicitly; empty() would also reject "0".
        if ($merge !== '') {
            $merges[] = $merge;
        }

        return $merges;
    }

    /**
     * Looks for capitalized abbreviations & includes them with the following fragment.
     *
     * For example:
     *    [ "Last week, former director of the F.", "B.", "I.", " James B.", " Comey was fired.", " Mr.", " Comey was not available for comment." ]
     *        ... becomes ...
     *    [ "Last week, former director of the F.B.I. James B. Comey was fired." ]
     *    [ " Mr. Comey was not available for comment." ]
     *
     * @param string[] $fragments
     *
     * @return string[]
     */
    private function abbreviationMerge($fragments)
    {
        $return_fragment = [];

        $previous_fragment = '';
        $previous_is_abbreviation = false;
        $i = 0;
        foreach ($fragments as $fragment) {
            // Decided on the fragment itself, before anything is prepended.
            $is_abbreviation = self::isAbreviation($fragment);

            // merge previous fragment with this
            if ($previous_is_abbreviation) {
                $fragment = $previous_fragment . $fragment;
            }

            // While the slot index is not advanced, this overwrites the
            // abbreviation stored earlier with the merged, longer fragment.
            $return_fragment[$i] = $fragment;

            $previous_is_abbreviation = $is_abbreviation;
            $previous_fragment = $fragment;

            // only increment if this isn't an abbreviation
            if (!$is_abbreviation) {
                $i++;
            }
        }

        return $return_fragment;
    }

    /**
     * Check if the fragment ends in "." and its last word starts with a
     * capital and is at most 3 characters long (the "." included).
     *
     * @param string $fragment
     *
     * @return bool
     */
    private static function isAbreviation($fragment)
    {
        $trimmed = Multibyte::trim($fragment);
        $words = mb_split('\s+', $trimmed);

        $last_word = Multibyte::trim($words[count($words) - 1]);

        return preg_match('#^\p{Lu}#u', $last_word) === 1
            && mb_substr($trimmed, -1) === '.'
            && mb_strlen($last_word) <= 3;
    }

    /**
     * Merges any part starting with a closing parenthesis ')' to the previous
     * part.
     *
     * @param string[] $parts
     *
     * @return string[]
     */
    private function parenthesesMerge($parts)
    {
        $subsentences = [];

        foreach ($parts as $part) {
            // The emptiness check prevents reading offset 0 of an empty string.
            $starts_with_closing = $part !== '' && $part[0] === ')';

            if ($starts_with_closing && !empty($subsentences)) {
                $subsentences[count($subsentences) - 1] .= $part;
            } else {
                $subsentences[] = $part;
            }
        }

        return $subsentences;
    }

    /**
     * Looks for closing quotes to include them with the previous statement.
     * "That was very interesting," he said.
     * "That was very interesting."
     *
     * Note that the keys of the returned array do not necessarily start at 0.
     *
     * @param string[] $statements
     *
     * @return string[]
     */
    private function closeQuotesMerge($statements)
    {
        $i = 0;
        $previous_statement = '';
        $return = [];
        foreach ($statements as $statement) {
            if (self::isEndQuote($statement)) {
                // Keep the slot index, so the previous statement is replaced
                // by the previous statement plus this closing quote.
                $statement = $previous_statement . $statement;
            } else {
                $i++;
            }

            $return[$i] = $statement;
            $previous_statement = $statement;
        }

        return $return;
    }

    /**
     * Check if the statement is nothing but a quotation mark, or starts with
     * a quotation mark followed by a space and a lowercase character.
     *
     * @param string $statement
     *
     * @return bool
     */
    private static function isEndQuote($statement)
    {
        $quotes = ['"', '\''];

        if (in_array(Multibyte::trim($statement), $quotes, true)) {
            return true;
        }

        return in_array(mb_substr($statement, 0, 1), $quotes, true)
            && mb_substr($statement, 1, 1) === ' '
            && ctype_lower(mb_substr($statement, 2, 1)) === true;
    }

    /**
     * Merges items into larger sentences.
     *
     * A new sentence is started after an item ending in a terminal which is
     * not also an abbreviator, or when the sentence collected so far already
     * has an item of more than one word and the next item has more than one
     * word as well.
     *
     * @param string[] $shorts
     *
     * @return string[]
     */
    private function sentenceMerge($shorts)
    {
        $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);

        $sentences = [];

        $sentence = '';
        $has_words = false;
        $previous_word_ending = null;
        foreach ($shorts as $short) {
            $word_count = count(mb_split('\s+', Multibyte::trim($short)));
            $is_multi_word = $word_count > 1;
            $after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals, true);

            if ($after_non_abbreviating_terminal
                || ($has_words && $is_multi_word)) {

                $sentences[] = $sentence;

                $sentence = '';
                $has_words = false;
            }

            $has_words = $has_words || $is_multi_word;

            $sentence .= $short;
            $previous_word_ending = mb_substr($short, -1);
        }

        // Compare against '' explicitly; empty() would also reject "0".
        if ($sentence !== '') {
            $sentences[] = $sentence;
        }

        return $sentences;
    }

    /**
     * Return the sentences detected in the provided text.
     * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
     *
     * @param string  $text
     * @param integer $flags
     *
     * @return string[]
     */
    public function split($text, $flags = 0)
    {
        // Private methods applied, in this order, to every non-blank line.
        // The first takes and returns a string, the second turns the string
        // into a list of parts and the remaining ones map a list to a list.
        static $pipeline = [
            'replaceFloatNumbers',
            'punctuationSplit',
            'parenthesesMerge', // also works after punctuationMerge or abbreviationMerge
            'punctuationMerge',
            'abbreviationMerge',
            'closeQuotesMerge',
            'sentenceMerge',
            'restoreReplacements',
        ];

        // clean funny quotes
        $text = Multibyte::cleanUnicode($text);

        // Split
        $sentences = [];
        foreach (self::linebreakSplit($text) as $input) {
            if (Multibyte::trim($input) === '') {
                continue;
            }

            foreach ($pipeline as $method) {
                $input = $this->$method($input);
            }
            $sentences = array_merge($sentences, $input);
        }

        // Post process
        if ($flags & self::SPLIT_TRIM) {
            return self::trimSentences($sentences);
        }

        return $sentences;
    }

    /**
     * Trim each string in an array using Multibyte::trim().
     *
     * @param string[] $sentences
     *
     * @return string[]
     */
    private static function trimSentences($sentences)
    {
        return array_map(static function ($sentence) {
            return Multibyte::trim($sentence);
        }, $sentences);
    }

    /**
     * Return the number of sentences detected in the provided text.
     *
     * @param string $text
     *
     * @return integer
     */
    public function count($text)
    {
        return count($this->split($text));
    }

}
469