1: <?php
2:
3: 4: 5: 6: 7: 8: 9: 10: 11:
/**
 * Splits a text into sentences.
 *
 * The text is first cut into lines at line breaks. Every non-blank line is
 * then cut at the terminal characters ('.', '!', '?') and the pieces are
 * merged back together in three passes (punctuation, abbreviations, sentences).
 * All string handling uses the mbstring functions.
 */
class Sentence {
    /**
     * Flag for split(): strip leading and trailing whitespace from each
     * returned sentence.
     */
    const SPLIT_TRIM = 0x1;

    /**
     * Characters that may end a sentence.
     *
     * @var string[]
     */
    private $terminals = array('.', '!', '?');

    /**
     * Terminal characters that are also used inside abbreviations and
     * therefore do not, on their own, prove that a sentence has ended.
     *
     * @var string[]
     */
    private $abbreviators = array('.');

    /**
     * Multibyte-aware trim: removes whitespace from both ends of a string.
     *
     * @param string $string
     * @return string
     */
    private static function mbTrim($string) {
        return mb_ereg_replace('^\s*([\s\S]*?)\s*$', '\1', $string);
    }

    /**
     * Multibyte-aware counterpart of preg_split(), built on mb_ereg_search_*.
     *
     * All offsets and lengths handled here are byte based, which is what
     * mb_ereg_search_pos() reports and what mb_strcut() expects.
     *
     * @param string $pattern mb_ereg pattern that matches the delimiter
     * @param string $string  the string to split
     * @param int    $limit   maximum number of pieces; values <= 0 mean no limit
     * @param int    $flags   bitmask of PREG_SPLIT_NO_EMPTY,
     *                        PREG_SPLIT_DELIM_CAPTURE and PREG_SPLIT_OFFSET_CAPTURE
     * @return array the pieces, or array(piece, byte offset) pairs when
     *               PREG_SPLIT_OFFSET_CAPTURE is set
     */
    private static function mbSplit($pattern, $string, $limit = -1, $flags = 0) {
        $strlen = strlen($string);
        mb_ereg_search_init($string);

        $keep_empty = !($flags & PREG_SPLIT_NO_EMPTY);
        $with_delimiters = (bool) ($flags & PREG_SPLIT_DELIM_CAPTURE);
        $with_offsets = (bool) ($flags & PREG_SPLIT_OFFSET_CAPTURE);

        // Pass 1: record the string as alternating runs of
        // array(byte length, is delimiter, delimiter was captured by group 1).
        $lengths = array();
        $position = 0;
        while (($array = mb_ereg_search_pos($pattern, '')) !== false) {
            // Text between the previous delimiter and this one.
            $lengths[] = array($array[0] - $position, false, null);

            // Continue after this delimiter.
            $position = $array[0] + $array[1];

            // The delimiter itself. Compare explicitly instead of relying on
            // truthiness so that a captured "0" still counts as captured.
            $regs = mb_ereg_search_getregs();
            $is_captured = isset($regs[1]) && $regs[1] !== false && $regs[1] !== '';
            $lengths[] = array($array[1], true, $is_captured);

            // Nothing left to search.
            if ($position >= $strlen) {
                break;
            }
        }

        // Text after the last delimiter (possibly empty).
        $lengths[] = array($strlen - $position, false, null);

        // Pass 2: cut the string according to the recorded runs.
        $parts = array();
        $position = 0;
        $count = 1;
        foreach ($lengths as $length) {
            list($size, $is_delimiter, $is_captured) = $length;
            $is_wanted = $size > 0 || $keep_empty;

            // $count is only advanced for text runs that would be emitted;
            // once the limit is exceeded the rest of the string becomes the
            // final piece.
            if ($limit > 0 && !$is_delimiter && $is_wanted && ++$count > $limit) {
                $rest = mb_strcut($string, $position);
                $parts[] = $with_offsets ? array($rest, $position) : $rest;
                break;
            }

            if ($is_wanted && (!$is_delimiter || ($with_delimiters && $is_captured))) {
                $piece = mb_strcut($string, $position, $size);
                $parts[] = $with_offsets ? array($piece, $position) : $piece;
            }

            $position += $size;
        }

        return $parts;
    }

    /**
     * Splits a text into lines, keeping each run of line breaks attached to
     * the end of the line it follows.
     *
     * @param string $text
     * @return string[] lines including their trailing line breaks; entries
     *                  may be empty strings
     */
    private static function linebreakSplit($text) {
        $lines = array();
        $line = '';

        foreach (self::mbSplit('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
            $line .= $part;
            // A part that is blank after trimming (a line-break run or an
            // empty piece) closes the current line.
            if (self::mbTrim($part) === '') {
                $lines[] = $line;
                $line = '';
            }
        }
        $lines[] = $line;

        return $lines;
    }

    /**
     * Cuts a line into alternating runs of terminal and non-terminal
     * characters, e.g. "Hi. Bye?!" becomes "Hi", ".", " Bye", "?!".
     *
     * @param string $line
     * @return string[] the runs, in order; concatenating them gives $line
     */
    private function punctuationSplit($line) {
        $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars === false || $chars === array()) {
            // Empty line, or preg_split() failed (e.g. invalid UTF-8): there
            // is nothing to scan, so hand the line back as a single run
            // rather than indexing into a non-array and losing the text.
            return $line === '' ? array() : array($line);
        }

        $parts = array();
        $is_terminal = in_array($chars[0], $this->terminals, true);

        $part = '';
        foreach ($chars as $char) {
            // Start a new run whenever we cross between terminal and
            // non-terminal characters.
            if (in_array($char, $this->terminals, true) !== $is_terminal) {
                $parts[] = $part;
                $part = '';
                $is_terminal = !$is_terminal;
            }
            $part .= $char;
        }

        // Compare against '' explicitly: empty() would also discard a
        // trailing run that is exactly "0".
        if ($part !== '') {
            $parts[] = $part;
        }

        return $parts;
    }

    /**
     * Glues the runs produced by punctuationSplit() back together, closing a
     * fragment after a run that is a single terminal character or that
     * contains a terminal which is not an abbreviator.
     *
     * @param string[] $punctuations runs from punctuationSplit()
     * @return string[] merged fragments
     */
    private function punctuationMerge($punctuations) {
        $definite_terminals = array_diff($this->terminals, $this->abbreviators);

        $merges = array();
        $merge = '';

        foreach ($punctuations as $punctuation) {
            if ($punctuation === '') {
                continue;
            }

            $merge .= $punctuation;
            if (mb_strlen($punctuation) === 1 && in_array($punctuation, $this->terminals, true)) {
                $merges[] = $merge;
                $merge = '';
            } else {
                foreach ($definite_terminals as $terminal) {
                    if (mb_strpos($punctuation, $terminal) !== false) {
                        $merges[] = $merge;
                        $merge = '';
                        break;
                    }
                }
            }
        }

        // '' check instead of empty() so a trailing "0" is not dropped.
        if ($merge !== '') {
            $merges[] = $merge;
        }

        return $merges;
    }

    /**
     * Joins fragments that look like parts of an abbreviation.
     *
     * A new group is started when the previous fragment ended in a terminal
     * that is not an abbreviator, or when the fragment starts with whitespace
     * and it and the previous fragment are not both a single word.
     *
     * @param string[] $fragments fragments from punctuationMerge()
     * @return string[] merged fragments
     */
    private function abbreviationMerge($fragments) {
        $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);

        $abbreviations = array();
        $abbreviation = '';

        $previous_word_count = null;
        $previous_word_ending = null;
        foreach ($fragments as $fragment) {
            $word_count = count(mb_split('\s+', self::mbTrim($fragment)));
            $starts_with_space = mb_ereg_match('^\s+', $fragment);
            $after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals, true);

            if ($after_non_abbreviating_terminal
                || ($previous_word_count !== null
                    && ($previous_word_count !== 1 || $word_count !== 1)
                    && $starts_with_space)) {
                $abbreviations[] = $abbreviation;
                $abbreviation = '';
            }

            $abbreviation .= $fragment;
            $previous_word_count = $word_count;
            $previous_word_ending = mb_substr($fragment, -1);
        }
        if ($abbreviation !== '') {
            $abbreviations[] = $abbreviation;
        }

        return $abbreviations;
    }

    /**
     * Combines the fragments from abbreviationMerge() into sentences.
     *
     * A new sentence is started when the previous fragment ended in a
     * terminal that is not an abbreviator, or when the sentence being built
     * already contains a multi-word fragment and the next fragment is
     * multi-word as well.
     *
     * @param string[] $shorts fragments from abbreviationMerge()
     * @return string[] sentences
     */
    private function sentenceMerge($shorts) {
        $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);

        $sentences = array();

        $sentence = '';
        $has_words = false;
        $previous_word_ending = null;
        foreach ($shorts as $short) {
            $word_count = count(mb_split('\s+', self::mbTrim($short)));
            $after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals, true);

            if ($after_non_abbreviating_terminal || ($has_words && $word_count > 1)) {
                $sentences[] = $sentence;
                $sentence = '';
                $has_words = $word_count > 1;
            } else {
                $has_words = $has_words || $word_count > 1;
            }

            $sentence .= $short;
            $previous_word_ending = mb_substr($short, -1);
        }

        // '' check instead of empty() so a sentence that is exactly "0" is kept.
        if ($sentence !== '') {
            $sentences[] = $sentence;
        }

        return $sentences;
    }

    /**
     * Splits a text into sentences.
     *
     * @param string $text  the text to split
     * @param int    $flags 0, or self::SPLIT_TRIM to trim whitespace from
     *                      both ends of every sentence
     * @return string[] the sentences in order of appearance
     */
    public function split($text, $flags = 0) {
        $sentences = array();

        // Sentences never span a line break: handle each non-blank line on
        // its own and collect the results.
        foreach (self::linebreakSplit($text) as $line) {
            if (self::mbTrim($line) === '') {
                continue;
            }

            $punctuations = $this->punctuationSplit($line);
            $merges = $this->punctuationMerge($punctuations);
            $shorts = $this->abbreviationMerge($merges);
            $sentences = array_merge($sentences, $this->sentenceMerge($shorts));
        }

        if ($flags & self::SPLIT_TRIM) {
            foreach ($sentences as $index => $sentence) {
                $sentences[$index] = self::mbTrim($sentence);
            }
        }

        return $sentences;
    }

    /**
     * Counts the sentences in a text.
     *
     * @param string $text
     * @return int number of sentences found by split()
     */
    public function count($text) {
        return count($this->split($text));
    }
}