File classes/Sentence.php

  1: <?php
  2: 
  3: /**
  4:  * Segments sentences.
  5:  * Clipping may not be perfect.
  6:  * Sentence count should be VERY close to the truth.
  7:  * 
  8:  * Multibyte safe (at least for UTF-8), but rules based on Germanic
  9:  * language structure (English, Dutch, German). Should work for most
 10:  * latin-alphabet languages.
 11:  */
 12: class Sentence {
 13:     /**
 14:      * Specify this flag with the split method to trim whitespace.
 15:      */
 16:     const SPLIT_TRIM        = 0x1;
 17: 
 18:     /**
 19:      * List of characters used to terminate sentences.
 20:      * @var array
 21:      */
 22:     private $terminals      = array('.', '!', '?');
 23: 
 24:     /**
 25:      * List of characters used for abbreviations.
 26:      * @var array
 27:      */
 28:     private $abbreviators   = array('.');
 29: 
 30:     /**
 31:      * Multibyte safe version of standard trim() function.
 32:      * @param string $string
 33:      * @return string
 34:      */
 35:     private static function mbTrim($string) {
 36:         return mb_ereg_replace('^\s*([\s\S]*?)\s*$', '\1', $string);
 37:     }
 38: 
 39:     /**
 40:      * A cross between mb_split and preg_split, adding the preg_split flags
 41:      * to mb_split.
 42:      * @param string $pattern
 43:      * @param string $string
 44:      * @param int $limit
 45:      * @param int $flags
 46:      * @return array
 47:      */
 48:     private static function mbSplit($pattern, $string, $limit = -1, $flags = 0) {       
 49:         $strlen = strlen($string);      // bytes!   
 50:         mb_ereg_search_init($string);
 51:         
 52:         $lengths = array();
 53:         $position = 0;
 54:         while (($array = mb_ereg_search_pos($pattern, '')) !== false) {
 55:             // capture split
 56:             $lengths[] = array($array[0] - $position, false, null);
 57:                     
 58:             // move position
 59:             $position = $array[0] + $array[1];
 60:         
 61:             // capture delimiter
 62:             $regs = mb_ereg_search_getregs();           
 63:             $lengths[] = array($array[1], true, isset($regs[1]) && $regs[1]);
 64:             
 65:             // Continue on?
 66:             if ($position >= $strlen) {
 67:                 break;
 68:             }           
 69:         }
 70: 
 71:         // Add last bit, if not ending with split
 72:         $lengths[] = array($strlen - $position, false, null);
 73: 
 74:         // Substrings
 75:         $parts = array();
 76:         $position = 0;      
 77:         $count = 1;
 78:         foreach ($lengths as $length) {
 79:             $is_delimiter   = $length[1];
 80:             $is_captured    = $length[2];
 81:             
 82:             if ($limit > 0 && !$is_delimiter && ($length[0] || ~$flags & PREG_SPLIT_NO_EMPTY) && ++$count > $limit) {
 83:                 if ($length[0] > 0 || ~$flags & PREG_SPLIT_NO_EMPTY) {          
 84:                     $parts[]    = $flags & PREG_SPLIT_OFFSET_CAPTURE
 85:                                 ? array(mb_strcut($string, $position), $position)
 86:                                 : mb_strcut($string, $position);                
 87:                 }
 88:                 break;
 89:             } elseif ((!$is_delimiter || ($flags & PREG_SPLIT_DELIM_CAPTURE && $is_captured))
 90:                    && ($length[0] || ~$flags & PREG_SPLIT_NO_EMPTY)) {
 91:                 $parts[]    = $flags & PREG_SPLIT_OFFSET_CAPTURE
 92:                             ? array(mb_strcut($string, $position, $length[0]), $position)
 93:                             : mb_strcut($string, $position, $length[0]);
 94:             }
 95:             
 96:             $position += $length[0];
 97:         }
 98:         
 99:         return $parts;
100:     }   
101:     
102:     /**
103:      * Breaks a piece of text into lines by linebreak.
104:      * Eats up any linebreak characters as if one.
105:      * 
106:      * Multibyte safe
107:      * 
108:      * @param string $text
109:      * @return array
110:      */
111:     private static function linebreakSplit($text) {
112:         $lines = array();
113:         $line = '';
114:         
115:         foreach (self::mbSplit('([\r\n]+)', $text, -1, PREG_SPLIT_DELIM_CAPTURE) as $part) {
116:             $line .= $part;
117:             if (self::mbTrim($part) === '') {
118:                 $lines[] = $line;
119:                 $line = '';
120:             }
121:         }
122:         $lines[] = $line;
123:         
124:         return $lines;
125:     }
126: 
127:     /**
128:      * Splits an array of lines by (consecutive sequences of)
129:      * terminals, keeping terminals.
130:      * 
131:      * Multibyte safe (atleast for UTF-8)
132:      * 
133:      * For example:
134:      *  "There ... is. More!"
135:      *      ... becomes ...
136:      *  [ "There ", "...", " is", ".", " More", "!" ]
137:      * 
138:      * @param array $lines
139:      * @return array
140:      */
141:     private function punctuationSplit($line) {                                      
142:         $parts = array();
143: 
144:         $chars = preg_split('//u', $line, -1, PREG_SPLIT_NO_EMPTY); // This is UTF8 multibyte safe!
145:         $is_terminal = in_array($chars[0], $this->terminals);
146:         
147:         $part = '';
148:         foreach ($chars as $index => $char) {
149:             if (in_array($char, $this->terminals) !== $is_terminal) {
150:                 $parts[] = $part;
151:                 $part = '';
152:                 $is_terminal = !$is_terminal;
153:             }
154:             $part .= $char;                         
155:         }
156:         
157:         if (!empty($part)) {
158:             $parts[] = $part;                           
159:         }
160: 
161:         return $parts;
162:     }
163: 
164:     /**
165:      * Appends each terminal item after it's preceding
166:      * non-terminals.
167:      * 
168:      * Multibyte safe (atleast for UTF-8)
169:      * 
170:      * For example:
171:      *  [ "There ", "...", " is", ".", " More", "!" ]
172:      *      ... becomes ...
173:      *  [ "There ... is.", "More!" ]
174:      * 
175:      * @param array $punctuations
176:      * @return array
177:      */
178:     private function punctuationMerge($punctuations) {      
179:         $definite_terminals = array_diff($this->terminals, $this->abbreviators);
180:         
181:         $merges = array();
182:         $merge = '';
183: 
184:         foreach ($punctuations as $punctuation) {
185:             if ($punctuation !== '') {
186:                 $merge.= $punctuation;
187:                 if (mb_strlen($punctuation) === 1 && in_array($punctuation, $this->terminals)) {
188:                     $merges[] = $merge;
189:                     $merge = '';
190:                 } else {
191:                     foreach ($definite_terminals as $terminal) {
192:                         if (mb_strpos($punctuation, $terminal) !== false) {
193:                             $merges[] = $merge;
194:                             $merge = '';
195:                             break;
196:                         }
197:                     }
198:                 }
199:             }           
200:         }
201:         if (!empty($merge)) {
202:             $merges[] = $merge;
203:         }
204: 
205:         return $merges;
206:     }
207: 
208:     /**
209:      * Merges any one-word items with it's preceding items.
210:      * 
211:      * Multibyte safe
212:      * 
213:      * For example:
214:      *  [ "There ... is.", "More!" ]
215:      *      ... becomes ...
216:      *  [ "There ... is. More!" ]
217:      * 
218:      * @param array $fragments
219:      * @return array
220:      */
221:     private function abbreviationMerge($fragments) {
222:         $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
223:         
224:         $abbreviations = array();
225:         
226:         $abbreviation = '';
227:         
228:         $previous_word_count = null;
229:         $previous_word_ending = null;       
230:         foreach ($fragments as $fragment) {
231:             $word_count = count(mb_split('\s+', self::mbTrim($fragment)));
232:             $starts_with_space = mb_ereg_match('^\s+', $fragment);          
233:             $after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
234:             
235:             if ($after_non_abbreviating_terminal || ($previous_word_count !== null && ($previous_word_count !== 1 || $word_count !== 1) && $starts_with_space)) {
236:                 $abbreviations[] = $abbreviation;
237:                 $abbreviation = '';
238:             }
239: 
240:             $abbreviation           .= $fragment;                   
241:             $previous_word_count    = $word_count;                          
242:             $previous_word_ending   = mb_substr($fragment, -1);         
243:         }
244:         if ($abbreviation !== '') {
245:             $abbreviations[] = $abbreviation;
246:         }
247: 
248:         return $abbreviations;
249:     }
250: 
251:     /**
252:      * Merges items into larger sentences.
253:      * 
254:      * Multibyte safe
255:      * 
256:      * @param array $shorts
257:      * @return array
258:      */
259:     private function sentenceMerge($shorts) {
260:         $non_abbreviating_terminals = array_diff($this->terminals, $this->abbreviators);
261: 
262:         $sentences = array();
263: 
264:         $sentence = '';                 
265:         $has_words = false;
266:         $previous_word_ending = null;
267:         foreach ($shorts as $short) {
268:             $word_count = count(mb_split('\s+', self::mbTrim($short)));         
269:             $after_non_abbreviating_terminal = in_array($previous_word_ending, $non_abbreviating_terminals);
270:             
271:             if ($after_non_abbreviating_terminal || ($has_words && $word_count > 1)) {
272:                 $sentences[] = $sentence;
273:                 $sentence = '';                     
274:                 $has_words = $word_count > 1;
275:             } else {
276:                 $has_words = $has_words || $word_count > 1;                     
277:             }
278:             
279:             $sentence.= $short;         
280:             $previous_word_ending = mb_substr($short, -1);                  
281:         }
282:         if (!empty($sentence)) {
283:             $sentences[] = $sentence;
284:         }           
285: 
286:         return $sentences;
287:     }
288: 
289:     /**
290:      * Return the sentences sentences detected in the provided text.
291:      * Set the Sentence::SPLIT_TRIM flag to trim whitespace.
292:      * @param string $text
293:      * @param integer $flags
294:      * @return array
295:      */
296:     public function split($text, $flags = 0) {      
297:         $sentences = array();
298: 
299:         // Split
300:         foreach (self::linebreakSplit($text) as $line) {                
301:             if (self::mbTrim($line) !== '') {
302:                 $punctuations   = $this->punctuationSplit($line);
303:                 $merges         = $this->punctuationMerge($punctuations);
304:                 $shorts         = $this->abbreviationMerge($merges);
305:                 $sentences      = array_merge($sentences, $this->sentenceMerge($shorts));
306:             }
307:         }
308:         
309:         // Post process
310:         if ($flags & self::SPLIT_TRIM) {
311:             foreach ($sentences as &$sentence) {
312:                 $sentence = self::mbTrim($sentence);
313:             }
314:             unset($sentence);
315:         }
316: 
317:         return $sentences;
318:     }
319: 
320:     /**
321:      * Return the number of sentences detected in the provided text.
322:      * @param string $text
323:      * @return integer
324:      */
325:     public function count($text) {
326:         return count($this->split($text));
327:     }
328: }
Classes

Functions