<?php

namespace Vanderlee\Sentence;

/**
 * Multibyte-safe utility functions.
 *
 * All positions and lengths handled internally by split() are BYTE offsets
 * (they originate from strlen() and mb_ereg_search_pos()) and are cut with
 * mb_strcut(), which also works on byte offsets.
 */
class Multibyte
{
    //https://stackoverflow.com/questions/20025030/convert-all-types-of-smart-quotes-with-php
    /**
     * Map of UTF-8 encoded quotation characters to their plain ASCII
     * single (') or double (") quote replacement.
     *
     * @var array
     */
    private static $unicodeCharacterMap = [
        // Windows codepage 1252
        "\xC2\x82" => "'", // U+0082⇒U+201A single low-9 quotation mark
        "\xC2\x84" => '"', // U+0084⇒U+201E double low-9 quotation mark
        "\xC2\x8B" => "'", // U+008B⇒U+2039 single left-pointing angle quotation mark
        "\xC2\x91" => "'", // U+0091⇒U+2018 left single quotation mark
        "\xC2\x92" => "'", // U+0092⇒U+2019 right single quotation mark
        "\xC2\x93" => '"', // U+0093⇒U+201C left double quotation mark
        "\xC2\x94" => '"', // U+0094⇒U+201D right double quotation mark
        "\xC2\x9B" => "'", // U+009B⇒U+203A single right-pointing angle quotation mark

        // Regular Unicode     // U+0022 quotation mark (")
        //                     // U+0027 apostrophe     (')
        "\xC2\xAB" => '"', // U+00AB left-pointing double angle quotation mark
        "\xC2\xBB" => '"', // U+00BB right-pointing double angle quotation mark
        "\xE2\x80\x98" => "'", // U+2018 left single quotation mark
        "\xE2\x80\x99" => "'", // U+2019 right single quotation mark
        "\xE2\x80\x9A" => "'", // U+201A single low-9 quotation mark
        "\xE2\x80\x9B" => "'", // U+201B single high-reversed-9 quotation mark
        "\xE2\x80\x9C" => '"', // U+201C left double quotation mark
        "\xE2\x80\x9D" => '"', // U+201D right double quotation mark
        "\xE2\x80\x9E" => '"', // U+201E double low-9 quotation mark
        "\xE2\x80\x9F" => '"', // U+201F double high-reversed-9 quotation mark
        "\xE2\x80\xB9" => "'", // U+2039 single left-pointing angle quotation mark
        "\xE2\x80\xBA" => "'", // U+203A single right-pointing angle quotation mark
    ];

    /**
     * Decode HTML entities in the string (as UTF-8, including both quote
     * styles) and then replace every quotation character listed in
     * self::$unicodeCharacterMap by its plain ASCII quote.
     *
     * @param string $string
     * @return string
     */
    public static function cleanUnicode($string)
    {
        // The key/value arrays are rebuilt on every call; they could be
        // pre-calculated if this ever shows up in a profile.
        $character = array_keys(self::$unicodeCharacterMap);
        $replace = array_values(self::$unicodeCharacterMap);

        return str_replace($character, $replace, html_entity_decode($string, ENT_QUOTES, "UTF-8"));
    }

    /**
     * Multibyte-safe version of the standard trim() function.
     *
     * Uses the mbstring regex engine to strip leading and trailing
     * whitespace (as matched by \s) and keep everything in between.
     *
     * @param string $string
     * @return string
     */
    public static function trim($string)
    {
        return mb_ereg_replace('^\s*([\s\S]*?)\s*$', '\1', $string);
    }

    /**
     * A cross between mb_split and preg_split, adding the preg_split flags
     * to mb_split.
     *
     * Supported flags are PREG_SPLIT_NO_EMPTY, PREG_SPLIT_DELIM_CAPTURE and
     * PREG_SPLIT_OFFSET_CAPTURE. With PREG_SPLIT_DELIM_CAPTURE the whole
     * matched delimiter is returned whenever its first capture group
     * matched a non-empty string. With PREG_SPLIT_OFFSET_CAPTURE every part
     * is returned as [substring, byte offset].
     *
     * @param string $pattern mbstring (mb_ereg) pattern, without delimiters
     * @param string $string
     * @param int $limit maximum number of non-delimiter parts; values <= 0 mean no limit
     * @param int $flags bitmask of PREG_SPLIT_* constants
     * @return array
     */
    public static function split($pattern, $string, $limit = -1, $flags = 0)
    {
        $offset_capture = (bool)($flags & PREG_SPLIT_OFFSET_CAPTURE);

        $lengths = self::getSplitLengths($pattern, $string);

        // Substrings
        $parts = [];
        $position = 0; // byte offset of the current segment within $string
        $count = 1;    // number of the part currently being produced (1-based)
        foreach ($lengths as $length) {
            if (self::isLastPart($length, $flags, $limit, $count)) {
                // Limit reached: the rest of the string becomes the final part.
                $parts[] = self::makePart($string, $position, null, $offset_capture);
                return $parts;
            }

            if (self::isPart($length, $flags)) {
                $parts[] = self::makePart($string, $position, $length[0], $offset_capture);
            }

            $position += $length[0];
        }

        return $parts;
    }

    /**
     * Tells whether the limit has been reached, i.e. the remainder of the
     * string must be emitted as the final part.
     *
     * $count is incremented (by reference) only for segments that are not
     * delimiters and that would be emitted under the NO_EMPTY rule, and
     * only when a positive limit is set; the short-circuit evaluation
     * order below is what guarantees this.
     *
     * @param array $length segment descriptor as built by getSplitLengths()
     * @param int $flags
     * @param int $limit
     * @param int $count running part counter, updated in place
     * @return bool
     */
    private static function isLastPart($length, $flags, $limit, &$count)
    {
        $split_empty = !($flags & PREG_SPLIT_NO_EMPTY) || $length[0];
        $is_delimiter = $length[1];

        return $limit > 0
            && !$is_delimiter
            && $split_empty
            && ++$count > $limit;
    }

    /**
     * Tells whether a segment must be added to the result.
     *
     * Regular segments are always candidates; delimiters only when
     * PREG_SPLIT_DELIM_CAPTURE is set and the segment is flagged as
     * captured. Zero-length segments are dropped when PREG_SPLIT_NO_EMPTY
     * is set.
     *
     * @param array $length segment descriptor as built by getSplitLengths()
     * @param int $flags
     * @return bool
     */
    private static function isPart($length, $flags)
    {
        $split_empty = !($flags & PREG_SPLIT_NO_EMPTY) || $length[0];
        $is_delimiter = $length[1];
        $is_captured = ($flags & PREG_SPLIT_DELIM_CAPTURE) && $length[2];

        return (!$is_delimiter
                || $is_captured)
            && $split_empty;
    }

    /**
     * Cut one part out of the string.
     *
     * @param string $string
     * @param integer $position byte offset to start at
     * @param integer|null $length length in bytes, or null for "up to the end of the string"
     * @param bool $offset_capture when true, return [substring, byte offset] instead of the substring
     * @return array|string
     */
    private static function makePart($string, $position, $length = null, $offset_capture = false)
    {
        $cut = mb_strcut($string, $position, $length);

        return $offset_capture
            ? [$cut, $position]
            : $cut;
    }

    /**
     * Splits the string by pattern and for each element (part or delimiter),
     * in order of appearance, returns:
     * [ 0 => length in bytes, 1 => is_delimiter?, 2 => captured? ]
     *
     * "captured?" is only meaningful for delimiters (it is null for regular
     * parts) and is true when the first capture group of the pattern
     * matched a non-empty string.
     *
     * NOTE(review): a pattern that can match the empty string may keep this
     * loop from advancing - confirm against the mbstring regex behaviour
     * before passing such patterns.
     *
     * @param string $pattern
     * @param string $string
     * @return array
     */
    private static function getSplitLengths($pattern, $string)
    {
        $strlen = strlen($string); // bytes!
        $lengths = [];

        mb_ereg_search_init($string);

        $position = 0;
        while ($position < $strlen
            && ($array = mb_ereg_search_pos($pattern, '')) !== false) {
            // capture split
            $lengths[] = [$array[0] - $position, false, null];

            // move position
            $position = $array[0] + $array[1];

            // capture delimiter
            // Compare explicitly instead of relying on truthiness, so a
            // captured delimiter of "0" is not mistaken for "not captured".
            $regs = mb_ereg_search_getregs();
            $is_captured = isset($regs[1]) && $regs[1] !== false && $regs[1] !== '';
            $lengths[] = [$array[1], true, $is_captured];
        }

        // Add last bit, if not ending with split
        $lengths[] = [$strlen - $position, false, null];

        return $lengths;
    }
}