<?php

namespace dokuwiki\Utf8;

/**
 * Methods and constants to handle Asian "words"
 *
 * This uses a crude regexp to determine which parts of an Asian string should be treated as words.
 * This is necessary because in some Asian languages a single unicode char represents a whole idea
 * without spaces separating them.
 */
class Asian
{

    /**
     * This defines a non-capturing group for the use in regular expressions to match any asian character that
     * needs to be treated as a word. Uses the Unicode-Ranges for Asian characters taken from
     * http://en.wikipedia.org/wiki/Unicode_block
     *
     * The pattern has no delimiters and no modifiers; every use in this class wraps it in '/.../u'.
     * The double-quoted parts are raw UTF-8 byte sequences of characters outside the BMP.
     */
    const REGEXP =
        '(?:' .

        '[\x{0E00}-\x{0E7F}]' . // Thai

        '|' .

        '[' .
        '\x{2E80}-\x{3040}' .  // CJK -> Hangul
        '\x{309D}-\x{30A0}' .
        '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}' .
        '\x{F900}-\x{FAFF}' .  // CJK Compatibility Ideographs
        '\x{FE30}-\x{FE4F}' .  // CJK Compatibility Forms
        "\xF0\xA0\x80\x80-\xF0\xAA\x9B\x9F" . // CJK Extension B
        "\xF0\xAA\x9C\x80-\xF0\xAB\x9C\xBF" . // CJK Extension C
        "\xF0\xAB\x9D\x80-\xF0\xAB\xA0\x9F" . // CJK Extension D
        "\xF0\xAF\xA0\x80-\xF0\xAF\xAB\xBF" . // CJK Compatibility Supplement
        ']' .

        '|' .

        '[' .                // Hiragana/Katakana (can be two characters)
        '\x{3042}\x{3044}\x{3046}\x{3048}' .
        '\x{304A}-\x{3062}\x{3064}-\x{3082}' .
        '\x{3084}\x{3086}\x{3088}-\x{308D}' .
        '\x{308F}-\x{3094}' .
        '\x{30A2}\x{30A4}\x{30A6}\x{30A8}' .
        '\x{30AA}-\x{30C2}\x{30C4}-\x{30E2}' .
        '\x{30E4}\x{30E6}\x{30E8}-\x{30ED}' .
        '\x{30EF}-\x{30F4}\x{30F7}-\x{30FA}' .
        '][' .
        '\x{3041}\x{3043}\x{3045}\x{3047}\x{3049}' .
        '\x{3063}\x{3083}\x{3085}\x{3087}\x{308E}\x{3095}-\x{309C}' .
        '\x{30A1}\x{30A3}\x{30A5}\x{30A7}\x{30A9}' .
        '\x{30C3}\x{30E3}\x{30E5}\x{30E7}\x{30EE}\x{30F5}\x{30F6}\x{30FB}\x{30FC}' .
        '\x{31F0}-\x{31FF}' .
        ']?' .
        ')';


    /**
     * Check if the given term contains Asian word characters
     *
     * A failing match (preg_match() returning false) is reported as "no Asian words".
     *
     * @param string $term
     * @return bool
     */
    public static function isAsianWords($term)
    {
        return (bool)preg_match('/' . self::REGEXP . '/u', $term);
    }

    /**
     * Surround all Asian words in the given text with the given separator
     *
     * The separator is inserted literally: characters that have a meaning in a preg replacement
     * string (such as "$1", "\0" or a leading digit) are not interpreted.
     *
     * @param string $text Original text containing asian words
     * @param string $sep the separator to use
     * @return string Text with separated asian words; the unmodified text if the regexp fails
     */
    public static function separateAsianWords($text, $sep = ' ')
    {
        // A callback is used instead of a replacement string: building "$sep\1$sep" would let the
        // separator be parsed as replacement syntax (e.g. a separator of "1" turns "\1" into "\11").
        // Errors stay suppressed as before (may fail on older PHP version).
        $asia = @preg_replace_callback(
            '/(' . self::REGEXP . ')/u',
            function ($match) use ($sep) {
                return $sep . $match[1] . $sep;
            },
            $text
        );

        // recover from regexp failure by handing back the input untouched
        if ($asia !== null) {
            $text = $asia;
        }

        return $text;
    }

    /**
     * Split the given text into separate parts
     *
     * Each part is either a non-asian string, or a run of one or more consecutive asian words.
     * Empty parts are dropped. If the regexp fails, the whole term is returned as a single part.
     *
     * @param string $term
     * @return string[]
     */
    public static function splitAsianWords($term)
    {
        $parts = preg_split(
            '/(' . self::REGEXP . '+)/u',
            $term,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );

        if ($parts === false) {
            // keep the documented string[] contract even when the input cannot be processed
            return ((string)$term === '') ? [] : [$term];
        }

        return $parts;
    }
}