<?php

namespace dokuwiki\Utf8;

/**
 * Methods and constants to handle Asian "words"
 *
 * This uses a crude regexp to determine which parts of an Asian string should be treated as words.
 * This is necessary because in some Asian languages a single unicode char represents a whole idea
 * without spaces separating them.
 */
class Asian
{
    /**
     * This defines a non-capturing group for the use in regular expressions to match any asian character that
     * needs to be treated as a word. Uses the Unicode-Ranges for Asian characters taken from
     * http://en.wikipedia.org/wiki/Unicode_block
     *
     * The fragment carries no delimiters or modifiers. Every method of this class wraps it in '/.../u';
     * the \x{....} escapes and the raw multi-byte sequences below are written for that UTF-8 mode.
     */
    public const REGEXP =
        '(?:' .

        '[\x{0E00}-\x{0E7F}]' . // Thai

        '|' .

        '[' .
        '\x{2E80}-\x{3040}' . // CJK -> Hangul
        '\x{309D}-\x{30A0}' .
        '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}' .
        '\x{F900}-\x{FAFF}' . // CJK Compatibility Ideographs
        '\x{FE30}-\x{FE4F}' . // CJK Compatibility Forms
        // the double-quoted strings below are raw 4-byte UTF-8 sequences (PHP expands \xNN to a byte)
        "\xF0\xA0\x80\x80-\xF0\xAA\x9B\x9F" . // CJK Extension B
        "\xF0\xAA\x9C\x80-\xF0\xAB\x9C\xBF" . // CJK Extension C
        "\xF0\xAB\x9D\x80-\xF0\xAB\xA0\x9F" . // CJK Extension D
        "\xF0\xAF\xA0\x80-\xF0\xAF\xAB\xBF" . // CJK Compatibility Supplement
        ']' .

        '|' .

        '[' . // Hiragana/Katakana (can be two characters)
        '\x{3042}\x{3044}\x{3046}\x{3048}' .
        '\x{304A}-\x{3062}\x{3064}-\x{3082}' .
        '\x{3084}\x{3086}\x{3088}-\x{308D}' .
        '\x{308F}-\x{3094}' .
        '\x{30A2}\x{30A4}\x{30A6}\x{30A8}' .
        '\x{30AA}-\x{30C2}\x{30C4}-\x{30E2}' .
        '\x{30E4}\x{30E6}\x{30E8}-\x{30ED}' .
        '\x{30EF}-\x{30F4}\x{30F7}-\x{30FA}' .
        '][' . // a character from the class above, optionally followed by one from the class below
        '\x{3041}\x{3043}\x{3045}\x{3047}\x{3049}' .
        '\x{3063}\x{3083}\x{3085}\x{3087}\x{308E}\x{3095}-\x{309C}' .
        '\x{30A1}\x{30A3}\x{30A5}\x{30A7}\x{30A9}' .
        '\x{30C3}\x{30E3}\x{30E5}\x{30E7}\x{30EE}\x{30F5}\x{30F6}\x{30FB}\x{30FC}' .
        '\x{31F0}-\x{31FF}' .
        ']?' .
        ')';


    /**
     * Check if the given term contains Asian word characters
     *
     * A failing regular expression (preg_match() returning false) is reported as "no match".
     *
     * @param string $term
     * @return bool true if at least one Asian word is found in the term
     */
    public static function isAsianWords($term)
    {
        return preg_match('/' . self::REGEXP . '/u', $term) === 1;
    }

    /**
     * Surround all Asian words in the given text with the given separator
     *
     * The separator is inserted literally before and after every match. It is never interpreted as
     * replacement syntax, so separators starting with a digit or containing "$1" or "\1" are safe.
     *
     * @param string $text Original text containing asian words
     * @param string $sep the separator to use
     * @return string Text with separated asian words; the unchanged input if the regexp fails
     */
    public static function separateAsianWords($text, $sep = ' ')
    {
        // A callback is used instead of a "\1" replacement string: concatenating the separator into a
        // replacement string would let preg_replace() parse it (e.g. $sep = '0' turns "\1" into "\10").
        // Warnings stay suppressed on purpose, this is a best-effort operation (may fail on older PHP version)
        $asia = @preg_replace_callback(
            '/' . self::REGEXP . '/u',
            function ($match) use ($sep) {
                return $sep . $match[0] . $sep;
            },
            $text
        );

        // recover from regexp failure: null signals an error, keep the original text then
        if ($asia === null) {
            return $text;
        }

        return $asia;
    }

    /**
     * Split the given text into separate parts
     *
     * Each part is either a non-asian string, or a run of consecutive asian words. Empty parts are
     * dropped. The result of preg_split() is returned as is, so a failing regular expression yields
     * false instead of an array.
     *
     * @param string $term
     * @return string[]|false
     */
    public static function splitAsianWords($term)
    {
        return preg_split(
            '/(' . self::REGEXP . '+)/u',
            $term,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );
    }
}