<?php

namespace dokuwiki\Utf8;

/**
 * Methods and constants to handle Asian "words"
 *
 * This uses a crude regexp to determine which parts of an Asian string should be treated as words.
 * This is necessary because in some Asian languages a single unicode char represents a whole idea
 * without spaces separating them.
 */
class Asian
{
    /**
     * This defines a non-capturing group for the use in regular expressions to match any asian character that
     * needs to be treated as a word. Uses the Unicode-Ranges for Asian characters taken from
     * http://en.wikipedia.org/wiki/Unicode_block
     *
     * The pattern has no delimiters and must be used with the "u" (UTF-8) modifier: it mixes PCRE
     * \x{...} escapes with raw UTF-8 byte sequences for the code points above the BMP.
     */
    public const REGEXP =
        '(?:' .

        '[\x{0E00}-\x{0E7F}]' . // Thai

        '|' .

        '[' .
        '\x{2E80}-\x{3040}' .  // CJK -> Hangul
        '\x{309D}-\x{30A0}' .
        '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}' .
        '\x{F900}-\x{FAFF}' .  // CJK Compatibility Ideographs
        '\x{FE30}-\x{FE4F}' .  // CJK Compatibility Forms
        "\xF0\xA0\x80\x80-\xF0\xAA\x9B\x9F" . // CJK Extension B
        "\xF0\xAA\x9C\x80-\xF0\xAB\x9C\xBF" . // CJK Extension C
        "\xF0\xAB\x9D\x80-\xF0\xAB\xA0\x9F" . // CJK Extension D
        "\xF0\xAF\xA0\x80-\xF0\xAF\xAB\xBF" . // CJK Compatibility Supplement
        ']' .

        '|' .

        '[' .                // Hiragana/Katakana (can be two characters)
        '\x{3042}\x{3044}\x{3046}\x{3048}' .
        '\x{304A}-\x{3062}\x{3064}-\x{3082}' .
        '\x{3084}\x{3086}\x{3088}-\x{308D}' .
        '\x{308F}-\x{3094}' .
        '\x{30A2}\x{30A4}\x{30A6}\x{30A8}' .
        '\x{30AA}-\x{30C2}\x{30C4}-\x{30E2}' .
        '\x{30E4}\x{30E6}\x{30E8}-\x{30ED}' .
        '\x{30EF}-\x{30F4}\x{30F7}-\x{30FA}' .
        '][' .               // optional second character following a character of the class above
        '\x{3041}\x{3043}\x{3045}\x{3047}\x{3049}' .
        '\x{3063}\x{3083}\x{3085}\x{3087}\x{308E}\x{3095}-\x{309C}' .
        '\x{30A1}\x{30A3}\x{30A5}\x{30A7}\x{30A9}' .
        '\x{30C3}\x{30E3}\x{30E5}\x{30E7}\x{30EE}\x{30F5}\x{30F6}\x{30FB}\x{30FC}' .
        '\x{31F0}-\x{31FF}' .
        ']?' .
        ')';


    /**
     * Check if the given term contains Asian word characters
     *
     * A PCRE failure (preg_match() returning false) is reported as "no Asian words".
     *
     * @param string $term
     * @return bool
     */
    public static function isAsianWords($term)
    {
        return (bool)preg_match('/' . self::REGEXP . '/u', $term);
    }

    /**
     * Surround all Asian words in the given text with the given separator
     *
     * The separator is inserted literally on both sides of every match. It is never interpreted
     * as a replacement pattern, so separators that start with a digit or contain "$1"/"\1" style
     * sequences are safe.
     *
     * @param string $text Original text containing asian words
     * @param string $sep the separator to use
     * @return string Text with separated asian words; the unchanged input if the regexp fails
     */
    public static function separateAsianWords($text, $sep = ' ')
    {
        // handle asian chars as single words (may fail on older PHP version, hence the suppression).
        // A callback is used instead of a '\1' replacement string: with a plain replacement a
        // separator like '1' would turn '\1' . '1' into the (empty) backreference \11.
        $asia = @preg_replace_callback(
            '/' . self::REGEXP . '/u',
            static function ($match) use ($sep) {
                return $sep . $match[0] . $sep;
            },
            $text
        );

        // recover from regexp failure by keeping the original text
        if ($asia !== null) {
            $text = $asia;
        }

        return $text;
    }

    /**
     * Split the given text into separate parts
     *
     * Each part is either a non-asian string, or a run of one or more consecutive asian words.
     * Empty parts are omitted. If the regexp fails (e.g. the term is not valid UTF-8) the term
     * is returned unsplit as the only part, so callers always receive an array.
     *
     * @param string $term
     * @return string[]
     */
    public static function splitAsianWords($term)
    {
        $parts = preg_split(
            '/(' . self::REGEXP . '+)/u',
            $term,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );

        if ($parts === false) {
            // preg_split() signals failure with false; fall back to the unsplit term
            // while still honouring the "no empty parts" contract
            $term = (string)$term;
            return $term === '' ? [] : [$term];
        }

        return $parts;
    }
}