xref: /dokuwiki/inc/Utf8/Asian.php (revision d4f83172d9533c4d84f450fe22ef630816b21d75)
1dbc189b2SAndreas Gohr<?php
2dbc189b2SAndreas Gohr
3dbc189b2SAndreas Gohrnamespace dokuwiki\Utf8;
4dbc189b2SAndreas Gohr
/**
 * Methods and constants to handle Asian "words"
 *
 * A crude regular expression decides which parts of an Asian string should be treated as words.
 * This is necessary because in some Asian languages a single Unicode character represents a whole
 * idea, and such characters are written without spaces separating them.
 */
class Asian
{
    /**
     * A non-capturing group for use inside regular expressions. It matches any Asian character
     * (or Hiragana/Katakana pair) that needs to be treated as a word of its own.
     *
     * The Unicode ranges for Asian characters were taken from
     * http://en.wikipedia.org/wiki/Unicode_block
     *
     * The pattern mixes \x{...} escapes with raw UTF-8 byte sequences, so it must always be
     * compiled with the "u" modifier.
     */
    public const REGEXP =
        '(?:' .

        '[\x{0E00}-\x{0E7F}]' . // Thai

        '|' .

        '[' .
        '\x{2E80}-\x{3040}' .  // CJK -> Hangul
        '\x{309D}-\x{30A0}' .
        '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}' .
        '\x{F900}-\x{FAFF}' .  // CJK Compatibility Ideographs
        '\x{FE30}-\x{FE4F}' .  // CJK Compatibility Forms
        "\xF0\xA0\x80\x80-\xF0\xAA\x9B\x9F" . // CJK Extension B
        "\xF0\xAA\x9C\x80-\xF0\xAB\x9C\xBF" . // CJK Extension C
        "\xF0\xAB\x9D\x80-\xF0\xAB\xA0\x9F" . // CJK Extension D
        "\xF0\xAF\xA0\x80-\xF0\xAF\xAB\xBF" . // CJK Compatibility Supplement
        ']' .

        '|' .

        // Hiragana/Katakana (can be two characters):
        // the first class is the leading character ...
        '[' .
        '\x{3042}\x{3044}\x{3046}\x{3048}' .
        '\x{304A}-\x{3062}\x{3064}-\x{3082}' .
        '\x{3084}\x{3086}\x{3088}-\x{308D}' .
        '\x{308F}-\x{3094}' .
        '\x{30A2}\x{30A4}\x{30A6}\x{30A8}' .
        '\x{30AA}-\x{30C2}\x{30C4}-\x{30E2}' .
        '\x{30E4}\x{30E6}\x{30E8}-\x{30ED}' .
        '\x{30EF}-\x{30F4}\x{30F7}-\x{30FA}' .
        // ... and the second class is an optional trailing character that stays attached to it
        '][' .
        '\x{3041}\x{3043}\x{3045}\x{3047}\x{3049}' .
        '\x{3063}\x{3083}\x{3085}\x{3087}\x{308E}\x{3095}-\x{309C}' .
        '\x{30A1}\x{30A3}\x{30A5}\x{30A7}\x{30A9}' .
        '\x{30C3}\x{30E3}\x{30E5}\x{30E7}\x{30EE}\x{30F5}\x{30F6}\x{30FB}\x{30FC}' .
        '\x{31F0}-\x{31FF}' .
        ']?' .
        ')';


    /**
     * Check if the given term contains Asian word characters
     *
     * A term the regular expression engine cannot process (for example malformed UTF-8)
     * is reported as not containing Asian words.
     *
     * @param string $term
     * @return bool true if at least one Asian word was found
     */
    public static function isAsianWords($term)
    {
        // preg_match() yields 1 on a match, 0 on no match and false on error;
        // the cast folds the latter two into false
        return (bool)preg_match('/' . self::REGEXP . '/u', $term);
    }

    /**
     * Surround all Asian words in the given text with the given separator
     *
     * The separator is inserted literally: sequences such as "$0" or "\1" inside it are
     * not interpreted as back-references.
     *
     * @param string $text Original text containing asian words
     * @param string $sep the separator to use
     * @return string Text with separated asian words; the unchanged input if the regexp failed
     */
    public static function separateAsianWords($text, $sep = ' ')
    {
        // A callback is used instead of a replacement string so that the separator can never
        // be mistaken for replacement syntax.
        $separated = preg_replace_callback(
            '/' . self::REGEXP . '/u',
            static function (array $match) use ($sep) {
                return $sep . $match[0] . $sep;
            },
            $text
        );

        // recover from regexp failure (e.g. malformed UTF-8): hand back the text untouched
        if ($separated === null) {
            return $text;
        }

        return $separated;
    }

    /**
     * Split the given text into separate parts
     *
     * Each part is either a non-asian string, or a run of consecutive asian words.
     * Empty parts are dropped. If the regular expression engine cannot process the term
     * (for example malformed UTF-8), the term is returned as one single part.
     *
     * @param string $term
     * @return string[]
     */
    public static function splitAsianWords($term)
    {
        $parts = preg_split(
            '/(' . self::REGEXP . '+)/u',
            $term,
            -1,
            PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY
        );

        // preg_split() returns false on failure; never leak that to callers expecting an array
        if ($parts === false) {
            return [$term];
        }

        return $parts;
    }
}
99