1<?php
2/**
3 * Re-work of the phpHyphenation-library from yellowgreen designbüro
4 * Original: JavaScript Hyphenator 10 (Beta) by Matthias Nater
5 *
6 * @author Markus Birth <markus@birth-online.de>
7 * @license Creative Commons Attribution-Share Alike 2.5 Switzerland
8 * @link http://yellowgreen.de/hyphenation-in-web/
9 */
class phpHyphenation {
    /** Directory that holds the pattern files (one "<language>.php" file per language). */
    static protected $pathToPatterns = 'patterns/';
    /** Name of the language whose patterns were loaded last. */
    protected $language = 'en';
    /** Map "pattern without digits" => "pattern with digits", built by convertPatterns(). */
    protected $patterns = array();
    /** Map "lowercased word" => "lowercased word with $hyphen at the user-defined break points". */
    protected $dictWords = array();
    /** String inserted at every hyphenation point. */
    protected $hyphen = '&shy;';
    /** Minimum number of letters kept before the first hyphenation point. */
    protected $leftMin = 2;
    /** Minimum number of letters kept after the last hyphenation point. */
    protected $rightMin = 2;
    /** Minimum word length to hyphenate; also the shortest pattern window that is looked up. */
    protected $charMin = 2;
    /** Longest pattern window that is looked up. */
    protected $charMax = 10;
    /** Tags whose complete content is copied through without hyphenation. */
    protected $ignoreTags = array('code', 'pre', 'script', 'style');

    /**
     * Sets the directory which contains the patterns
     * @param string $path Path to the directory containing the patterns
     * @return bool TRUE on success, FALSE if the specified $path does not exist
     */
    public static function setPatternPath($path) {
        if (!is_dir($path)) return false;
        self::$pathToPatterns = $path;
        return true;
    }

    /**
     * Sets the tags to ignore (default: code, pre, script, style)
     * @param array $tags Array containing tags to ignore (a single tag name as string is accepted, too)
     * @param bool $append Set to true to append the specified $tags to the ignore-list (default: false)
     */
    public function setIgnoreTags($tags, $append=false) {
        $current = $append ? $this->ignoreTags : array();
        $this->ignoreTags = array_merge($current, (array) $tags);
    }

    /**
     * Returns the current ignore-list for tags
     * @return array Array containing tags to ignore
     */
    public function getIgnoreTags() {
        return $this->ignoreTags;
    }

    /**
     * Sets the hyphen to use. Defaults to soft-hyphen entity.
     * @param string $hyphen The hyphen to use (default: <code>&shy;</code>)
     * @return bool TRUE on success, FALSE on error (empty hyphen).
     */
    public function setHyphen($hyphen='&shy;') {
        if ((string) $hyphen === '') return false;
        // The user dictionary stores words with the hyphen already inserted,
        // so every entry has to be rewritten to use the new hyphen.
        foreach ($this->dictWords as $key=>$value) {
            $this->dictWords[$key] = str_replace($this->hyphen, $hyphen, $value);
        }
        $this->hyphen = $hyphen;
        return true;
    }

    /**
     * Sets the hyphenation constraints.
     * @param int $leftMin Minimum letters to leave on the left side of a word (default: 2)
     * @param int $rightMin Minimum letters to leave on the right side of a word (default: 2)
     * @param int $charMin Minimum letters a word must have to be hyphenated (default: 2)
     * @param int $charMax Maximum letters to search for a hyphenation possibility (default: 10)
     */
    public function setConstraints($leftMin=2, $rightMin=2, $charMin=2, $charMax=10) {
        $this->leftMin  = $leftMin;
        $this->rightMin = $rightMin;
        $this->charMin  = $charMin;
        $this->charMax  = $charMax;
    }

    /**
     * Creates a new phpHyphenation-object. You might have to use phpHyphenation::setPatternPath() for it to find the patterns before you can instantiate the class.
     *
     * A constructor cannot report failure through a return value. If the language
     * file cannot be loaded the object is still created, just without patterns;
     * call loadLanguage() yourself and check its result if you need to know.
     *
     * Note: this sets the global mbstring internal encoding to UTF-8.
     *
     * @param string $language Language patterns to use. A file with this name has to exist in self::$pathToPatterns. (default: en)
     * @param string $hyphen Hyphen to use (default: <code>&shy;</code>)
     */
    public function __construct($language='en', $hyphen='&shy;') {
        mb_internal_encoding('utf-8');
        $this->hyphen = $hyphen;
        $this->loadLanguage($language);
    }

    /**
     * Sets a new language for hyphenation.
     *
     * The pattern file is a PHP file that is include()d and has to define a
     * string variable $patterns. NOTE(review): $language becomes part of an
     * include path - never pass unvalidated user input here.
     *
     * @param string $language Language patterns to use. A file with this name has to exist in $path.
     * @param string $path The path to the patterns. Defaults to self::$pathToPatterns.
     * @return bool TRUE on success, FALSE on error (file missing or it does not define $patterns).
     */
    public function loadLanguage($language, $path = false) {
        if ($path === false) $path = self::$pathToPatterns;
        $file = $path . '/' . $language . '.php';
        if (!is_file($file)) return false;
        include($file);
        // Keep the previously loaded patterns if the file did not deliver any.
        if (!isset($patterns) || !is_string($patterns)) return false;
        $this->language = $language;
        $this->patterns = $this->convertPatterns($patterns);
        return true;
    }

    /**
     * Loads the user-defined hyphenations from a file. (Format: one word per line, hyphenation locations marked by a slash ("/").)
     * @param string $filename Filename of the file containing the user defined words.
     * @param bool $append Set to TRUE to append the new words to the list. (default: false)
     * @return bool TRUE on success, FALSE on error.
     */
    public function loadUserDictFromFile($filename, $append=false) {
        if (empty($filename) || !file_exists($filename)) return false;
        // file() yields FALSE on failure, which loadUserDictFromArray() rejects.
        $dictionary = file($filename, FILE_IGNORE_NEW_LINES);
        return $this->loadUserDictFromArray($dictionary, $append);
    }

    /**
     * Adds user-defined hyphenations from an array. (Format: one entry per word, hyphenation locations marked by a slash ("/").)
     * @param array $userdict Array containing user defined words.
     * @param bool $append Set to TRUE to append the new words to the list. (default: false)
     * @return bool TRUE on success, FALSE on error.
     */
    public function loadUserDictFromArray($userdict, $append=false) {
        if (!is_array($userdict)) return false;
        if (!$append) $this->dictWords = array();
        foreach ($userdict as $entry) {
            $entry = mb_strtolower(trim((string) $entry));
            $plain = str_replace('/', '', $entry);
            // Blank lines (or lines consisting of slashes only) carry no word.
            if ($plain === '') continue;
            $this->dictWords[$plain] = str_replace('/', $this->hyphen, $entry);
        }
        return true;
    }

    /**
     * Loads the patterns from a pattern file into an associative array.
     * @param string $patterns Patterns separated by whitespace
     * @return array Associative array: pattern with its digits removed => original pattern
     */
    protected function convertPatterns($patterns) {
        $new_patterns = array();
        // Byte-wise split on ASCII whitespace only; this is safe for UTF-8 and
        // does not produce empty entries for repeated blanks or line breaks.
        $list = preg_split('/[ \t\r\n]+/', (string) $patterns, -1, PREG_SPLIT_NO_EMPTY);
        if ($list === false) return $new_patterns;
        foreach ($list as $pattern) {
            $new_patterns[preg_replace('/[0-9]/', '', $pattern)] = $pattern;
        }
        return $new_patterns;
    }

    /**
     * Hyphenates a complete text and ignores HTML tags defined in $this->ignoreTags.
     *
     * Markup (everything from "<" to the next ">") and HTML entities such as
     * "&nbsp;" or "&#8230;" are copied through unchanged. For tags on the
     * ignore-list everything up to the matching closing tag is copied through
     * unchanged as well.
     *
     * @param string $text Text to hyphenate
     * @return string Text with $this->hyphen added to the hyphenation locations
     */
    public function doHyphenation($text) {
        $word_boundaries = "<>\t\n\r\0\x0B !\"§$%&/()=?….,;:-–_„”«»‘’'/\\‹›()[]{}*+´`^|©℗®™℠¹²³";
        $boundaries = array_flip($this->splitChars($word_boundaries));
        $ignored    = array_map('mb_strtolower', $this->ignoreTags);

        // Split once instead of calling mb_substr() per character (which is O(n^2)).
        $chars    = $this->splitChars((string) $text);
        $count    = count($chars);
        $result   = array();
        $word     = '';    // letters of the word currently being collected
        $tag      = '';    // markup currently being collected ('' = not inside markup)
        $skipping = false; // TRUE while inside the content of an ignored tag

        for ($i = 0; $i < $count; $i++) {
            $char = $chars[$i];

            if ($tag !== '') {
                // Inside markup: collect verbatim until a '>' shows up.
                $tag .= $char;
                if ($char !== '>') continue;

                // The name runs from after '<' to the first whitespace or '>'.
                // While skipping, $tag still starts with the opening tag, so this
                // keeps yielding the name of the ignored element.
                $tagName   = substr($tag, 1, strcspn($tag, " \t\n\r\f>", 1));
                $lowerName = mb_strtolower($tagName);
                if (!$skipping && in_array($lowerName, $ignored, true)) {
                    // Ignored element: keep collecting until its closing tag.
                    $skipping = true;
                } elseif (!$skipping || mb_strtolower(mb_substr($tag, -mb_strlen($tagName) - 3)) == '</' . $lowerName . '>') {
                    $result[] = $tag;
                    $tag      = '';
                    $skipping = false;
                }
                continue;
            }

            if (!isset($boundaries[$char])) {
                $word .= $char;
                continue;
            }

            // A boundary character ends the current word.
            if ($word !== '') {
                $result[] = $this->wordHyphenation($word);
                $word = '';
            }

            if ($char === '<') {
                $tag = '<';
                continue;
            }

            if ($char === '&') {
                // Copy a complete entity verbatim so that its name is not
                // mistaken for a word and broken by an inserted hyphen.
                $entityLength = $this->entityLengthAt($chars, $i);
                if ($entityLength > 0) {
                    $result[] = implode('', array_slice($chars, $i, $entityLength));
                    $i += $entityLength - 1;
                    continue;
                }
            }

            // Plain boundary character - this includes a '>' outside of any tag.
            $result[] = $char;
        }

        if ($word !== '') $result[] = $this->wordHyphenation($word);
        // Unterminated markup at the end of the text is emitted as it is.
        if ($tag !== '') $result[] = $tag;

        return implode('', $result);
    }

    /**
     * Hyphenates a single word, i.e. inserts $this->hyphen at locations for hyphenation.
     *
     * Words from the user dictionary take precedence over the patterns. Otherwise
     * every pattern contained in "_word_" contributes its digits to the gaps
     * between the letters; the highest digit per gap wins and an odd result
     * marks a hyphenation point.
     *
     * @param string $word Single word to hyphenate
     * @return string Hyphenated version of the word
     */
    public function wordHyphenation($word) {
        $wordLength = mb_strlen($word);
        if ($wordLength < $this->charMin) return $word;
        // Already hyphenated words are left alone.
        if (mb_strpos($word, $this->hyphen) !== false) return $word;

        $lowerWord = mb_strtolower($word);
        if (isset($this->dictWords[$lowerWord])) {
            return $this->applyDictHyphenation($word, $this->dictWords[$lowerWord]);
        }

        // '_' marks the word boundaries, exactly as in the patterns.
        $single_character = $this->splitChars('_' . $word . '_');
        $lowerChars       = $this->splitChars('_' . $lowerWord . '_');
        $length           = count($lowerChars);
        // Lowercasing may change the number of characters for a few letters;
        // positions would no longer line up, so do not hyphenate such words.
        if ($length !== count($single_character)) return $word;

        $numb3rs = array('0' => true, '1' => true, '2' => true, '3' => true, '4' => true, '5' => true, '6' => true, '7' => true, '8' => true, '9' => true);
        // $levels[$k] = highest pattern value for the gap after $k letters of $word.
        $levels = array();

        for ($position = 0; $position <= ($length - $this->charMin); $position++) {
            $maxwins   = min(($length - $position), $this->charMax);
            $candidate = '';

            for ($win = 1; $win <= $maxwins; $win++) {
                // Grow the window by one character instead of re-slicing the word.
                $candidate .= $lowerChars[$position + $win - 1];
                if ($win < $this->charMin || !isset($this->patterns[$candidate])) continue;

                // Walk the pattern character-wise (not byte-wise) so that
                // multibyte letters do not shift the positions of the digits.
                $digitsSeen = 0;
                foreach ($this->splitChars($this->patterns[$candidate]) as $index => $char) {
                    if (!isset($numb3rs[$char])) continue;
                    $gap   = $position + $index - $digitsSeen - 1;
                    $value = (int) $char;
                    if (!isset($levels[$gap]) || $levels[$gap] < $value) $levels[$gap] = $value;
                    $digitsSeen++;
                }
            }
        }

        $inserted = 0;
        for ($gap = $this->leftMin; $gap <= ($wordLength - $this->rightMin); $gap++) {
            if (isset($levels[$gap]) && $levels[$gap] % 2 != 0) {
                // +1 for the leading '_', +$inserted for hyphens added so far.
                array_splice($single_character, $gap + $inserted + 1, 0, $this->hyphen);
                $inserted++;
            }
        }

        // Strip the two '_' markers again.
        return implode('', array_slice($single_character, 1, -1));
    }

    /**
     * Transfers the hyphenation points of a (lowercased) user dictionary entry
     * onto $word, so that the casing of $word is preserved.
     * @param string $word Word as it appears in the text
     * @param string $entry Dictionary entry containing $this->hyphen at the break points
     * @return string $word with $this->hyphen inserted; $entry if the two do not line up
     */
    private function applyDictHyphenation($word, $entry) {
        if ((string) $this->hyphen === '') return $entry;
        $pieces = array();
        $offset = 0;
        foreach (explode($this->hyphen, $entry) as $part) {
            $partLength = mb_strlen($part);
            $pieces[]   = mb_substr($word, $offset, $partLength);
            $offset    += $partLength;
        }
        if ($offset !== mb_strlen($word)) return $entry;
        return implode($this->hyphen, $pieces);
    }

    /**
     * Returns the length of the HTML entity starting at $chars[$offset].
     * @param array $chars Text split into single characters
     * @param int $offset Index of the "&" character
     * @return int Number of characters of the entity including "&" and ";", 0 if there is none
     */
    private function entityLengthAt($chars, $offset) {
        // 40 characters are more than enough for the longest named entity.
        $candidate = implode('', array_slice($chars, $offset, 40));
        if (preg_match('/^&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);/', $candidate, $match)) {
            return strlen($match[0]); // ASCII only, so bytes == characters
        }
        return 0;
    }

    /**
     * Splits a UTF-8 string into an array of single characters.
     * @param string $string String to split
     * @return array List of characters
     */
    private function splitChars($string) {
        $chars = preg_split('//u', $string, -1, PREG_SPLIT_NO_EMPTY);
        if ($chars !== false) return $chars;
        // preg_split() fails on malformed UTF-8; fall back to mbstring.
        $chars  = array();
        $length = mb_strlen($string);
        for ($i = 0; $i < $length; $i++) {
            $chars[] = mb_substr($string, $i, 1);
        }
        return $chars;
    }

}
256?>