1<?php
2
3/*
4 * This file is part of Mustache.php.
5 *
6 * (c) 2010-2017 Justin Hileman
7 *
8 * For the full copyright and license information, please view the LICENSE
9 * file that was distributed with this source code.
10 */
11
12/**
13 * Mustache Tokenizer class.
14 *
15 * This class is responsible for turning raw template source into a set of Mustache tokens.
16 */
class Mustache_Tokenizer
{
    // Finite state machine states
    const IN_TEXT     = 0; // Scanning literal text outside of any tag.
    const IN_TAG_TYPE = 1; // An opening delimiter was just matched; the tag sigil (if any) comes next.
    const IN_TAG      = 2; // Buffering tag contents until the closing delimiter is found.

    // Token types
    const T_SECTION      = '#';
    const T_INVERTED     = '^';
    const T_END_SECTION  = '/';
    const T_COMMENT      = '!';
    const T_PARTIAL      = '>';
    const T_PARENT       = '<';
    const T_DELIM_CHANGE = '=';
    const T_ESCAPED      = '_v';
    const T_UNESCAPED    = '{';
    const T_UNESCAPED_2  = '&';
    const T_TEXT         = '_t';
    const T_PRAGMA       = '%';
    const T_BLOCK_VAR    = '$';
    const T_BLOCK_ARG    = '$arg';

    // Valid token types.
    //
    // The tokenizer looks up the single character following an opening
    // delimiter in this map. A character that is not a key here means the tag
    // has no sigil, and it is treated as T_ESCAPED.
    private static $tagTypes = array(
        self::T_SECTION      => true,
        self::T_INVERTED     => true,
        self::T_END_SECTION  => true,
        self::T_COMMENT      => true,
        self::T_PARTIAL      => true,
        self::T_PARENT       => true,
        self::T_DELIM_CHANGE => true,
        self::T_ESCAPED      => true,
        self::T_UNESCAPED    => true,
        self::T_UNESCAPED_2  => true,
        self::T_PRAGMA       => true,
        self::T_BLOCK_VAR    => true,
    );

    // Token properties
    const TYPE    = 'type';
    const NAME    = 'name';
    const OTAG    = 'otag';
    const CTAG    = 'ctag';
    const LINE    = 'line';
    const INDEX   = 'index';
    const END     = 'end';
    const INDENT  = 'indent';
    const NODES   = 'nodes';
    const VALUE   = 'value';
    const FILTERS = 'filters';

    /** @var int Current state machine state (one of the IN_* constants) */
    private $state;

    /** @var string|null Type of the tag currently being scanned (one of the T_* constants) */
    private $tagType;

    /** @var string Text or tag contents accumulated since the last token was emitted */
    private $buffer;

    /** @var array Tokens emitted so far */
    private $tokens;

    /** @var int|false Scan index recorded once the most recent tag opening was consumed; false until a tag is seen */
    private $seenTag;

    /** @var int Zero-based line counter, incremented for every newline seen while scanning text */
    private $line;

    /** @var string Current opening delimiter */
    private $otag;

    /** @var string First character of the opening delimiter (cheap pre-check before comparing the full delimiter) */
    private $otagChar;

    /** @var int Length of the opening delimiter, in bytes */
    private $otagLen;

    /** @var string Current closing delimiter */
    private $ctag;

    /** @var string First character of the closing delimiter (cheap pre-check before comparing the full delimiter) */
    private $ctagChar;

    /** @var int Length of the closing delimiter, in bytes */
    private $ctagLen;

    /**
     * Scan and tokenize template source.
     *
     * If `mbstring.func_overload` overloads the string functions, the internal
     * encoding is temporarily switched to ASCII for the duration of the scan
     * and restored afterwards, including when tokenizing throws an exception.
     *
     * @throws Mustache_Exception_SyntaxException when tag delimiters are mismatched, a tag is never closed,
     *                                            or a delimiter change tag is invalid
     * @throws Mustache_Exception_InvalidArgumentException when $delimiters string is invalid
     *
     * @param string      $text       Mustache template source to tokenize
     * @param string|null $delimiters Optionally, pass initial opening and closing delimiters (default: null)
     *
     * @return array Set of Mustache tokens
     */
    public function scan($text, $delimiters = null)
    {
        // Setting mbstring.func_overload makes things *really* slow.
        // Let's do everyone a favor and scan this string as ASCII instead.
        //
        // @codeCoverageIgnoreStart
        $encoding = null;
        if (function_exists('mb_internal_encoding') && ini_get('mbstring.func_overload') & 2) {
            $encoding = mb_internal_encoding();
            mb_internal_encoding('ASCII');
        }
        // @codeCoverageIgnoreEnd

        try {
            $tokens = $this->tokenize($text, $delimiters);
        } catch (Exception $e) {
            // Don't leak the temporary ASCII encoding to the caller when
            // tokenizing fails. (Catch-and-rethrow rather than `finally`, to
            // stay compatible with the legacy PHP versions this file targets.)
            //
            // @codeCoverageIgnoreStart
            if ($encoding) {
                mb_internal_encoding($encoding);
            }
            // @codeCoverageIgnoreEnd

            throw $e;
        }

        // Restore the user's encoding...
        // @codeCoverageIgnoreStart
        if ($encoding) {
            mb_internal_encoding($encoding);
        }
        // @codeCoverageIgnoreEnd

        return $tokens;
    }

    /**
     * Run the tokenizer state machine over the template source.
     *
     * @throws Mustache_Exception_SyntaxException when tag delimiters are mismatched, a tag is never closed,
     *                                            or a delimiter change tag is invalid
     * @throws Mustache_Exception_InvalidArgumentException when $delimiters string is invalid
     *
     * @param string      $text       Mustache template source to tokenize
     * @param string|null $delimiters Initial opening and closing delimiters, or null for the defaults
     *
     * @return array Set of Mustache tokens
     */
    private function tokenize($text, $delimiters)
    {
        $this->reset();

        // Passing null to trim() is deprecated as of PHP 8.1, so skip it
        // entirely when no delimiters were given.
        if ($delimiters !== null && $delimiters = trim($delimiters)) {
            $this->setDelimiters($delimiters);
        }

        $len      = strlen($text);
        $tagStart = 0;
        for ($i = 0; $i < $len; $i++) {
            switch ($this->state) {
                case self::IN_TEXT:
                    $char = $text[$i];
                    // Test whether it's time to change tags.
                    if ($char === $this->otagChar && substr($text, $i, $this->otagLen) === $this->otag) {
                        // Step back so the next iteration revisits this same
                        // index in the IN_TAG_TYPE state.
                        $i--;
                        $this->flushBuffer();
                        $this->state = self::IN_TAG_TYPE;
                    } else {
                        $this->buffer .= $char;
                        if ($char === "\n") {
                            // Text tokens never span lines: emit one per line.
                            $this->flushBuffer();
                            $this->line++;
                        }
                    }
                    break;

                case self::IN_TAG_TYPE:
                    // $i is on the first character of the opening delimiter.
                    $tagStart = $i;

                    // Move to the last character of the opening delimiter.
                    $i += $this->otagLen - 1;

                    // The opening delimiter may be the very last thing in the
                    // template, in which case there is no sigil to read.
                    $char = ($i + 1 < $len) ? $text[$i + 1] : '';
                    if (isset(self::$tagTypes[$char])) {
                        $tag = $char;
                        $this->tagType = $tag;
                    } else {
                        $tag = null;
                        $this->tagType = self::T_ESCAPED;
                    }

                    if ($this->tagType === self::T_DELIM_CHANGE) {
                        $i = $this->changeDelimiters($text, $i);
                        $this->state = self::IN_TEXT;
                    } elseif ($this->tagType === self::T_PRAGMA) {
                        $i = $this->addPragma($text, $i);
                        $this->state = self::IN_TEXT;
                    } else {
                        // Skip over the sigil, if there was one.
                        if ($tag !== null) {
                            $i++;
                        }
                        $this->state = self::IN_TAG;
                    }
                    $this->seenTag = $i;
                    break;

                default:
                    $char = $text[$i];
                    // Test whether it's time to change tags.
                    if ($char === $this->ctagChar && substr($text, $i, $this->ctagLen) === $this->ctag) {
                        $token = array(
                            self::TYPE  => $this->tagType,
                            self::NAME  => trim($this->buffer),
                            self::OTAG  => $this->otag,
                            self::CTAG  => $this->ctag,
                            self::LINE  => $this->line,
                            self::INDEX => ($this->tagType === self::T_END_SECTION) ? $this->seenTag - $this->otagLen : $i + $this->ctagLen,
                        );

                        if ($this->tagType === self::T_UNESCAPED) {
                            // Clean up `{{{ tripleStache }}}` style tokens.
                            if ($this->ctag === '}}') {
                                // With default delimiters the extra `}` follows
                                // the closing delimiter; consume it as well.
                                if (($i + 2 < $len) && $text[$i + 2] === '}') {
                                    $i++;
                                } else {
                                    $msg = sprintf(
                                        'Mismatched tag delimiters: %s on line %d',
                                        $token[self::NAME],
                                        $token[self::LINE]
                                    );

                                    throw new Mustache_Exception_SyntaxException($msg, $token);
                                }
                            } else {
                                // With custom delimiters the extra `}` ends up
                                // at the end of the buffered name; strip it.
                                $lastName = $token[self::NAME];
                                if (substr($lastName, -1) === '}') {
                                    $token[self::NAME] = trim(substr($lastName, 0, -1));
                                } else {
                                    $msg = sprintf(
                                        'Mismatched tag delimiters: %s on line %d',
                                        $token[self::NAME],
                                        $token[self::LINE]
                                    );

                                    throw new Mustache_Exception_SyntaxException($msg, $token);
                                }
                            }
                        }

                        $this->buffer = '';
                        // Move to the last character of the closing delimiter.
                        $i += $this->ctagLen - 1;
                        $this->state = self::IN_TEXT;
                        $this->tokens[] = $token;
                    } else {
                        $this->buffer .= $char;
                    }
                    break;
            }
        }

        // Running out of input while still inside a tag means the tag was
        // never closed. Flushing the buffer here would silently turn the
        // partial tag contents into a text token.
        if ($this->state !== self::IN_TEXT) {
            $this->throwUnclosedTagException($tagStart);
        }

        $this->flushBuffer();

        return $this->tokens;
    }

    /**
     * Helper function to reset tokenizer internal state.
     *
     * Restores the default `{{` and `}}` delimiters and clears any tokens,
     * buffered text and position tracking left over from a previous scan.
     */
    private function reset()
    {
        $this->state    = self::IN_TEXT;
        $this->tagType  = null;
        $this->buffer   = '';
        $this->tokens   = array();
        $this->seenTag  = false;
        $this->line     = 0;

        $this->otag     = '{{';
        $this->otagChar = '{';
        $this->otagLen  = 2;

        $this->ctag     = '}}';
        $this->ctagChar = '}';
        $this->ctagLen  = 2;
    }

    /**
     * Flush the current buffer to a token.
     *
     * Emits a text token holding the buffered text and empties the buffer.
     * Does nothing when the buffer is empty.
     */
    private function flushBuffer()
    {
        if (strlen($this->buffer) > 0) {
            $this->tokens[] = array(
                self::TYPE  => self::T_TEXT,
                self::LINE  => $this->line,
                self::VALUE => $this->buffer,
            );
            $this->buffer   = '';
        }
    }

    /**
     * Change the current Mustache delimiters. Set new `otag` and `ctag` values.
     *
     * @throws Mustache_Exception_SyntaxException when delimiter string is invalid or the tag is never closed
     *
     * @param string $text  Mustache template source
     * @param int    $index Current tokenizer index (the last character of the opening delimiter)
     *
     * @return int New index value (the last character of the closing delimiter)
     */
    private function changeDelimiters($text, $index)
    {
        $startIndex = strpos($text, '=', $index) + 1;
        $close      = '=' . $this->ctag;

        // Search from after the `=` sigil, so the sigil itself can never be
        // mistaken for the start of the closing `=}}` (as in `{{=}}`).
        $closeIndex = strpos($text, $close, $startIndex);

        if ($closeIndex === false) {
            $this->throwUnclosedTagException($index - $this->otagLen + 1);
        }

        $token = array(
            self::TYPE => self::T_DELIM_CHANGE,
            self::LINE => $this->line,
        );

        try {
            $this->setDelimiters(trim(substr($text, $startIndex, $closeIndex - $startIndex)));
        } catch (Mustache_Exception_InvalidArgumentException $e) {
            throw new Mustache_Exception_SyntaxException($e->getMessage(), $token);
        }

        $this->tokens[] = $token;

        return $closeIndex + strlen($close) - 1;
    }

    /**
     * Set the current Mustache `otag` and `ctag` delimiters.
     *
     * The delimiter string must consist of exactly two runs of non-whitespace
     * characters separated by whitespace.
     *
     * @throws Mustache_Exception_InvalidArgumentException when delimiter string is invalid
     *
     * @param string $delimiters
     */
    private function setDelimiters($delimiters)
    {
        if (!preg_match('/^\s*(\S+)\s+(\S+)\s*$/', $delimiters, $matches)) {
            throw new Mustache_Exception_InvalidArgumentException(sprintf('Invalid delimiters: %s', $delimiters));
        }

        $otag = $matches[1];
        $ctag = $matches[2];

        $this->otag     = $otag;
        $this->otagChar = $otag[0];
        $this->otagLen  = strlen($otag);

        $this->ctag     = $ctag;
        $this->ctagChar = $ctag[0];
        $this->ctagLen  = strlen($ctag);
    }

    /**
     * Add pragma token.
     *
     * Pragmas are hoisted to the front of the template, so all pragma tokens
     * will appear at the front of the token list.
     *
     * @throws Mustache_Exception_SyntaxException when the pragma tag is never closed
     *
     * @param string $text  Mustache template source
     * @param int    $index Current tokenizer index (the last character of the opening delimiter)
     *
     * @return int New index value (the last character of the closing delimiter)
     */
    private function addPragma($text, $index)
    {
        // Skip the last character of the opening delimiter and the `%` sigil.
        $start = $index + 2;

        // Search from after the sigil, so a closing delimiter that shares
        // characters with the opening delimiter can't match too early.
        $end = strpos($text, $this->ctag, $start);

        if ($end === false) {
            $this->throwUnclosedTagException($index - $this->otagLen + 1);
        }

        $pragma = trim(substr($text, $start, $end - $start));

        // Pragmas are hoisted to the front of the template.
        array_unshift($this->tokens, array(
            self::TYPE => self::T_PRAGMA,
            self::NAME => $pragma,
            self::LINE => 0,
        ));

        return $end + $this->ctagLen - 1;
    }

    /**
     * Throw a syntax exception describing a tag which was opened but never closed.
     *
     * The reported name is whatever tag contents were buffered before the
     * template source ran out.
     *
     * @throws Mustache_Exception_SyntaxException always
     *
     * @param int $tagStart Index of the first character of the unclosed tag's opening delimiter
     */
    private function throwUnclosedTagException($tagStart)
    {
        $name = trim($this->buffer);
        if ($name !== '') {
            $msg = sprintf('Unclosed tag: %s on line %d', $name, $this->line);
        } else {
            $msg = sprintf('Unclosed tag on line %d', $this->line);
        }

        throw new Mustache_Exception_SyntaxException($msg, array(
            self::TYPE  => $this->tagType,
            self::NAME  => $name,
            self::OTAG  => $this->otag,
            self::CTAG  => $this->ctag,
            self::LINE  => $this->line,
            self::INDEX => $tagStart,
        ));
    }
}
343