1<?php
2
3/**
4 * Injector that auto paragraphs text in the root node based on
5 * double-spacing.
6 * @todo Ensure all states are unit tested, including variations as well.
7 * @todo Make a graph of the flow control for this Injector.
8 */
class HTMLPurifier_Injector_AutoParagraph extends HTMLPurifier_Injector
{
    /**
     * Name of this injector.
     * @type string
     */
    public $name = 'AutoParagraph';

    /**
     * Elements this injector relies on (it creates <p> tokens).
     * @type array
     */
    public $needed = array('p');

    /**
     * Builds a fresh <p> start token with the
     * 'MakeWellFormed_TagClosedError' armor flag set.
     * @return HTMLPurifier_Token_Start
     */
    private function _pStart()
    {
        $par = new HTMLPurifier_Token_Start('p');
        $par->armor['MakeWellFormed_TagClosedError'] = true;
        return $par;
    }

    /**
     * Handles a text token, wrapping it in and/or splitting it into
     * paragraphs where the current context calls for it.
     *
     * The token is taken by reference: when paragraphs are needed it is
     * replaced by an array of tokens, otherwise it is left untouched.
     *
     * @param HTMLPurifier_Token_Text $token
     */
    public function handleText(&$token)
    {
        $text = $token->data;
        // Does the current parent allow <p> tags?
        if ($this->allowsElement('p')) {
            if (empty($this->currentNesting) || strpos($text, "\n\n") !== false) {
                // Note that we have differing behavior when dealing with text
                // in the anonymous root node, or a node inside the document.
                // If the text has a double-newline, the treatment is the same;
                // if it doesn't, see the else-block below if you're in the
                // document.

                $i = $nesting = null;
                // Filled in by reference by forwardUntilEndToken().
                $current = null;
                if (!$this->forwardUntilEndToken($i, $current, $nesting) && $token->is_whitespace) {
                    // State 1.1: ...    ^ (whitespace, then document end)
                    //               ----
                    // This is a degenerate case
                } else {
                    // $current is only consulted when the text is whitespace;
                    // in that case the lookahead above must have succeeded
                    // (otherwise we would be in State 1.1), so it is set.
                    if (!$token->is_whitespace || $this->_isInline($current)) {
                        // State 1.2: PAR1
                        //            ----

                        // State 1.3: PAR1\n\nPAR2
                        //            ------------

                        // State 1.4: <div>PAR1\n\nPAR2 (see State 2)
                        //                 ------------
                        $token = array($this->_pStart());
                        $this->_splitText($text, $token);
                    } else {
                        // State 1.5: \n<hr />
                        //            --
                    }
                }
            } else {
                // State 2:   <div>PAR1... (similar to 1.4)
                //                 ----

                // We're in an element that allows paragraph tags, but we're not
                // sure if we're going to need them.
                if ($this->_pLookAhead()) {
                    // State 2.1: <div>PAR1<b>PAR1\n\nPAR2
                    //                 ----
                    // Note: This will always be the first child, since any
                    // previous inline element would have triggered this very
                    // same routine, and found the double newline. One possible
                    // exception would be a comment.
                    $token = array($this->_pStart(), $token);
                } else {
                    // State 2.2.1: <div>PAR1<div>
                    //                   ----

                    // State 2.2.2: <div>PAR1<b>PAR1</b></div>
                    //                   ----
                }
            }
            // Is the current parent a <p> tag?
        } elseif (!empty($this->currentNesting) &&
            $this->currentNesting[count($this->currentNesting) - 1]->name === 'p') {
            // State 3.1: ...<p>PAR1
            //                  ----

            // State 3.2: ...<p>PAR1\n\nPAR2
            //                  ------------

            // An empty result tells _splitText() that no <p> start token was
            // added by this call, i.e. we are already inside a paragraph.
            $token = array();
            $this->_splitText($text, $token);
            // Abort!
        } else {
            // State 4.1: ...<b>PAR1
            //                  ----

            // State 4.2: ...<b>PAR1\n\nPAR2
            //                  ------------
        }
    }

    /**
     * Handles an element token, prepending a <p> start token (and, in the
     * root node, a "\n\n" separator) where the current context calls for it.
     *
     * The token is taken by reference: when something has to be inserted
     * before it, it is replaced by an array of tokens ending in the original.
     *
     * @param HTMLPurifier_Token $token
     */
    public function handleElement(&$token)
    {
        // We don't have to check if we're already in a <p> tag for block
        // tokens, because the tag would have been autoclosed by MakeWellFormed.
        if ($this->allowsElement('p')) {
            if (!empty($this->currentNesting)) {
                if ($this->_isInline($token)) {
                    // State 1: <div>...<b>
                    //                  ---
                    // Check if this token is adjacent to the parent token.
                    // Only a single backward() step is taken, so $prev is
                    // whatever that one step yields (it stays null if the
                    // step fails, which falls through to State 1.1.1-1.1.3).
                    $i = null;
                    $prev = null;
                    $this->backward($i, $prev);

                    if (!$prev instanceof HTMLPurifier_Token_Start) {
                        // Token wasn't adjacent
                        if ($prev instanceof HTMLPurifier_Token_Text &&
                            substr($prev->data, -2) === "\n\n"
                        ) {
                            // State 1.1.4: <div><p>PAR1</p>\n\n<b>
                            //                                  ---
                            // Quite frankly, this should be handled by splitText
                            $token = array($this->_pStart(), $token);
                        } else {
                            // State 1.1.1: <div><p>PAR1</p><b>
                            //                              ---
                            // State 1.1.2: <div><br /><b>
                            //                         ---
                            // State 1.1.3: <div>PAR<b>
                            //                      ---
                        }
                    } else {
                        // State 1.2.1: <div><b>
                        //                   ---
                        // Lookahead to see if <p> is needed.
                        if ($this->_pLookAhead()) {
                            // State 1.3.1: <div><b>PAR1\n\nPAR2
                            //                   ---
                            $token = array($this->_pStart(), $token);
                        } else {
                            // State 1.3.2: <div><b>PAR1</b></div>
                            //                   ---

                            // State 1.3.3: <div><b>PAR1</b><div></div>\n\n</div>
                            //                   ---
                        }
                    }
                } else {
                    // State 2.3: ...<div>
                    //               -----
                }
            } else {
                if ($this->_isInline($token)) {
                    // State 3.1: <b>
                    //            ---
                    // This is where the {p} tag is inserted, not reflected in
                    // inputTokens yet, however.
                    $token = array($this->_pStart(), $token);
                } else {
                    // State 3.2: <div>
                    //            -----
                }

                $i = null;
                $prev = null;
                if ($this->backward($i, $prev)) {
                    if (!$prev instanceof HTMLPurifier_Token_Text) {
                        // State 3.1.1: ...</p>{p}<b>
                        //                        ---
                        // State 3.2.1: ...</p><div>
                        //                     -----
                        // $token may or may not have been turned into an array
                        // above; normalize before prepending the separator.
                        if (!is_array($token)) {
                            $token = array($token);
                        }
                        array_unshift($token, new HTMLPurifier_Token_Text("\n\n"));
                    } else {
                        // State 3.1.2: ...</p>\n\n{p}<b>
                        //                            ---
                        // State 3.2.2: ...</p>\n\n<div>
                        //                         -----
                        // Note: PAR<ELEM> cannot occur because PAR would have been
                        // wrapped in <p> tags.
                    }
                }
            }
        } else {
            // State 2.2: <ul><li>
            //                ----
            // State 2.4: <p><b>
            //               ---
        }
    }

    /**
     * Splits up a text in paragraph tokens and appends them
     * to the result stream that will replace the original.
     *
     * An empty $result on entry is taken to mean that the caller did not
     * add a <p> start token, i.e. that we are already inside a paragraph.
     *
     * @param string $data String text data that will be processed
     *    into paragraphs
     * @param HTMLPurifier_Token[] $result Reference to array of tokens that the
     *    tags will be appended onto
     */
    private function _splitText($data, &$result)
    {
        $raw_paragraphs = explode("\n\n", $data);
        $paragraphs = array(); // without empty paragraphs
        $needs_start = false;
        $needs_end = false;

        $c = count($raw_paragraphs);
        if ($c === 1) {
            // There were no double-newlines, abort quickly. In theory this
            // should never happen.
            $result[] = new HTMLPurifier_Token_Text($data);
            return;
        }
        foreach ($raw_paragraphs as $i => $par) {
            if (trim($par) !== '') {
                $paragraphs[] = $par;
                continue;
            }
            // Blank chunk: only its position (front / end) matters; blank
            // chunks in the middle are simply dropped.
            if ($i === 0) {
                // Double newline at the front
                if (empty($result)) {
                    // The empty result indicates that the AutoParagraph
                    // injector did not add any start paragraph tokens.
                    // This means that we have been in a paragraph for
                    // a while, and the newline means we should start a new one.
                    $result[] = new HTMLPurifier_Token_End('p');
                    $result[] = new HTMLPurifier_Token_Text("\n\n");
                    // However, the start token should only be added if
                    // there is more processing to be done (i.e. there are
                    // real paragraphs in here). If there are none, the
                    // next start paragraph tag will be handled by the
                    // next call to the injector
                    $needs_start = true;
                } else {
                    // We just started a new paragraph!
                    // Reinstate a double-newline for presentation's sake, since
                    // it was in the source code.
                    array_unshift($result, new HTMLPurifier_Token_Text("\n\n"));
                }
            } elseif ($i + 1 === $c) {
                // Double newline at the end
                // There should be a trailing </p> when we're finally done.
                $needs_end = true;
            }
        }

        // Check if this was just a giant blob of whitespace. Move this earlier,
        // perhaps?
        if (empty($paragraphs)) {
            return;
        }

        // Add the start tag indicated by \n\n at the beginning of $data
        if ($needs_start) {
            $result[] = $this->_pStart();
        }

        // Append the paragraphs onto the result
        foreach ($paragraphs as $par) {
            $result[] = new HTMLPurifier_Token_Text($par);
            $result[] = new HTMLPurifier_Token_End('p');
            $result[] = new HTMLPurifier_Token_Text("\n\n");
            $result[] = $this->_pStart();
        }

        // Remove trailing start token; Injector will handle this later if
        // it was indeed needed. This prevents from needing to do a lookahead,
        // at the cost of a lookbehind later.
        array_pop($result);

        // If there is no need for an end tag, remove all of it and let
        // MakeWellFormed close it later.
        if (!$needs_end) {
            array_pop($result); // removes \n\n
            array_pop($result); // removes </p>
        }
    }

    /**
     * Returns true if passed token is inline (and, ergo, allowed in
     * paragraph tags). Tokens that carry no name are never inline.
     * @param HTMLPurifier_Token $token
     * @return bool
     */
    private function _isInline($token)
    {
        // handleText() passes in whatever token the lookahead produced, with
        // no type check, so the token is not guaranteed to have a name.
        // Bail out instead of reading a missing property and using null as
        // an array offset.
        if (!isset($token->name)) {
            return false;
        }
        return isset($this->htmlDefinition->info['p']->child->elements[$token->name]);
    }

    /**
     * Looks ahead in the token list and determines whether or not we need
     * to insert a <p> tag.
     * @return bool True as soon as a lookahead token demands a paragraph;
     *    false if one rules it out or the lookahead runs out first.
     */
    private function _pLookAhead()
    {
        // When the current token is itself a start tag, begin one level deep.
        $nesting = ($this->currentToken instanceof HTMLPurifier_Token_Start) ? 1 : 0;
        $i = null;
        // Filled in by reference by forwardUntilEndToken().
        $current = null;
        while ($this->forwardUntilEndToken($i, $current, $nesting)) {
            $result = $this->_checkNeedsP($current);
            if ($result !== null) {
                // First decisive token wins.
                return $result;
            }
        }
        return false;
    }

    /**
     * Determines if a particular token requires an earlier inline token
     * to get a paragraph. This should be used with forwardUntilEndToken
     * @param HTMLPurifier_Token $current
     * @return bool|null True if a paragraph is needed, false if it is
     *    definitely not, null if this token does not decide either way.
     */
    private function _checkNeedsP($current)
    {
        if ($current instanceof HTMLPurifier_Token_Start) {
            if (!$this->_isInline($current)) {
                // <div>PAR1<div>
                //      ----
                // Terminate early, since we hit a block element
                return false;
            }
        } elseif ($current instanceof HTMLPurifier_Token_Text) {
            if (strpos($current->data, "\n\n") !== false) {
                // <div>PAR1<b>PAR1\n\nPAR2
                //      ----
                return true;
            }
            // Otherwise: <div>PAR1<b>PAR1...
            //                 ----
            // Undecided, keep looking.
        }
        return null;
    }
}
355
356// vim: et sw=4 sts=4
357