xref: /plugin/zwidoku/Html2Text.php (revision f82bbc904bd835fc66a3f52ffaef251433904ec2)
1*f82bbc90SS.Chekanov<?php
2*f82bbc90SS.Chekanov
3*f82bbc90SS.Chekanov/*
4*f82bbc90SS.Chekanov * Copyright (c) 2005-2007 Jon Abernathy <jon@chuggnutt.com>
5*f82bbc90SS.Chekanov *
6*f82bbc90SS.Chekanov * This script is free software; you can redistribute it and/or modify
7*f82bbc90SS.Chekanov * it under the terms of the GNU General Public License as published by
8*f82bbc90SS.Chekanov * the Free Software Foundation; either version 2 of the License, or
9*f82bbc90SS.Chekanov * (at your option) any later version.
10*f82bbc90SS.Chekanov *
11*f82bbc90SS.Chekanov * The GNU General Public License can be found at
12*f82bbc90SS.Chekanov * http://www.gnu.org/copyleft/gpl.html.
13*f82bbc90SS.Chekanov *
14*f82bbc90SS.Chekanov * This script is distributed in the hope that it will be useful,
15*f82bbc90SS.Chekanov * but WITHOUT ANY WARRANTY; without even the implied warranty of
16*f82bbc90SS.Chekanov * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17*f82bbc90SS.Chekanov * GNU General Public License for more details.
18*f82bbc90SS.Chekanov */
19*f82bbc90SS.Chekanov
20*f82bbc90SS.Chekanovnamespace Html2Text;
21*f82bbc90SS.Chekanov
22*f82bbc90SS.Chekanovclass Html2Text
23*f82bbc90SS.Chekanov{
24*f82bbc90SS.Chekanov    const ENCODING = 'UTF-8';
25*f82bbc90SS.Chekanov
26*f82bbc90SS.Chekanov    protected $htmlFuncFlags;
27*f82bbc90SS.Chekanov
28*f82bbc90SS.Chekanov    /**
29*f82bbc90SS.Chekanov     * Contains the HTML content to convert.
30*f82bbc90SS.Chekanov     *
31*f82bbc90SS.Chekanov     * @var string $html
32*f82bbc90SS.Chekanov     */
33*f82bbc90SS.Chekanov    protected $html;
34*f82bbc90SS.Chekanov
35*f82bbc90SS.Chekanov    /**
36*f82bbc90SS.Chekanov     * Contains the converted, formatted text.
37*f82bbc90SS.Chekanov     *
38*f82bbc90SS.Chekanov     * @var string $text
39*f82bbc90SS.Chekanov     */
40*f82bbc90SS.Chekanov    protected $text;
41*f82bbc90SS.Chekanov
42*f82bbc90SS.Chekanov    /**
43*f82bbc90SS.Chekanov     * List of preg* regular expression patterns to search for,
44*f82bbc90SS.Chekanov     * used in conjunction with $replace.
45*f82bbc90SS.Chekanov     *
46*f82bbc90SS.Chekanov     * @var array $search
47*f82bbc90SS.Chekanov     * @see $replace
48*f82bbc90SS.Chekanov     */
49*f82bbc90SS.Chekanov    protected $search = array(
50*f82bbc90SS.Chekanov        "/\r/",                                           // Non-legal carriage return
51*f82bbc90SS.Chekanov        "/[\n\t]+/",                                      // Newlines and tabs
52*f82bbc90SS.Chekanov        '/<head\b[^>]*>.*?<\/head>/i',                    // <head>
53*f82bbc90SS.Chekanov        '/<script\b[^>]*>.*?<\/script>/i',                // <script>s -- which strip_tags supposedly has problems with
54*f82bbc90SS.Chekanov        '/<style\b[^>]*>.*?<\/style>/i',                  // <style>s -- which strip_tags supposedly has problems with
55*f82bbc90SS.Chekanov        '/<i\b[^>]*>(.*?)<\/i>/i',                        // <i>
56*f82bbc90SS.Chekanov        '/<em\b[^>]*>(.*?)<\/em>/i',                      // <em>
57*f82bbc90SS.Chekanov        '/<ins\b[^>]*>(.*?)<\/ins>/i',                    // <ins>
58*f82bbc90SS.Chekanov        '/(<ul\b[^>]*>|<\/ul>)/i',                        // <ul> and </ul>
59*f82bbc90SS.Chekanov        '/(<ol\b[^>]*>|<\/ol>)/i',                        // <ol> and </ol>
60*f82bbc90SS.Chekanov        '/(<dl\b[^>]*>|<\/dl>)/i',                        // <dl> and </dl>
61*f82bbc90SS.Chekanov        '/<li\b[^>]*>(.*?)<\/li>/i',                      // <li> and </li>
62*f82bbc90SS.Chekanov        '/<dd\b[^>]*>(.*?)<\/dd>/i',                      // <dd> and </dd>
63*f82bbc90SS.Chekanov        '/<dt\b[^>]*>(.*?)<\/dt>/i',                      // <dt> and </dt>
64*f82bbc90SS.Chekanov        '/<li\b[^>]*>/i',                                 // <li>
65*f82bbc90SS.Chekanov        '/<hr\b[^>]*>/i',                                 // <hr>
66*f82bbc90SS.Chekanov        '/<div\b[^>]*>/i',                                // <div>
67*f82bbc90SS.Chekanov        '/(<table\b[^>]*>|<\/table>)/i',                  // <table> and </table>
68*f82bbc90SS.Chekanov        '/(<tr\b[^>]*>|<\/tr>)/i',                        // <tr> and </tr>
69*f82bbc90SS.Chekanov        '/<td\b[^>]*>(.*?)<\/td>/i',                      // <td> and </td>
70*f82bbc90SS.Chekanov        '/<span class="_html2text_ignore">.+?<\/span>/i', // <span class="_html2text_ignore">...</span>
71*f82bbc90SS.Chekanov        '/<(img)\b[^>]*alt=\"([^>"]+)\"[^>]*>/i',         // <img> with alt tag
72*f82bbc90SS.Chekanov    );
73*f82bbc90SS.Chekanov
74*f82bbc90SS.Chekanov    /**
75*f82bbc90SS.Chekanov     * List of pattern replacements corresponding to patterns searched.
76*f82bbc90SS.Chekanov     *
77*f82bbc90SS.Chekanov     * @var array $replace
78*f82bbc90SS.Chekanov     * @see $search
79*f82bbc90SS.Chekanov     */
80*f82bbc90SS.Chekanov    protected $replace = array(
81*f82bbc90SS.Chekanov        '',                              // Non-legal carriage return
82*f82bbc90SS.Chekanov        ' ',                             // Newlines and tabs
83*f82bbc90SS.Chekanov        '',                              // <head>
84*f82bbc90SS.Chekanov        '',                              // <script>s -- which strip_tags supposedly has problems with
85*f82bbc90SS.Chekanov        '',                              // <style>s -- which strip_tags supposedly has problems with
86*f82bbc90SS.Chekanov        '_\\1_',                         // <i>
87*f82bbc90SS.Chekanov        '_\\1_',                         // <em>
88*f82bbc90SS.Chekanov        '_\\1_',                         // <ins>
89*f82bbc90SS.Chekanov        "\n\n",                          // <ul> and </ul>
90*f82bbc90SS.Chekanov        "\n\n",                          // <ol> and </ol>
91*f82bbc90SS.Chekanov        "\n\n",                          // <dl> and </dl>
92*f82bbc90SS.Chekanov        "\t* \\1\n",                     // <li> and </li>
93*f82bbc90SS.Chekanov        " \\1\n",                        // <dd> and </dd>
94*f82bbc90SS.Chekanov        "\t* \\1",                       // <dt> and </dt>
95*f82bbc90SS.Chekanov        "\n\t* ",                        // <li>
96*f82bbc90SS.Chekanov        "\n-------------------------\n", // <hr>
97*f82bbc90SS.Chekanov        "<div>\n",                       // <div>
98*f82bbc90SS.Chekanov        "\n\n",                          // <table> and </table>
99*f82bbc90SS.Chekanov        "\n",                            // <tr> and </tr>
100*f82bbc90SS.Chekanov        "\t\t\\1\n",                     // <td> and </td>
101*f82bbc90SS.Chekanov        "",                              // <span class="_html2text_ignore">...</span>
102*f82bbc90SS.Chekanov        '[\\2]',                         // <img> with alt tag
103*f82bbc90SS.Chekanov    );
104*f82bbc90SS.Chekanov
105*f82bbc90SS.Chekanov    /**
106*f82bbc90SS.Chekanov     * List of preg* regular expression patterns to search for,
107*f82bbc90SS.Chekanov     * used in conjunction with $entReplace.
108*f82bbc90SS.Chekanov     *
109*f82bbc90SS.Chekanov     * @var array $entSearch
110*f82bbc90SS.Chekanov     * @see $entReplace
111*f82bbc90SS.Chekanov     */
112*f82bbc90SS.Chekanov    protected $entSearch = array(
113*f82bbc90SS.Chekanov        '/&#153;/i',                                     // TM symbol in win-1252
114*f82bbc90SS.Chekanov        '/&#151;/i',                                     // m-dash in win-1252
115*f82bbc90SS.Chekanov        '/&(amp|#38);/i',                                // Ampersand: see converter()
116*f82bbc90SS.Chekanov        '/[ ]{2,}/',                                     // Runs of spaces, post-handling
117*f82bbc90SS.Chekanov        '/&#39;/i',                                      // The apostrophe symbol
118*f82bbc90SS.Chekanov    );
119*f82bbc90SS.Chekanov
120*f82bbc90SS.Chekanov    /**
121*f82bbc90SS.Chekanov     * List of pattern replacements corresponding to patterns searched.
122*f82bbc90SS.Chekanov     *
123*f82bbc90SS.Chekanov     * @var array $entReplace
124*f82bbc90SS.Chekanov     * @see $entSearch
125*f82bbc90SS.Chekanov     */
126*f82bbc90SS.Chekanov    protected $entReplace = array(
127*f82bbc90SS.Chekanov        '™',         // TM symbol
128*f82bbc90SS.Chekanov        '—',         // m-dash
129*f82bbc90SS.Chekanov        '|+|amp|+|', // Ampersand: see converter()
130*f82bbc90SS.Chekanov        ' ',         // Runs of spaces, post-handling
131*f82bbc90SS.Chekanov        '\'',        // Apostrophe
132*f82bbc90SS.Chekanov    );
133*f82bbc90SS.Chekanov
134*f82bbc90SS.Chekanov    /**
135*f82bbc90SS.Chekanov     * List of preg* regular expression patterns to search for
136*f82bbc90SS.Chekanov     * and replace using callback function.
137*f82bbc90SS.Chekanov     *
138*f82bbc90SS.Chekanov     * @var array $callbackSearch
139*f82bbc90SS.Chekanov     */
140*f82bbc90SS.Chekanov    protected $callbackSearch = array(
141*f82bbc90SS.Chekanov        '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i',           // h1 - h6
142*f82bbc90SS.Chekanov        '/[ ]*<(p)( [^>]*)?>(.*?)<\/p>[ ]*/si',                  // <p> with surrounding whitespace.
143*f82bbc90SS.Chekanov        '/<(br)[^>]*>[ ]*/i',                                    // <br> with leading whitespace after the newline.
144*f82bbc90SS.Chekanov        '/<(b)( [^>]*)?>(.*?)<\/b>/i',                           // <b>
145*f82bbc90SS.Chekanov        '/<(strong)( [^>]*)?>(.*?)<\/strong>/i',                 // <strong>
146*f82bbc90SS.Chekanov        '/<(del)( [^>]*)?>(.*?)<\/del>/i',                       // <del>
147*f82bbc90SS.Chekanov        '/<(th)( [^>]*)?>(.*?)<\/th>/i',                         // <th> and </th>
148*f82bbc90SS.Chekanov        '/<(a) [^>]*href=("|\')([^"\']+)\2([^>]*)>(.*?)<\/a>/i'  // <a href="">
149*f82bbc90SS.Chekanov    );
150*f82bbc90SS.Chekanov
151*f82bbc90SS.Chekanov    /**
152*f82bbc90SS.Chekanov     * List of preg* regular expression patterns to search for in PRE body,
153*f82bbc90SS.Chekanov     * used in conjunction with $preReplace.
154*f82bbc90SS.Chekanov     *
155*f82bbc90SS.Chekanov     * @var array $preSearch
156*f82bbc90SS.Chekanov     * @see $preReplace
157*f82bbc90SS.Chekanov     */
158*f82bbc90SS.Chekanov    protected $preSearch = array(
159*f82bbc90SS.Chekanov        "/\n/",
160*f82bbc90SS.Chekanov        "/\t/",
161*f82bbc90SS.Chekanov        '/ /',
162*f82bbc90SS.Chekanov        '/<pre[^>]*>/',
163*f82bbc90SS.Chekanov        '/<\/pre>/'
164*f82bbc90SS.Chekanov    );
165*f82bbc90SS.Chekanov
166*f82bbc90SS.Chekanov    /**
167*f82bbc90SS.Chekanov     * List of pattern replacements corresponding to patterns searched for PRE body.
168*f82bbc90SS.Chekanov     *
169*f82bbc90SS.Chekanov     * @var array $preReplace
170*f82bbc90SS.Chekanov     * @see $preSearch
171*f82bbc90SS.Chekanov     */
172*f82bbc90SS.Chekanov    protected $preReplace = array(
173*f82bbc90SS.Chekanov        '<br>',
174*f82bbc90SS.Chekanov        '&nbsp;&nbsp;&nbsp;&nbsp;',
175*f82bbc90SS.Chekanov        '&nbsp;',
176*f82bbc90SS.Chekanov        '',
177*f82bbc90SS.Chekanov        '',
178*f82bbc90SS.Chekanov    );
179*f82bbc90SS.Chekanov
180*f82bbc90SS.Chekanov    /**
181*f82bbc90SS.Chekanov     * Temporary workspace used during PRE processing.
182*f82bbc90SS.Chekanov     *
183*f82bbc90SS.Chekanov     * @var string $preContent
184*f82bbc90SS.Chekanov     */
185*f82bbc90SS.Chekanov    protected $preContent = '';
186*f82bbc90SS.Chekanov
187*f82bbc90SS.Chekanov    /**
188*f82bbc90SS.Chekanov     * Contains the base URL that relative links should resolve to.
189*f82bbc90SS.Chekanov     *
190*f82bbc90SS.Chekanov     * @var string $baseurl
191*f82bbc90SS.Chekanov     */
192*f82bbc90SS.Chekanov    protected $baseurl = '';
193*f82bbc90SS.Chekanov
194*f82bbc90SS.Chekanov    /**
195*f82bbc90SS.Chekanov     * Indicates whether content in the $html variable has been converted yet.
196*f82bbc90SS.Chekanov     *
197*f82bbc90SS.Chekanov     * @var boolean $converted
198*f82bbc90SS.Chekanov     * @see $html, $text
199*f82bbc90SS.Chekanov     */
200*f82bbc90SS.Chekanov    protected $converted = false;
201*f82bbc90SS.Chekanov
202*f82bbc90SS.Chekanov    /**
203*f82bbc90SS.Chekanov     * Contains URL addresses from links to be rendered in plain text.
204*f82bbc90SS.Chekanov     *
205*f82bbc90SS.Chekanov     * @var array $linkList
206*f82bbc90SS.Chekanov     * @see buildlinkList()
207*f82bbc90SS.Chekanov     */
208*f82bbc90SS.Chekanov    protected $linkList = array();
209*f82bbc90SS.Chekanov
210*f82bbc90SS.Chekanov    /**
211*f82bbc90SS.Chekanov     * Various configuration options (able to be set in the constructor)
212*f82bbc90SS.Chekanov     *
213*f82bbc90SS.Chekanov     * @var array $options
214*f82bbc90SS.Chekanov     */
215*f82bbc90SS.Chekanov    protected $options = array(
216*f82bbc90SS.Chekanov        'do_links' => 'inline', // 'none'
217*f82bbc90SS.Chekanov                                // 'inline' (show links inline)
218*f82bbc90SS.Chekanov                                // 'nextline' (show links on the next line)
                                // 'table' (list the link URLs in a table after the text)
220*f82bbc90SS.Chekanov                                // 'bbcode' (show links as bbcode)
221*f82bbc90SS.Chekanov
222*f82bbc90SS.Chekanov        'width' => 70,          //  Maximum width of the formatted text, in columns.
223*f82bbc90SS.Chekanov                                //  Set this value to 0 (or less) to ignore word wrapping
224*f82bbc90SS.Chekanov                                //  and not constrain text to a fixed-width column.
225*f82bbc90SS.Chekanov    );
226*f82bbc90SS.Chekanov
/**
 * Initialises the object from the legacy constructor signature.
 *
 * Invoked by __construct() when its second argument is not an array.
 *
 * @param string $html     Source HTML
 * @param bool   $fromFile Legacy flag; set_html() rejects any truthy value
 * @param array  $options  Configuration options, merged over the defaults
 */
private function legacyConstruct($html = '', $fromFile = false, array $options = array())
{
    // set_html() throws for a truthy $fromFile, before the options are touched.
    $this->set_html($html, $fromFile);

    $merged = array_merge($this->options, $options);
    $this->options = $merged;
}
232*f82bbc90SS.Chekanov
/**
 * Creates a converter for the given HTML.
 *
 * For backwards compatibility the second argument may also be the legacy
 * `$fromFile` flag, optionally followed by an options array; in that case
 * all arguments are forwarded to legacyConstruct().
 *
 * @param string $html    Source HTML
 * @param array  $options Set configuration options (merged over the defaults)
 *
 * @throws \InvalidArgumentException on the legacy path when `$fromFile` is truthy
 */
public function __construct($html = '', $options = array())
{
    // Assign the entity flags before branching: the legacy path used to leave
    // them unset, so html_entity_decode()/htmlspecialchars() later received null.
    // ENT_HTML5 is only evaluated on PHP >= 5.4, where the constant exists.
    $this->htmlFuncFlags = (PHP_VERSION_ID < 50400)
        ? ENT_QUOTES
        : ENT_QUOTES | ENT_HTML5;

    // for backwards compatibility
    if (!is_array($options)) {
        call_user_func_array(array($this, 'legacyConstruct'), func_get_args());

        return;
    }

    $this->html = $html;
    $this->options = array_merge($this->options, $options);
}
250*f82bbc90SS.Chekanov
/**
 * Returns the HTML source currently held by this converter, unmodified.
 *
 * @return string
 */
public function getHtml()
{
    return $this->html;
}
260*f82bbc90SS.Chekanov
/**
 * Replaces the source HTML and marks any previously converted text as stale,
 * so the next getText() call converts again.
 *
 * @param string $html HTML source content
 */
public function setHtml($html)
{
    $this->converted = false;
    $this->html = $html;
}
271*f82bbc90SS.Chekanov
/**
 * Legacy alias of setHtml().
 *
 * @deprecated Use setHtml() instead.
 *
 * @param string $html      HTML source content
 * @param bool   $from_file No longer supported; must be falsy
 *
 * @throws \InvalidArgumentException when $from_file is truthy
 */
public function set_html($html, $from_file = false)
{
    if (!$from_file) {
        return $this->setHtml($html);
    }

    throw new \InvalidArgumentException("Argument from_file no longer supported");
}
283*f82bbc90SS.Chekanov
/**
 * Returns the plain-text rendering of the source HTML.
 *
 * The conversion runs only when no up-to-date result exists; afterwards the
 * stored text is returned as-is.
 *
 * @return string Plain text
 */
public function getText()
{
    $needsConversion = !$this->converted;
    if ($needsConversion) {
        $this->convert();
    }

    return $this->text;
}
297*f82bbc90SS.Chekanov
/**
 * Legacy alias of getText().
 *
 * @deprecated Use getText() instead.
 *
 * @return string Plain text
 */
public function get_text()
{
    $plain = $this->getText();

    return $plain;
}
305*f82bbc90SS.Chekanov
/**
 * Writes the converted plain text to the output.
 *
 * @deprecated Output the result of getText() instead.
 */
public function print_text()
{
    echo $this->getText();
}
313*f82bbc90SS.Chekanov
/**
 * Legacy shorthand for print_text().
 *
 * @deprecated Output the result of getText() instead.
 */
public function p()
{
    // print_text() has no return value; the return is kept for parity with the legacy API.
    return $this->print_text();
}
321*f82bbc90SS.Chekanov
/**
 * Stores the base URL that buildlinkList() prepends to links without a scheme.
 *
 * @param string $baseurl
 */
public function setBaseUrl($baseurl)
{
    $this->baseurl = $baseurl;
}
331*f82bbc90SS.Chekanov
/**
 * Legacy alias of setBaseUrl().
 *
 * @deprecated Use setBaseUrl() instead.
 *
 * @param string $baseurl
 */
public function set_base_url($baseurl)
{
    // setBaseUrl() has no return value; the return is kept for parity with the legacy API.
    return $this->setBaseUrl($baseurl);
}
339*f82bbc90SS.Chekanov
/**
 * Runs the conversion with the mbstring internal encoding forced to
 * self::ENCODING, then restores whatever encoding was active before.
 *
 * The previous encoding is restored even when the conversion throws, so a
 * failure here cannot leak a changed global encoding to the rest of the
 * process. The original exception is re-thrown unchanged.
 *
 * @throws \Exception|\Throwable whatever doConvert() throws
 */
protected function convert()
{
    $origEncoding = mb_internal_encoding();
    mb_internal_encoding(self::ENCODING);

    // Catch-and-rethrow instead of "finally" so the code still parses on
    // PHP versions older than 5.5. The \Throwable clause covers PHP 7+
    // engine errors; on PHP 5 the class does not exist and the clause never matches.
    $failure = null;
    try {
        $this->doConvert();
    } catch (\Exception $e) {
        $failure = $e;
    } catch (\Throwable $e) {
        $failure = $e;
    }

    mb_internal_encoding($origEncoding);

    if ($failure !== null) {
        throw $failure;
    }
}
349*f82bbc90SS.Chekanov
/**
 * Converts $this->html and stores the result in $this->text.
 *
 * Resets the collected link list, converts the trimmed HTML, appends a
 * numbered "Links:" section when any link URLs were collected, and finally
 * flags the object as converted.
 */
protected function doConvert()
{
    $this->linkList = array();

    $plain = trim($this->html);
    $this->converter($plain);

    if (count($this->linkList) > 0) {
        $plain .= "\n\nLinks:\n------\n";
        foreach ($this->linkList as $position => $target) {
            // Entries are numbered from 1, matching the "[n]" markers in the text.
            $plain .= '[' . ($position + 1) . '] ' . $target . "\n";
        }
    }

    $this->text = $plain;
    $this->converted = true;
}
369*f82bbc90SS.Chekanov
/**
 * Converts an HTML fragment to plain text in place.
 *
 * The steps are order-dependent: blockquotes and PRE blocks first, then the
 * static tag replacements, the callback-driven tags, tag stripping, entity
 * handling, blank-line normalisation and finally optional word wrapping.
 *
 * @param string &$text HTML on input, plain text on output
 */
protected function converter(&$text)
{
    // Structural elements that need their own handling.
    $this->convertBlockquotes($text);
    $this->convertPre($text);

    // Static replacements, then the tags that need a callback.
    $text = preg_replace($this->search, $this->replace, $text);
    $text = preg_replace_callback($this->callbackSearch, array($this, 'pregCallback'), $text);

    // Drop the remaining markup, then deal with entities.
    $text = preg_replace($this->entSearch, $this->entReplace, strip_tags($text));
    $text = html_entity_decode($text, $this->htmlFuncFlags, self::ENCODING);

    // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
    $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);

    // The ampersand placeholder is only expanded now, after unknown entities
    // were removed, so that input such as "&amp;quot;" survives as "&quot;".
    $text = str_replace('|+|amp|+|', '&', $text);

    // Normalise empty lines: whitespace-only lines first, then runs of newlines.
    $text = preg_replace("/[\n]{3,}/", "\n\n", preg_replace("/\n\s+\n/", "\n\n", $text));

    // Remove leading empty lines (can be produced by e.g. a P tag at the beginning).
    $text = ltrim($text, "\n");

    $wrapWidth = $this->options['width'];
    if ($wrapWidth > 0) {
        $text = wordwrap($text, $wrapWidth);
    }
}
398*f82bbc90SS.Chekanov
/**
 * Renders one link according to the active link method.
 *
 * Called from pregCallback() for every matched anchor. Links starting with
 * "javascript:", "mailto:" or "#" are reduced to their display text. Links
 * without a scheme are prefixed with the base URL. In 'table' mode the URL
 * is recorded in $this->linkList (once per distinct URL) and the display
 * text receives a 1-based "[n]" marker.
 *
 * @param  string      $link         URL of the link
 * @param  string      $display      Text the link is attached to
 * @param  string|null $linkOverride Link method to use instead of the 'do_links' option
 * @return string
 */
protected function buildlinkList($link, $display, $linkOverride = null)
{
    $linkMethod = $linkOverride ?: $this->options['do_links'];
    if ($linkMethod == 'none') {
        return $display;
    }

    // Ignored link types (checked on the entity-decoded link)
    $decodedLink = html_entity_decode($link, $this->htmlFuncFlags, self::ENCODING);
    if (preg_match('!^(javascript:|mailto:|#)!i', $decodedLink)) {
        return $display;
    }

    if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
        // The link carries its own scheme: use it unchanged.
        $url = $link;
    } else {
        // No scheme: resolve against the base URL, inserting a slash when
        // the link does not already start with one.
        $separator = (mb_substr($link, 0, 1) != '/') ? '/' : '';
        $url = $this->baseurl . $separator . $link;
    }

    switch ($linkMethod) {
        case 'table':
            $index = array_search($url, $this->linkList);
            if ($index === false) {
                $index = count($this->linkList);
                $this->linkList[] = $url;
            }

            return $display . ' [' . ($index + 1) . ']';

        case 'nextline':
            return ($url === $display)
                ? $display
                : $display . "\n[" . $url . ']';

        case 'bbcode':
            return sprintf('[url=%s]%s[/url]', $url, $display);

        default: // every other value is treated as 'inline'
            return ($url === $display)
                ? $display
                : $display . ' [' . $url . ']';
    }
}
455*f82bbc90SS.Chekanov
/**
 * Rewrites every PRE element in the text, one per loop pass.
 *
 * For each element the inner content has BR tags turned into newlines, is
 * run through the callback-driven tag handlers, and is then encoded with
 * the $preSearch/$preReplace pairs and wrapped in "<div><br>…<br></div>".
 * The element is finally swapped for that content via pregPreCallback().
 *
 * @param string &$text HTML content, modified in place
 */
protected function convertPre(&$text)
{
    // Each pass handles the first remaining PRE element.
    while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $found)) {
        // BR tags become newlines first, so the callback replacements below
        // cannot swallow the whitespace that follows them.
        $this->preContent = preg_replace('/(<br\b[^>]*>)/i', "\n", $found[1]);

        // Apply the callback-driven tag handling to the PRE body.
        $this->preContent = preg_replace_callback(
            $this->callbackSearch,
            array($this, 'pregCallback'),
            $this->preContent
        );

        // Encode the body with the PRE replacement pairs and wrap it.
        $encoded = preg_replace($this->preSearch, $this->preReplace, $this->preContent);
        $this->preContent = '<div><br>' . $encoded . '<br></div>';

        // A callback is used for the substitution because the content may
        // contain "$0"-style sequences that a plain replacement would expand.
        $text = preg_replace_callback(
            '/<pre[^>]*>.*<\/pre>/ismU',
            array($this, 'pregPreCallback'),
            $text,
            1
        );

        // Release the workspace.
        $this->preContent = '';
    }
}
493*f82bbc90SS.Chekanov
/**
 * Converts every top-level BLOCKQUOTE element into a quoted PRE block.
 *
 * Tag positions are collected once from the unmodified text. Each outermost
 * blockquote body is converted with converter() at a width reduced by two,
 * every line is prefixed with "> ", and the escaped result is wrapped in
 * <pre> tags and spliced back into $text. Nested blockquotes are left in the
 * body for the recursive converter() call. Unmatched closing tags are ignored.
 *
 * The mb_* calls use the internal encoding, which convert() sets to
 * self::ENCODING for the duration of the conversion.
 *
 * @param string &$text HTML content, modified in place
 */
protected function convertBlockquotes(&$text)
{
    if (!preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
        return;
    }

    $originalText = $text;
    $start = 0;   // character offset of the current outermost opening tag
    $taglen = 0;  // character length of that opening tag
    $level = 0;   // current nesting depth
    $diff = 0;    // characters by which $text has shrunk against $originalText so far

    foreach ($matches[0] as $match) {
        $tag = $match[0];
        // PREG_OFFSET_CAPTURE reports byte offsets; convert to a character offset.
        $position = mb_strlen(substr($originalText, 0, $match[1]));

        $isClosingTag = ($tag[0] == '<' && $tag[1] == '/');
        if (!$isClosingTag) {
            if ($level == 0) {
                $start = $position;
                $taglen = mb_strlen($tag);
            }
            $level++;
            continue;
        }

        $level--;
        if ($level < 0) {
            $level = 0; // malformed HTML: go to next blockquote
            continue;
        }
        if ($level > 0) {
            continue;   // inner blockquote: handled when its parent is converted
        }

        $end = $position;
        $closeLen = mb_strlen($tag);
        $len = $end - $taglen - $start;

        // Get blockquote content
        $body = mb_substr($text, $start + $taglen - $diff, $len);

        // Narrow the wrap width while converting the quoted content.
        $pWidth = $this->options['width'];
        if ($this->options['width'] > 0) {
            $this->options['width'] -= 2;
        }

        $body = trim($body);
        $this->converter($body);

        // Add citation markers and create PRE block
        $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
        $body = '<pre>' . htmlspecialchars($body, $this->htmlFuncFlags, self::ENCODING) . '</pre>';

        // Re-set text width
        $this->options['width'] = $pWidth;

        // Replace the whole element (opening tag, body, closing tag).
        $text = mb_substr($text, 0, $start - $diff)
            . $body
            . mb_substr($text, $end + $closeLen - $diff);

        $diff += $len + $taglen + $closeLen - mb_strlen($body);
    }
}
550*f82bbc90SS.Chekanov
/**
 * Callback for preg_replace_callback() over the $callbackSearch patterns.
 *
 * Dispatches on the lower-cased tag name captured in $matches[1]:
 * paragraphs and line breaks become newlines, b/strong/th/h content is
 * passed to toupper(), del content to tostrike(), and anchors to
 * buildlinkList(). Any other tag name yields an empty string.
 *
 * @param  array  $matches PREG matches
 * @return string
 */
protected function pregCallback($matches)
{
    $tag = mb_strtolower($matches[1]);

    if ($tag == 'p') {
        // Collapse newlines to spaces, trim, and surround with single newlines.
        $para = trim(str_replace("\n", " ", $matches[3]));

        return "\n" . $para . "\n";
    }

    if ($tag == 'br') {
        return "\n";
    }

    if ($tag == 'b' || $tag == 'strong') {
        return $this->toupper($matches[3]);
    }

    if ($tag == 'del') {
        return $this->tostrike($matches[3]);
    }

    if ($tag == 'th') {
        return $this->toupper("\t\t" . $matches[3] . "\n");
    }

    if ($tag == 'h') {
        return $this->toupper("\n\n" . $matches[3] . "\n\n");
    }

    if ($tag == 'a') {
        // A "_html2text_link_<method>" marker in the attributes after href
        // overrides the link method for this anchor.
        $linkOverride = null;
        if (preg_match('/_html2text_link_(\w+)/', $matches[4], $linkOverrideMatch)) {
            $linkOverride = $linkOverrideMatch[1];
        }

        // Remove spaces in URL (#1487805)
        $url = str_replace(' ', '', $matches[3]);

        return $this->buildlinkList($url, $matches[5], $linkOverride);
    }

    return '';
}
595*f82bbc90SS.Chekanov
596*f82bbc90SS.Chekanov    /**
597*f82bbc90SS.Chekanov     * Callback function for preg_replace_callback use in PRE content handler.
598*f82bbc90SS.Chekanov     *
599*f82bbc90SS.Chekanov     * @param  array  $matches PREG matches
600*f82bbc90SS.Chekanov     * @return string
601*f82bbc90SS.Chekanov     */
602*f82bbc90SS.Chekanov    protected function pregPreCallback(/** @noinspection PhpUnusedParameterInspection */ $matches)
603*f82bbc90SS.Chekanov    {
604*f82bbc90SS.Chekanov        return $this->preContent;
605*f82bbc90SS.Chekanov    }
606*f82bbc90SS.Chekanov
607*f82bbc90SS.Chekanov    /**
608*f82bbc90SS.Chekanov     * Strtoupper function with HTML tags and entities handling.
609*f82bbc90SS.Chekanov     *
610*f82bbc90SS.Chekanov     * @param  string $str Text to convert
611*f82bbc90SS.Chekanov     * @return string Converted text
612*f82bbc90SS.Chekanov     */
613*f82bbc90SS.Chekanov    protected function toupper($str)
614*f82bbc90SS.Chekanov    {
615*f82bbc90SS.Chekanov        // string can contain HTML tags
616*f82bbc90SS.Chekanov        $chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
617*f82bbc90SS.Chekanov
618*f82bbc90SS.Chekanov        // convert toupper only the text between HTML tags
619*f82bbc90SS.Chekanov        foreach ($chunks as $i => $chunk) {
620*f82bbc90SS.Chekanov            if ($chunk[0] != '<') {
621*f82bbc90SS.Chekanov                $chunks[$i] = $this->strtoupper($chunk);
622*f82bbc90SS.Chekanov            }
623*f82bbc90SS.Chekanov        }
624*f82bbc90SS.Chekanov
625*f82bbc90SS.Chekanov        return implode($chunks);
626*f82bbc90SS.Chekanov    }
627*f82bbc90SS.Chekanov
628*f82bbc90SS.Chekanov    /**
629*f82bbc90SS.Chekanov     * Strtoupper multibyte wrapper function with HTML entities handling.
630*f82bbc90SS.Chekanov     *
631*f82bbc90SS.Chekanov     * @param  string $str Text to convert
632*f82bbc90SS.Chekanov     * @return string Converted text
633*f82bbc90SS.Chekanov     */
634*f82bbc90SS.Chekanov    protected function strtoupper($str)
635*f82bbc90SS.Chekanov    {
636*f82bbc90SS.Chekanov        $str = html_entity_decode($str, $this->htmlFuncFlags, self::ENCODING);
637*f82bbc90SS.Chekanov        $str = mb_strtoupper($str);
638*f82bbc90SS.Chekanov        $str = htmlspecialchars($str, $this->htmlFuncFlags, self::ENCODING);
639*f82bbc90SS.Chekanov
640*f82bbc90SS.Chekanov        return $str;
641*f82bbc90SS.Chekanov    }
642*f82bbc90SS.Chekanov
643*f82bbc90SS.Chekanov    /**
644*f82bbc90SS.Chekanov     * Helper function for DEL conversion.
645*f82bbc90SS.Chekanov     *
646*f82bbc90SS.Chekanov     * @param  string $text HTML content
647*f82bbc90SS.Chekanov     * @return string Converted text
648*f82bbc90SS.Chekanov     */
649*f82bbc90SS.Chekanov    protected function tostrike($str)
650*f82bbc90SS.Chekanov    {
651*f82bbc90SS.Chekanov        $rtn = '';
652*f82bbc90SS.Chekanov        for ($i = 0; $i < mb_strlen($str); $i++) {
653*f82bbc90SS.Chekanov            $chr = mb_substr($str, $i, 1);
654*f82bbc90SS.Chekanov            $combiningChr = chr(0xC0 | 0x336 >> 6). chr(0x80 | 0x336 & 0x3F);
655*f82bbc90SS.Chekanov            $rtn .= $chr . $combiningChr;
656*f82bbc90SS.Chekanov        }
657*f82bbc90SS.Chekanov        return $rtn;
658*f82bbc90SS.Chekanov    }
659*f82bbc90SS.Chekanov}
660*f82bbc90SS.Chekanov
661