xref: /plugin/zwidoku/Html2Text.php (revision f82bbc904bd835fc66a3f52ffaef251433904ec2)
1<?php
2
3/*
4 * Copyright (c) 2005-2007 Jon Abernathy <jon@chuggnutt.com>
5 *
6 * This script is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * The GNU General Public License can be found at
12 * http://www.gnu.org/copyleft/gpl.html.
13 *
14 * This script is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 */
19
20namespace Html2Text;
21
22class Html2Text
23{
    /**
     * Character encoding used for all multibyte and entity handling in this
     * class (mb_internal_encoding(), html_entity_decode(), htmlspecialchars()).
     */
    const ENCODING = 'UTF-8';

    /**
     * Flags passed to html_entity_decode() and htmlspecialchars().
     *
     * Assigned in the constructor: ENT_QUOTES, combined with ENT_HTML5 when
     * running on PHP 5.4 or later.
     *
     * @var int $htmlFuncFlags
     */
    protected $htmlFuncFlags;

    /**
     * Contains the HTML content to convert.
     *
     * @var string $html
     */
    protected $html;

    /**
     * Contains the converted, formatted text.
     *
     * Only populated once doConvert() has run; read through getText().
     *
     * @var string $text
     */
    protected $text;
41
    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $replace.
     *
     * The two arrays are paired by position: the pattern at index N here is
     * replaced with the entry at index N of $replace, so both lists must be
     * kept in the same order. converter() applies them in a single
     * preg_replace() call, which processes the patterns in array order.
     *
     * @var array $search
     * @see $replace
     */
    protected $search = array(
        "/\r/",                                           // Non-legal carriage return
        "/[\n\t]+/",                                      // Newlines and tabs
        '/<head\b[^>]*>.*?<\/head>/i',                    // <head>
        '/<script\b[^>]*>.*?<\/script>/i',                // <script>s -- which strip_tags supposedly has problems with
        '/<style\b[^>]*>.*?<\/style>/i',                  // <style>s -- which strip_tags supposedly has problems with
        '/<i\b[^>]*>(.*?)<\/i>/i',                        // <i>
        '/<em\b[^>]*>(.*?)<\/em>/i',                      // <em>
        '/<ins\b[^>]*>(.*?)<\/ins>/i',                    // <ins>
        '/(<ul\b[^>]*>|<\/ul>)/i',                        // <ul> and </ul>
        '/(<ol\b[^>]*>|<\/ol>)/i',                        // <ol> and </ol>
        '/(<dl\b[^>]*>|<\/dl>)/i',                        // <dl> and </dl>
        '/<li\b[^>]*>(.*?)<\/li>/i',                      // <li>...</li> pair
        '/<dd\b[^>]*>(.*?)<\/dd>/i',                      // <dd>...</dd> pair
        '/<dt\b[^>]*>(.*?)<\/dt>/i',                      // <dt>...</dt> pair
        '/<li\b[^>]*>/i',                                 // <li> left over after the paired pattern above
        '/<hr\b[^>]*>/i',                                 // <hr>
        '/<div\b[^>]*>/i',                                // <div>
        '/(<table\b[^>]*>|<\/table>)/i',                  // <table> and </table>
        '/(<tr\b[^>]*>|<\/tr>)/i',                        // <tr> and </tr>
        '/<td\b[^>]*>(.*?)<\/td>/i',                      // <td>...</td> pair
        '/<span class="_html2text_ignore">.+?<\/span>/i', // <span class="_html2text_ignore">...</span>
        '/<(img)\b[^>]*alt=\"([^>"]+)\"[^>]*>/i',         // <img> with alt tag
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * Entries are matched to $search by position; "\\1" / "\\2" refer to the
     * capture groups of the pattern at the same index.
     *
     * @var array $replace
     * @see $search
     */
    protected $replace = array(
        '',                              // Non-legal carriage return
        ' ',                             // Newlines and tabs
        '',                              // <head>
        '',                              // <script>s -- which strip_tags supposedly has problems with
        '',                              // <style>s -- which strip_tags supposedly has problems with
        '_\\1_',                         // <i>
        '_\\1_',                         // <em>
        '_\\1_',                         // <ins>
        "\n\n",                          // <ul> and </ul>
        "\n\n",                          // <ol> and </ol>
        "\n\n",                          // <dl> and </dl>
        "\t* \\1\n",                     // <li>...</li> pair
        " \\1\n",                        // <dd>...</dd> pair
        "\t* \\1",                       // <dt>...</dt> pair
        "\n\t* ",                        // <li> left over after the paired pattern above
        "\n-------------------------\n", // <hr>
        "<div>\n",                       // <div>: attributes dropped, newline added; strip_tags() in converter() removes the tag later
        "\n\n",                          // <table> and </table>
        "\n",                            // <tr> and </tr>
        "\t\t\\1\n",                     // <td>...</td> pair
        "",                              // <span class="_html2text_ignore">...</span>
        '[\\2]',                         // <img> with alt tag: replaced by the alt text in brackets
    );
104
    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $entReplace.
     *
     * Applied by converter() after strip_tags() and before
     * html_entity_decode(). Paired with $entReplace by position.
     *
     * @var array $entSearch
     * @see $entReplace
     */
    protected $entSearch = array(
        '/&#153;/i',                                     // TM symbol in win-1252
        '/&#151;/i',                                     // m-dash in win-1252
        '/&(amp|#38);/i',                                // Ampersand: see converter()
        '/[ ]{2,}/',                                     // Runs of spaces, post-handling
        '/&#39;/i',                                      // The apostrophe symbol
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * The ampersand is swapped for the placeholder "|+|amp|+|" rather than a
     * literal "&"; converter() turns the placeholder back into "&" only after
     * entity decoding and unknown-entity removal have run.
     *
     * @var array $entReplace
     * @see $entSearch
     */
    protected $entReplace = array(
        '™',         // TM symbol
        '—',         // m-dash
        '|+|amp|+|', // Ampersand: see converter()
        ' ',         // Runs of spaces, post-handling
        '\'',        // Apostrophe
    );
133
    /**
     * List of preg* regular expression patterns to search for
     * and replace using callback function.
     *
     * Every pattern captures the tag name as group 1; pregCallback() lower-cases
     * that group and dispatches on it. For all patterns except <a> and <br>,
     * group 2 holds the tag attributes and group 3 the element content. For <a>,
     * group 2 is the quote character, group 3 the href value, group 4 the
     * remaining attributes and group 5 the link text.
     *
     * @var array $callbackSearch
     */
    protected $callbackSearch = array(
        '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i',           // h1 - h6
        '/[ ]*<(p)( [^>]*)?>(.*?)<\/p>[ ]*/si',                  // <p> with surrounding whitespace.
        '/<(br)[^>]*>[ ]*/i',                                    // <br> with leading whitespace after the newline.
        '/<(b)( [^>]*)?>(.*?)<\/b>/i',                           // <b>
        '/<(strong)( [^>]*)?>(.*?)<\/strong>/i',                 // <strong>
        '/<(del)( [^>]*)?>(.*?)<\/del>/i',                       // <del>
        '/<(th)( [^>]*)?>(.*?)<\/th>/i',                         // <th> and </th>
        '/<(a) [^>]*href=("|\')([^"\']+)\2([^>]*)>(.*?)<\/a>/i'  // <a href="">
    );
150
    /**
     * List of preg* regular expression patterns to search for in PRE body,
     * used in conjunction with $preReplace.
     *
     * Paired with $preReplace by position. convertPre() uses these to turn
     * the whitespace of a PRE body into markup (<br>, &nbsp;) before the
     * main conversion runs.
     *
     * @var array $preSearch
     * @see $preReplace
     */
    protected $preSearch = array(
        "/\n/",         // Newline
        "/\t/",         // Tab
        '/ /',          // Single space
        '/<pre[^>]*>/', // Opening <pre> tag found inside the body
        '/<\/pre>/'     // Closing </pre> tag found inside the body
    );

    /**
     * List of pattern replacements corresponding to patterns searched for PRE body.
     *
     * @var array $preReplace
     * @see $preSearch
     */
    protected $preReplace = array(
        '<br>',                     // Newline
        '&nbsp;&nbsp;&nbsp;&nbsp;', // Tab becomes four non-breaking spaces
        '&nbsp;',                   // Single space
        '',                         // Opening <pre> tag is dropped
        '',                         // Closing </pre> tag is dropped
    );
179
    /**
     * Temporary workspace used during PRE processing.
     *
     * Filled by convertPre() for the PRE block currently being handled,
     * returned by pregPreCallback() as the replacement, then reset to ''.
     *
     * @var string $preContent
     */
    protected $preContent = '';

    /**
     * Contains the base URL that relative links should resolve to.
     *
     * buildlinkList() prepends it to any link that does not start with a
     * URL scheme.
     *
     * @var string $baseurl
     */
    protected $baseurl = '';

    /**
     * Indicates whether content in the $html variable has been converted yet.
     *
     * Set to true by doConvert() and reset to false by setHtml().
     *
     * @var boolean $converted
     * @see $html, $text
     */
    protected $converted = false;

    /**
     * Contains URL addresses from links to be rendered in plain text.
     *
     * Only filled when links are rendered with the 'table' method; doConvert()
     * appends the list to the end of the text.
     *
     * @var array $linkList
     * @see buildlinkList()
     */
    protected $linkList = array();
209
    /**
     * Various configuration options (able to be set in the constructor)
     *
     * Values passed to the constructor are merged over these defaults.
     *
     * @var array $options
     */
    protected $options = array(
        'do_links' => 'inline', // 'none'
                                // 'inline' (show links inline)
                                // 'nextline' (show links on the next line)
                                // 'table' (if a table of link URLs should be listed after the text)
                                // 'bbcode' (show links as bbcode)

        'width' => 70,          //  Maximum width of the formatted text, in columns.
                                //  Set this value to 0 (or less) to ignore word wrapping
                                //  and not constrain text to a fixed-width column.
    );
226
    /**
     * Handles the legacy constructor signature ($html, $fromFile, $options).
     *
     * Invoked by __construct() when its second argument is not an array.
     *
     * @param string  $html     Source HTML
     * @param boolean $fromFile Forwarded to set_html(), which throws when it is truthy
     * @param array   $options  Configuration options merged over the defaults
     */
    private function legacyConstruct($html = '', $fromFile = false, array $options = array())
    {
        $this->set_html($html, $fromFile);
        $this->options = array_merge($this->options, $options);
    }
232
233    /**
234     * @param string $html    Source HTML
235     * @param array  $options Set configuration options
236     */
237    public function __construct($html = '', $options = array())
238    {
239        // for backwards compatibility
240        if (!is_array($options)) {
241            return call_user_func_array(array($this, 'legacyConstruct'), func_get_args());
242        }
243
244        $this->html = $html;
245        $this->options = array_merge($this->options, $options);
246        $this->htmlFuncFlags = (PHP_VERSION_ID < 50400)
247            ? ENT_QUOTES
248            : ENT_QUOTES | ENT_HTML5;
249    }
250
    /**
     * Get the source HTML
     *
     * Returns the HTML exactly as it was last supplied; conversion does not
     * modify it.
     *
     * @return string
     */
    public function getHtml()
    {
        return $this->html;
    }
260
    /**
     * Set the source HTML
     *
     * Also clears the converted flag so the next getText() call converts the
     * new content instead of returning the previous result.
     *
     * @param string $html HTML source content
     */
    public function setHtml($html)
    {
        $this->html = $html;
        $this->converted = false;
    }
271
    /**
     * Legacy alias of setHtml().
     *
     * @deprecated Use setHtml() instead.
     *
     * @param string  $html      HTML source content
     * @param boolean $from_file No longer supported; must be falsy
     *
     * @throws \InvalidArgumentException When $from_file is truthy
     */
    public function set_html($html, $from_file = false)
    {
        if ($from_file) {
            throw new \InvalidArgumentException("Argument from_file no longer supported");
        }

        return $this->setHtml($html);
    }
283
    /**
     * Returns the text, converted from HTML.
     *
     * The conversion runs on first use and the result is reused until
     * setHtml() supplies new content.
     *
     * @return string Plain text
     */
    public function getText()
    {
        // Convert lazily: only when no result exists for the current HTML.
        if (!$this->converted) {
            $this->convert();
        }

        return $this->text;
    }
297
    /**
     * Legacy alias of getText().
     *
     * @deprecated Use getText() instead.
     *
     * @return string Plain text
     */
    public function get_text()
    {
        return $this->getText();
    }
305
    /**
     * Prints the converted text to the output.
     *
     * @deprecated Use getText() and output the result in the caller.
     */
    public function print_text()
    {
        print $this->getText();
    }
313
    /**
     * Legacy short alias of print_text().
     *
     * @deprecated Use getText() and output the result in the caller.
     */
    public function p()
    {
        return $this->print_text();
    }
321
    /**
     * Sets a base URL to handle relative links.
     *
     * The value is prepended by buildlinkList() to every link that does not
     * start with a URL scheme.
     *
     * @param string $baseurl
     */
    public function setBaseUrl($baseurl)
    {
        $this->baseurl = $baseurl;
    }
331
    /**
     * Legacy alias of setBaseUrl().
     *
     * @deprecated Use setBaseUrl() instead.
     *
     * @param string $baseurl
     */
    public function set_base_url($baseurl)
    {
        return $this->setBaseUrl($baseurl);
    }
339
340    protected function convert()
341    {
342       $origEncoding = mb_internal_encoding();
343       mb_internal_encoding(self::ENCODING);
344
345       $this->doConvert();
346
347       mb_internal_encoding($origEncoding);
348    }
349
350    protected function doConvert()
351    {
352        $this->linkList = array();
353
354        $text = trim($this->html);
355
356        $this->converter($text);
357
358        if ($this->linkList) {
359            $text .= "\n\nLinks:\n------\n";
360            foreach ($this->linkList as $i => $url) {
361                $text .= '[' . ($i + 1) . '] ' . $url . "\n";
362            }
363        }
364
365        $this->text = $text;
366
367        $this->converted = true;
368    }
369
    /**
     * Converts an HTML fragment to plain text, in place.
     *
     * The steps are order-dependent: structural rewrites first, then tag
     * stripping, then entity handling, then whitespace normalisation and
     * wrapping. convertBlockquotes() calls this method again for the body of
     * each blockquote.
     *
     * @param string &$text HTML on entry, plain text on return
     */
    protected function converter(&$text)
    {
        // Blockquotes become PRE blocks, so they must run before convertPre().
        $this->convertBlockquotes($text);
        $this->convertPre($text);
        // Simple tag-to-text substitutions, paired by index.
        $text = preg_replace($this->search, $this->replace, $text);
        // Tags whose replacement needs logic (headings, links, bold, ...).
        $text = preg_replace_callback($this->callbackSearch, array($this, 'pregCallback'), $text);
        $text = strip_tags($text);
        // Special-cased entities; "&amp;" becomes a placeholder here (see below).
        $text = preg_replace($this->entSearch, $this->entReplace, $text);
        $text = html_entity_decode($text, $this->htmlFuncFlags, self::ENCODING);

        // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
        $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);

        // Convert "|+|amp|+|" into "&", need to be done after handling of unknown entities
        // This properly handles situation of "&amp;quot;" in input string
        $text = str_replace('|+|amp|+|', '&', $text);

        // Normalise empty lines
        $text = preg_replace("/\n\s+\n/", "\n\n", $text);
        $text = preg_replace("/[\n]{3,}/", "\n\n", $text);

        // remove leading empty lines (can be produced by eg. P tag on the beginning)
        $text = ltrim($text, "\n");

        // NOTE(review): wordwrap() measures bytes, not characters, so lines
        // containing multibyte text may wrap earlier than the configured width.
        if ($this->options['width'] > 0) {
            $text = wordwrap($text, $this->options['width']);
        }
    }
398
399    /**
400     * Helper function called by preg_replace() on link replacement.
401     *
402     * Maintains an internal list of links to be displayed at the end of the
403     * text, with numeric indices to the original point in the text they
404     * appeared. Also makes an effort at identifying and handling absolute
405     * and relative links.
406     *
407     * @param  string $link          URL of the link
408     * @param  string $display       Part of the text to associate number with
409     * @param  null   $linkOverride
410     * @return string
411     */
412    protected function buildlinkList($link, $display, $linkOverride = null)
413    {
414        $linkMethod = ($linkOverride) ? $linkOverride : $this->options['do_links'];
415        if ($linkMethod == 'none') {
416            return $display;
417        }
418
419        // Ignored link types
420        if (preg_match('!^(javascript:|mailto:|#)!i', html_entity_decode($link, $this->htmlFuncFlags, self::ENCODING))) {
421            return $display;
422        }
423
424        if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
425            $url = $link;
426        } else {
427            $url = $this->baseurl;
428            if (mb_substr($link, 0, 1) != '/') {
429                $url .= '/';
430            }
431            $url .= $link;
432        }
433
434        if ($linkMethod == 'table') {
435            if (($index = array_search($url, $this->linkList)) === false) {
436                $index = count($this->linkList);
437                $this->linkList[] = $url;
438            }
439
440            return $display . ' [' . ($index + 1) . ']';
441        } elseif ($linkMethod == 'nextline') {
442            if ($url === $display) {
443                return $display;
444            }
445            return $display . "\n[" . $url . ']';
446        } elseif ($linkMethod == 'bbcode') {
447            return sprintf('[url=%s]%s[/url]', $url, $display);
448        } else { // link_method defaults to inline
449            if ($url === $display) {
450                return $display;
451            }
452            return $display . ' [' . $url . ']';
453        }
454    }
455
    /**
     * Helper function for PRE body conversion.
     *
     * Replaces each PRE element, one at a time, with a DIV whose whitespace
     * has been turned into markup (<br>, &nbsp;) so that the later
     * whitespace-collapsing steps of converter() do not destroy the layout.
     *
     * @param string &$text HTML content
     */
    protected function convertPre(&$text)
    {
        // get the content of PRE element
        // (the U modifier makes ".*" non-greedy, so the first closing tag ends the match)
        while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
            // Replace br tags with newlines to prevent the search-and-replace callback from killing whitespace
            $this->preContent = preg_replace('/(<br\b[^>]*>)/i', "\n", $matches[1]);

            // Run our defined tags search-and-replace with callback
            $this->preContent = preg_replace_callback(
                $this->callbackSearch,
                array($this, 'pregCallback'),
                $this->preContent
            );

            // convert the content
            $this->preContent = sprintf(
                '<div><br>%s<br></div>',
                preg_replace($this->preSearch, $this->preReplace, $this->preContent)
            );

            // replace the content (use callback because content can contain $0 variable)
            // The limit of 1 swaps only the PRE element matched above; the
            // loop condition then looks for the next one.
            $text = preg_replace_callback(
                '/<pre[^>]*>.*<\/pre>/ismU',
                array($this, 'pregPreCallback'),
                $text,
                1
            );

            // free memory
            $this->preContent = '';
        }
    }
493
    /**
     * Helper function for BLOCKQUOTE body conversion.
     *
     * Finds every top-level blockquote (nested ones are handled by the
     * recursive converter() call on the body), converts its content to text,
     * prefixes each line with "> " and puts the result back as an escaped PRE
     * block so that convertPre() preserves the layout.
     *
     * @param string &$text HTML content
     */
    protected function convertBlockquotes(&$text)
    {
        if (preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
            $originalText = $text;
            $start = 0;   // character offset of the current outermost opening tag
            $taglen = 0;  // character length of that opening tag
            $level = 0;   // current nesting depth
            $diff = 0;    // how much shorter $text has become than $originalText so far
            foreach ($matches[0] as $m) {
                // PREG_OFFSET_CAPTURE reports byte offsets; convert to a character
                // offset by measuring the prefix of the original text.
                $m[1] = mb_strlen(substr($originalText, 0, $m[1]));
                if ($m[0][0] == '<' && $m[0][1] == '/') {
                    $level--;
                    if ($level < 0) {
                        $level = 0; // malformed HTML: go to next blockquote
                    } elseif ($level > 0) {
                        // skip inner blockquote
                    } else {
                        $end = $m[1];
                        $len = $end - $taglen - $start;
                        // Get blockquote content
                        // (offsets refer to the original text, hence the $diff correction)
                        $body = mb_substr($text, $start + $taglen - $diff, $len);

                        // Set text width
                        // (reduced by 2 to leave room for the "> " marker)
                        $pWidth = $this->options['width'];
                        if ($this->options['width'] > 0) $this->options['width'] -= 2;
                        // Convert blockquote content
                        $body = trim($body);
                        $this->converter($body);
                        // Add citation markers and create PRE block
                        $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
                        $body = '<pre>' . htmlspecialchars($body, $this->htmlFuncFlags, self::ENCODING) . '</pre>';
                        // Re-set text width
                        $this->options['width'] = $pWidth;
                        // Replace content
                        $text = mb_substr($text, 0, $start - $diff)
                            . $body
                            . mb_substr($text, $end + mb_strlen($m[0]) - $diff);

                        // Account for the size change so later offsets still line up.
                        $diff += $len + $taglen + mb_strlen($m[0]) - mb_strlen($body);
                        unset($body);
                    }
                } else {
                    // Opening tag: remember where the outermost blockquote starts.
                    if ($level == 0) {
                        $start = $m[1];
                        $taglen = mb_strlen($m[0]);
                    }
                    $level++;
                }
            }
        }
    }
550
551    /**
552     * Callback function for preg_replace_callback use.
553     *
554     * @param  array  $matches PREG matches
555     * @return string
556     */
557    protected function pregCallback($matches)
558    {
559        switch (mb_strtolower($matches[1])) {
560            case 'p':
561                // Replace newlines with spaces.
562                $para = str_replace("\n", " ", $matches[3]);
563
564                // Trim trailing and leading whitespace within the tag.
565                $para = trim($para);
566
567                // Add trailing newlines for this para.
568                return "\n" . $para . "\n";
569            case 'br':
570                return "\n";
571            // chekanov
572	    case 'b':
573            case 'strong':
574                return $this->toupper($matches[3]);
575	    case 'del':
576                return $this->tostrike($matches[3]);
577            case 'th':
578                return $this->toupper("\t\t" . $matches[3] . "\n");
579            case 'h':
580                return $this->toupper("\n\n" . $matches[3] . "\n\n");
581            case 'a':
582                // override the link method
583                $linkOverride = null;
584                if (preg_match('/_html2text_link_(\w+)/', $matches[4], $linkOverrideMatch)) {
585                    $linkOverride = $linkOverrideMatch[1];
586                }
587                // Remove spaces in URL (#1487805)
588                $url = str_replace(' ', '', $matches[3]);
589
590                return $this->buildlinkList($url, $matches[5], $linkOverride);
591        }
592
593        return '';
594    }
595
    /**
     * Callback function for preg_replace_callback use in PRE content handler.
     *
     * Ignores the match and returns the replacement that convertPre() has
     * prepared in $preContent. A callback is used so that the replacement is
     * inserted literally.
     *
     * @param  array  $matches PREG matches
     * @return string
     */
    protected function pregPreCallback(/** @noinspection PhpUnusedParameterInspection */ $matches)
    {
        return $this->preContent;
    }
606
607    /**
608     * Strtoupper function with HTML tags and entities handling.
609     *
610     * @param  string $str Text to convert
611     * @return string Converted text
612     */
613    protected function toupper($str)
614    {
615        // string can contain HTML tags
616        $chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
617
618        // convert toupper only the text between HTML tags
619        foreach ($chunks as $i => $chunk) {
620            if ($chunk[0] != '<') {
621                $chunks[$i] = $this->strtoupper($chunk);
622            }
623        }
624
625        return implode($chunks);
626    }
627
628    /**
629     * Strtoupper multibyte wrapper function with HTML entities handling.
630     *
631     * @param  string $str Text to convert
632     * @return string Converted text
633     */
634    protected function strtoupper($str)
635    {
636        $str = html_entity_decode($str, $this->htmlFuncFlags, self::ENCODING);
637        $str = mb_strtoupper($str);
638        $str = htmlspecialchars($str, $this->htmlFuncFlags, self::ENCODING);
639
640        return $str;
641    }
642
643    /**
644     * Helper function for DEL conversion.
645     *
646     * @param  string $text HTML content
647     * @return string Converted text
648     */
649    protected function tostrike($str)
650    {
651        $rtn = '';
652        for ($i = 0; $i < mb_strlen($str); $i++) {
653            $chr = mb_substr($str, $i, 1);
654            $combiningChr = chr(0xC0 | 0x336 >> 6). chr(0x80 | 0x336 & 0x3F);
655            $rtn .= $chr . $combiningChr;
656        }
657        return $rtn;
658    }
659}
660
661