<?php

/*
 * Copyright (c) 2005-2007 Jon Abernathy <jon@chuggnutt.com>
 *
 * This script is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * The GNU General Public License can be found at
 * http://www.gnu.org/copyleft/gpl.html.
 *
 * This script is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

namespace Html2Text;

/**
 * Converts an HTML string into formatted plain text.
 *
 * The conversion is regex driven: a list of search/replace patterns, a list
 * of callback patterns, strip_tags(), entity decoding and finally optional
 * word wrapping. The result is cached until new HTML is set.
 */
class Html2Text
{
    const ENCODING = 'UTF-8';

    /**
     * Flags passed to html_entity_decode() / htmlspecialchars().
     *
     * @var int $htmlFuncFlags
     */
    protected $htmlFuncFlags;

    /**
     * Contains the HTML content to convert.
     *
     * @var string $html
     */
    protected $html;

    /**
     * Contains the converted, formatted text.
     *
     * @var string $text
     */
    protected $text;

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $replace.
     *
     * @var array $search
     * @see $replace
     */
    protected $search = array(
        "/\r/",                                           // Non-legal carriage return
        "/[\n\t]+/",                                      // Newlines and tabs
        '/<head\b[^>]*>.*?<\/head>/i',                    // <head>
        '/<script\b[^>]*>.*?<\/script>/i',                // <script>s -- which strip_tags supposedly has problems with
        '/<style\b[^>]*>.*?<\/style>/i',                  // <style>s -- which strip_tags supposedly has problems with
        '/<i\b[^>]*>(.*?)<\/i>/i',                        // <i>
        '/<em\b[^>]*>(.*?)<\/em>/i',                      // <em>
        '/<ins\b[^>]*>(.*?)<\/ins>/i',                    // <ins>
        '/(<ul\b[^>]*>|<\/ul>)/i',                        // <ul> and </ul>
        '/(<ol\b[^>]*>|<\/ol>)/i',                        // <ol> and </ol>
        '/(<dl\b[^>]*>|<\/dl>)/i',                        // <dl> and </dl>
        '/<li\b[^>]*>(.*?)<\/li>/i',                      // <li> and </li>
        '/<dd\b[^>]*>(.*?)<\/dd>/i',                      // <dd> and </dd>
        '/<dt\b[^>]*>(.*?)<\/dt>/i',                      // <dt> and </dt>
        '/<li\b[^>]*>/i',                                 // <li>
        '/<hr\b[^>]*>/i',                                 // <hr>
        '/<div\b[^>]*>/i',                                // <div>
        '/(<table\b[^>]*>|<\/table>)/i',                  // <table> and </table>
        '/(<tr\b[^>]*>|<\/tr>)/i',                        // <tr> and </tr>
        '/<td\b[^>]*>(.*?)<\/td>/i',                      // <td> and </td>
        '/<span class="_html2text_ignore">.+?<\/span>/i', // <span class="_html2text_ignore">...</span>
        '/<(img)\b[^>]*alt=\"([^>"]+)\"[^>]*>/i',         // <img> with alt tag
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * @var array $replace
     * @see $search
     */
    protected $replace = array(
        '',                              // Non-legal carriage return
        ' ',                             // Newlines and tabs
        '',                              // <head>
        '',                              // <script>s -- which strip_tags supposedly has problems with
        '',                              // <style>s -- which strip_tags supposedly has problems with
        '_\\1_',                         // <i>
        '_\\1_',                         // <em>
        '_\\1_',                         // <ins>
        "\n\n",                          // <ul> and </ul>
        "\n\n",                          // <ol> and </ol>
        "\n\n",                          // <dl> and </dl>
        "\t* \\1\n",                     // <li> and </li>
        " \\1\n",                        // <dd> and </dd>
        "\t* \\1",                       // <dt> and </dt>
        "\n\t* ",                        // <li>
        "\n-------------------------\n", // <hr>
        "<div>\n",                       // <div>
        "\n\n",                          // <table> and </table>
        "\n",                            // <tr> and </tr>
        "\t\t\\1\n",                     // <td> and </td>
        "",                              // <span class="_html2text_ignore">...</span>
        '[\\2]',                         // <img> with alt tag
    );

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $entReplace.
     *
     * These run on still-encoded text (before html_entity_decode()), so the
     * patterns match the entity notation, not the decoded character.
     *
     * @var array $entSearch
     * @see $entReplace
     */
    protected $entSearch = array(
        '/&#153;/i',      // TM symbol in win-1252
        '/&#151;/i',      // m-dash in win-1252
        '/&(amp|#38);/i', // Ampersand: see converter()
        '/[ ]{2,}/',      // Runs of spaces, post-handling
        '/&#39;/i',       // The apostrophe symbol
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * @var array $entReplace
     * @see $entSearch
     */
    protected $entReplace = array(
        '™',         // TM symbol
        '—',         // m-dash
        '|+|amp|+|', // Ampersand: see converter()
        ' ',         // Runs of spaces, post-handling
        '\'',        // Apostrophe
    );

    /**
     * List of preg* regular expression patterns to search for
     * and replace using callback function.
     *
     * The first capture group of every pattern is the tag name that
     * pregCallback() dispatches on.
     *
     * @var array $callbackSearch
     */
    protected $callbackSearch = array(
        '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i',           // h1 - h6
        '/[ ]*<(p)( [^>]*)?>(.*?)<\/p>[ ]*/si',                  // <p> with surrounding whitespace.
        '/<(br)[^>]*>[ ]*/i',                                    // <br> with leading whitespace after the newline.
        '/<(b)( [^>]*)?>(.*?)<\/b>/i',                           // <b>
        '/<(strong)( [^>]*)?>(.*?)<\/strong>/i',                 // <strong>
        '/<(del)( [^>]*)?>(.*?)<\/del>/i',                       // <del>
        '/<(th)( [^>]*)?>(.*?)<\/th>/i',                         // <th> and </th>
        '/<(a) [^>]*href=("|\')([^"\']+)\2([^>]*)>(.*?)<\/a>/i'  // <a href="">
    );

    /**
     * List of preg* regular expression patterns to search for in PRE body,
     * used in conjunction with $preReplace.
     *
     * @var array $preSearch
     * @see $preReplace
     */
    protected $preSearch = array(
        "/\n/",
        "/\t/",
        '/ /',
        '/<pre[^>]*>/',
        '/<\/pre>/'
    );

    /**
     * List of pattern replacements corresponding to patterns searched for PRE body.
     *
     * Whitespace is turned into non-breaking-space entities so that the later
     * "runs of spaces" collapsing in $entSearch does not flatten PRE content.
     *
     * @var array $preReplace
     * @see $preSearch
     */
    protected $preReplace = array(
        '<br>',
        '&nbsp;&nbsp;&nbsp;&nbsp;',
        '&nbsp;',
        '',
        '',
    );

    /**
     * Temporary workspace used during PRE processing.
     *
     * @var string $preContent
     */
    protected $preContent = '';

    /**
     * Contains the base URL that relative links should resolve to.
     *
     * @var string $baseurl
     */
    protected $baseurl = '';

    /**
     * Indicates whether content in the $html variable has been converted yet.
     *
     * @var boolean $converted
     * @see $html, $text
     */
    protected $converted = false;

    /**
     * Contains URL addresses from links to be rendered in plain text.
     *
     * @var array $linkList
     * @see buildlinkList()
     */
    protected $linkList = array();

    /**
     * Various configuration options (able to be set in the constructor)
     *
     * @var array $options
     */
    protected $options = array(
        'do_links' => 'inline', // 'none'
                                // 'inline' (show links inline)
                                // 'nextline' (show links on the next line)
                                // 'table' (if a table of link URLs should be listed after the text.
                                // 'bbcode' (show links as bbcode)

        'width' => 70,          // Maximum width of the formatted text, in columns.
                                // Set this value to 0 (or less) to ignore word wrapping
                                // and not constrain text to a fixed-width column.
    );

    /**
     * Backwards-compatible constructor body for the old
     * ($html, $fromFile, $options) signature.
     *
     * @param string $html     Source HTML
     * @param bool   $fromFile No longer supported; a truthy value makes set_html() throw
     * @param array  $options  Set configuration options
     */
    private function legacyConstruct($html = '', $fromFile = false, array $options = array())
    {
        $this->set_html($html, $fromFile);
        $this->options = array_merge($this->options, $options);
    }

    /**
     * @param string $html    Source HTML
     * @param array  $options Set configuration options
     */
    public function __construct($html = '', $options = array())
    {
        // Must be initialised before the legacy branch below: every
        // conversion path passes these flags to html_entity_decode() and
        // htmlspecialchars(), whichever constructor signature was used.
        $this->htmlFuncFlags = (PHP_VERSION_ID < 50400)
            ? ENT_QUOTES
            : ENT_QUOTES | ENT_HTML5;

        // for backwards compatibility
        if (!is_array($options)) {
            call_user_func_array(array($this, 'legacyConstruct'), func_get_args());

            return;
        }

        $this->html = $html;
        $this->options = array_merge($this->options, $options);
    }

    /**
     * Get the source HTML
     *
     * @return string
     */
    public function getHtml()
    {
        return $this->html;
    }

    /**
     * Set the source HTML
     *
     * Also invalidates any previously converted text.
     *
     * @param string $html HTML source content
     */
    public function setHtml($html)
    {
        $this->html = $html;
        $this->converted = false;
    }

    /**
     * @deprecated Use setHtml() instead.
     *
     * @param string $html      HTML source content
     * @param bool   $from_file No longer supported
     * @throws \InvalidArgumentException When $from_file is truthy
     */
    public function set_html($html, $from_file = false)
    {
        if ($from_file) {
            throw new \InvalidArgumentException("Argument from_file no longer supported");
        }

        return $this->setHtml($html);
    }

    /**
     * Returns the text, converted from HTML.
     *
     * The conversion runs lazily on the first call after the HTML was set.
     *
     * @return string Plain text
     */
    public function getText()
    {
        if (!$this->converted) {
            $this->convert();
        }

        return $this->text;
    }

    /**
     * @deprecated Use getText() instead.
     */
    public function get_text()
    {
        return $this->getText();
    }

    /**
     * @deprecated Print the result of getText() instead.
     */
    public function print_text()
    {
        print $this->getText();
    }

    /**
     * @deprecated Print the result of getText() instead.
     */
    public function p()
    {
        return $this->print_text();
    }

    /**
     * Sets a base URL to handle relative links.
     *
     * @param string $baseurl
     */
    public function setBaseUrl($baseurl)
    {
        $this->baseurl = $baseurl;
    }

    /**
     * @deprecated Use setBaseUrl() instead.
     */
    public function set_base_url($baseurl)
    {
        return $this->setBaseUrl($baseurl);
    }

    /**
     * Runs the conversion with the mbstring internal encoding temporarily
     * switched to self::ENCODING.
     *
     * The caller's encoding is restored even when the conversion throws.
     * Two catch clauses are used instead of "finally" to stay compatible with
     * the old PHP versions this class still checks for.
     */
    protected function convert()
    {
        $origEncoding = mb_internal_encoding();
        mb_internal_encoding(self::ENCODING);

        try {
            $this->doConvert();
        } catch (\Exception $e) {
            mb_internal_encoding($origEncoding);
            throw $e;
        } catch (\Throwable $e) {
            // PHP 7+ engine errors; this clause never matches on PHP 5.
            mb_internal_encoding($origEncoding);
            throw $e;
        }

        mb_internal_encoding($origEncoding);
    }

    /**
     * Converts $this->html into $this->text and appends the link table
     * when links were collected (do_links = 'table').
     */
    protected function doConvert()
    {
        $this->linkList = array();

        $text = trim($this->html);

        $this->converter($text);

        if ($this->linkList) {
            $text .= "\n\nLinks:\n------\n";
            foreach ($this->linkList as $i => $url) {
                $text .= '[' . ($i + 1) . '] ' . $url . "\n";
            }
        }

        $this->text = $text;

        $this->converted = true;
    }

    /**
     * Core HTML-to-text pipeline; rewrites $text in place.
     *
     * Also called recursively by convertBlockquotes() for blockquote bodies.
     *
     * @param string &$text HTML in, plain text out
     */
    protected function converter(&$text)
    {
        $this->convertBlockquotes($text);
        $this->convertPre($text);
        $text = preg_replace($this->search, $this->replace, $text);
        $text = preg_replace_callback($this->callbackSearch, array($this, 'pregCallback'), $text);
        $text = strip_tags($text);
        $text = preg_replace($this->entSearch, $this->entReplace, $text);
        $text = html_entity_decode($text, $this->htmlFuncFlags, self::ENCODING);

        // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
        $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);

        // Convert "|+|amp|+|" into "&", need to be done after handling of unknown entities.
        // This properly handles an input such as "&amp;quot;", which must end
        // up as the literal text "&quot;" rather than being decoded or removed.
        $text = str_replace('|+|amp|+|', '&', $text);

        // Normalise empty lines
        $text = preg_replace("/\n\s+\n/", "\n\n", $text);
        $text = preg_replace("/[\n]{3,}/", "\n\n", $text);

        // remove leading empty lines (can be produced by eg. P tag on the beginning)
        $text = ltrim($text, "\n");

        if ($this->options['width'] > 0) {
            // NOTE(review): wordwrap() measures bytes, so lines containing
            // multibyte characters may wrap earlier than 'width' columns.
            $text = wordwrap($text, $this->options['width']);
        }
    }

    /**
     * Helper function called by preg_replace() on link replacement.
     *
     * Maintains an internal list of links to be displayed at the end of the
     * text, with numeric indices to the original point in the text they
     * appeared. Also makes an effort at identifying and handling absolute
     * and relative links.
     *
     * @param string      $link         URL of the link
     * @param string      $display      Part of the text to associate number with
     * @param string|null $linkOverride Link method overriding the 'do_links' option
     * @return string
     */
    protected function buildlinkList($link, $display, $linkOverride = null)
    {
        $linkMethod = ($linkOverride) ? $linkOverride : $this->options['do_links'];
        if ($linkMethod == 'none') {
            return $display;
        }

        // Ignored link types
        $decodedLink = html_entity_decode($link, $this->htmlFuncFlags, self::ENCODING);
        if (preg_match('!^(javascript:|mailto:|#)!i', $decodedLink)) {
            return $display;
        }

        if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
            // Link carries its own scheme: use it unchanged.
            $url = $link;
        } else {
            // Relative link: prefix the base URL, inserting a "/" if needed.
            $url = $this->baseurl;
            if (mb_substr($link, 0, 1) != '/') {
                $url .= '/';
            }
            $url .= $link;
        }

        if ($linkMethod == 'table') {
            // Strict search: URLs are strings and must not be matched by
            // PHP's loose numeric-string comparison.
            $index = array_search($url, $this->linkList, true);
            if ($index === false) {
                $index = count($this->linkList);
                $this->linkList[] = $url;
            }

            return $display . ' [' . ($index + 1) . ']';
        }

        if ($linkMethod == 'bbcode') {
            return sprintf('[url=%s]%s[/url]', $url, $display);
        }

        // 'nextline' and the default 'inline' both skip the URL when it is
        // identical to the visible text.
        if ($url === $display) {
            return $display;
        }

        if ($linkMethod == 'nextline') {
            return $display . "\n[" . $url . ']';
        }

        // link_method defaults to inline
        return $display . ' [' . $url . ']';
    }

    /**
     * Helper function for PRE body conversion.
     *
     * Each PRE element is replaced, one at a time, by a DIV whose whitespace
     * has been converted to BR tags and non-breaking-space entities.
     *
     * @param string &$text HTML content
     */
    protected function convertPre(&$text)
    {
        // get the content of PRE element
        while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
            // Replace br tags with newlines to prevent the search-and-replace callback from killing whitespace
            $this->preContent = preg_replace('/(<br\b[^>]*>)/i', "\n", $matches[1]);

            // Run our defined tags search-and-replace with callback
            $this->preContent = preg_replace_callback(
                $this->callbackSearch,
                array($this, 'pregCallback'),
                $this->preContent
            );

            // convert the content
            $this->preContent = sprintf(
                '<div><br>%s<br></div>',
                preg_replace($this->preSearch, $this->preReplace, $this->preContent)
            );

            // replace the content (use callback because content can contain $0 variable)
            $text = preg_replace_callback(
                '/<pre[^>]*>.*<\/pre>/ismU',
                array($this, 'pregPreCallback'),
                $text,
                1
            );

            // free memory
            $this->preContent = '';
        }
    }

    /**
     * Helper function for BLOCKQUOTE body conversion.
     *
     * Every top-level blockquote is converted to text on its own (with the
     * width reduced by the two columns of the "> " marker), prefixed with
     * citation markers and put back as an escaped PRE block. Nested
     * blockquotes are handled by the recursive converter() call.
     *
     * @param string &$text HTML content
     */
    protected function convertBlockquotes(&$text)
    {
        if (!preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
            return;
        }

        $originalText = $text;
        $start = 0;  // character offset (in the original text) of the opening tag
        $taglen = 0; // character length of that opening tag
        $level = 0;  // current nesting depth
        $diff = 0;   // characters removed from $text so far, relative to the original

        foreach ($matches[0] as $m) {
            // PREG_OFFSET_CAPTURE yields byte offsets; convert to characters.
            $m[1] = mb_strlen(substr($originalText, 0, $m[1]));

            $isClosingTag = ($m[0][0] == '<' && $m[0][1] == '/');
            if (!$isClosingTag) {
                if ($level == 0) {
                    $start = $m[1];
                    $taglen = mb_strlen($m[0]);
                }
                $level++;
                continue;
            }

            $level--;
            if ($level < 0) {
                $level = 0; // malformed HTML: go to next blockquote
                continue;
            }
            if ($level > 0) {
                continue; // skip inner blockquote
            }

            $end = $m[1];
            $len = $end - $taglen - $start;
            // Get blockquote content
            $body = mb_substr($text, $start + $taglen - $diff, $len);

            // Set text width
            $pWidth = $this->options['width'];
            if ($this->options['width'] > 0) {
                $this->options['width'] -= 2;
            }
            // Convert blockquote content
            $body = trim($body);
            $this->converter($body);
            // Add citation markers and create PRE block
            $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
            $body = '<pre>' . htmlspecialchars($body, $this->htmlFuncFlags, self::ENCODING) . '</pre>';
            // Re-set text width
            $this->options['width'] = $pWidth;
            // Replace content
            $text = mb_substr($text, 0, $start - $diff)
                . $body
                . mb_substr($text, $end + mb_strlen($m[0]) - $diff);

            $diff += $len + $taglen + mb_strlen($m[0]) - mb_strlen($body);
            unset($body);
        }
    }

    /**
     * Callback function for preg_replace_callback use.
     *
     * Dispatches on the tag name captured in $matches[1] by $callbackSearch.
     *
     * @param  array  $matches PREG matches
     * @return string
     */
    protected function pregCallback($matches)
    {
        switch (mb_strtolower($matches[1])) {
            case 'p':
                // Replace newlines with spaces.
                $para = str_replace("\n", " ", $matches[3]);

                // Trim trailing and leading whitespace within the tag.
                $para = trim($para);

                // Add trailing newlines for this para.
                return "\n" . $para . "\n";
            case 'br':
                return "\n";
            case 'b':
            case 'strong':
                return $this->toupper($matches[3]);
            case 'del':
                return $this->tostrike($matches[3]);
            case 'th':
                return $this->toupper("\t\t" . $matches[3] . "\n");
            case 'h':
                return $this->toupper("\n\n" . $matches[3] . "\n\n");
            case 'a':
                // override the link method
                $linkOverride = null;
                if (preg_match('/_html2text_link_(\w+)/', $matches[4], $linkOverrideMatch)) {
                    $linkOverride = $linkOverrideMatch[1];
                }
                // Remove spaces in URL (#1487805)
                $url = str_replace(' ', '', $matches[3]);

                return $this->buildlinkList($url, $matches[5], $linkOverride);
        }

        return '';
    }

    /**
     * Callback function for preg_replace_callback use in PRE content handler.
     *
     * Ignores the match and returns the prepared PRE replacement.
     *
     * @param  array  $matches PREG matches
     * @return string
     */
    protected function pregPreCallback(/** @noinspection PhpUnusedParameterInspection */ $matches)
    {
        return $this->preContent;
    }

    /**
     * Strtoupper function with HTML tags and entities handling.
     *
     * @param  string $str Text to convert
     * @return string Converted text
     */
    protected function toupper($str)
    {
        // string can contain HTML tags
        $chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

        // convert toupper only the text between HTML tags
        foreach ($chunks as $i => $chunk) {
            if ($chunk[0] != '<') {
                $chunks[$i] = $this->strtoupper($chunk);
            }
        }

        return implode('', $chunks);
    }

    /**
     * Strtoupper multibyte wrapper function with HTML entities handling.
     *
     * Entities are decoded first so they are not upper-cased themselves,
     * then the result is escaped again.
     *
     * @param  string $str Text to convert
     * @return string Converted text
     */
    protected function strtoupper($str)
    {
        $str = html_entity_decode($str, $this->htmlFuncFlags, self::ENCODING);
        $str = mb_strtoupper($str);
        $str = htmlspecialchars($str, $this->htmlFuncFlags, self::ENCODING);

        return $str;
    }

    /**
     * Helper function for DEL conversion.
     *
     * Appends a combining stroke to every character of the input.
     *
     * @param  string $str HTML content
     * @return string Converted text
     */
    protected function tostrike($str)
    {
        // U+0336 COMBINING LONG STROKE OVERLAY encoded as UTF-8.
        $combiningChr = "\xCC\xB6";
        $length = mb_strlen($str, self::ENCODING);

        $rtn = '';
        for ($i = 0; $i < $length; $i++) {
            $rtn .= mb_substr($str, $i, 1, self::ENCODING) . $combiningChr;
        }

        return $rtn;
    }
}