<?php

// SPDX-FileCopyrightText: 2004-2023 Ryan Parman, Sam Sneddon, Ryan McCue
// SPDX-License-Identifier: BSD-3-Clause

declare(strict_types=1);


/**
 * Decode HTML Entities
 *
 * This implements HTML5 as of revision 967 (2007-06-28)
 *
 * The decoder works in place on a byte string: `parse()` looks for each
 * ampersand, `entity()` tries to read a numeric or named character reference
 * that starts there and, on success, splices the UTF-8 replacement into
 * `$data`. Text that does not form a known reference is left untouched.
 *
 * @deprecated Use DOMDocument instead!
 */
class SimplePie_Decode_HTML_Entities
{
    /**
     * Data to be parsed
     *
     * Modified in place while parsing; holds the decoded output afterwards.
     *
     * @access private
     * @var string
     */
    public $data = '';

    /**
     * Currently consumed bytes
     *
     * Everything read since the ampersand of the reference currently being
     * examined (ampersand included). Reset by `parse()` after each attempt.
     *
     * @access private
     * @var string
     */
    public $consumed = '';

    /**
     * Position of the current byte being parsed
     *
     * Byte offset into `$data` of the next byte `consume()` will return.
     *
     * @access private
     * @var int
     */
    public $position = 0;

    /**
     * Create an instance of the class with the input data
     *
     * @access public
     * @param string $data Input data
     */
    public function __construct(string $data)
    {
        $this->data = $data;
    }

    /**
     * Parse the input data
     *
     * Repeatedly jumps to the next ampersand at or after the current position
     * and lets `entity()` try to decode a reference starting there.
     *
     * @access public
     * @return string Output data
     */
    public function parse()
    {
        while (($position = strpos($this->data, '&', $this->position)) !== false) {
            $this->position = $position;
            // Swallow the ampersand itself; entity() inspects what follows it.
            $this->consume();
            $this->entity();
            $this->consumed = '';
        }
        return $this->data;
    }

    /**
     * Consume the next byte
     *
     * Appends the byte to `$consumed` and advances `$position`. Nothing is
     * changed when the end of the data has been reached.
     *
     * @access private
     * @return string|false The next byte, or false, if there is no more data
     */
    public function consume()
    {
        if (isset($this->data[$this->position])) {
            $this->consumed .= $this->data[$this->position];
            return $this->data[$this->position++];
        }

        return false;
    }

    /**
     * Consume a range of characters
     *
     * Reads the longest run of bytes, starting at the current position, that
     * all appear in `$chars`.
     *
     * @access private
     * @param string $chars Characters to consume
     * @return string|false A series of characters that match the range, or false
     */
    public function consume_range(string $chars)
    {
        $len = strspn($this->data, $chars, $this->position);
        if ($len === 0) {
            return false;
        }

        $data = (string) substr($this->data, $this->position, $len);
        $this->consumed .= $data;
        $this->position += $len;

        return $data;
    }

    /**
     * Unconsume one byte
     *
     * Must only be called after a `consume()` that actually returned a byte.
     *
     * @access private
     * @return void
     */
    public function unconsume()
    {
        // Cast: on PHP < 8.0 substr() returns false for an empty subject.
        $this->consumed = (string) substr($this->consumed, 0, -1);
        $this->position--;
    }

    /**
     * Decode an entity
     *
     * Expects the ampersand to have been consumed already. Looks at the byte
     * after it and dispatches to numeric or named reference handling.
     *
     * @access private
     * @return void
     */
    public function entity()
    {
        $byte = $this->consume();

        // End of data: a lone trailing ampersand is kept as-is.
        if ($byte === false) {
            return;
        }

        // "&&": the first ampersand is literal, but the second one may start a
        // reference of its own (e.g. "&&amp;"), so hand it back to parse().
        if ($byte === "\x26") {
            $this->unconsume();
            return;
        }

        // Whitespace or "<" directly after "&" never starts a reference.
        if (in_array($byte, ["\x09", "\x0A", "\x0B", "\x0C", "\x20", "\x3C"], true)) {
            return;
        }

        if ($byte === "\x23") {
            $this->numeric_entity();
        } else {
            $this->named_entity();
        }
    }

    /**
     * Decode a numeric character reference; "&#" has already been consumed.
     *
     * @return void
     */
    private function numeric_entity()
    {
        static $windows_1252_specials = [
            0x0D => "\x0A",
            0x80 => "\xE2\x82\xAC",
            0x81 => "\xEF\xBF\xBD",
            0x82 => "\xE2\x80\x9A",
            0x83 => "\xC6\x92",
            0x84 => "\xE2\x80\x9E",
            0x85 => "\xE2\x80\xA6",
            0x86 => "\xE2\x80\xA0",
            0x87 => "\xE2\x80\xA1",
            0x88 => "\xCB\x86",
            0x89 => "\xE2\x80\xB0",
            0x8A => "\xC5\xA0",
            0x8B => "\xE2\x80\xB9",
            0x8C => "\xC5\x92",
            0x8D => "\xEF\xBF\xBD",
            0x8E => "\xC5\xBD",
            0x8F => "\xEF\xBF\xBD",
            0x90 => "\xEF\xBF\xBD",
            0x91 => "\xE2\x80\x98",
            0x92 => "\xE2\x80\x99",
            0x93 => "\xE2\x80\x9C",
            0x94 => "\xE2\x80\x9D",
            0x95 => "\xE2\x80\xA2",
            0x96 => "\xE2\x80\x93",
            0x97 => "\xE2\x80\x94",
            0x98 => "\xCB\x9C",
            0x99 => "\xE2\x84\xA2",
            0x9A => "\xC5\xA1",
            0x9B => "\xE2\x80\xBA",
            0x9C => "\xC5\x93",
            0x9D => "\xEF\xBF\xBD",
            0x9E => "\xC5\xBE",
            0x9F => "\xC5\xB8",
        ];

        $marker = $this->consume();
        if ($marker === "\x78" || $marker === "\x58") {
            $range = '0123456789ABCDEFabcdef';
            $hex = true;
        } else {
            $range = '0123456789';
            $hex = false;
            // Give the byte back so it can be read as the first digit. At end of
            // data nothing was consumed, so there is nothing to give back.
            if ($marker !== false) {
                $this->unconsume();
            }
        }

        // Compare against false explicitly: the digit string "0" is falsy in PHP
        // but is still a valid run of digits.
        $digits = $this->consume_range($range);
        if ($digits === false) {
            return;
        }

        if ($hex) {
            // Cap to PHP_INT_MAX to ensure consistent behaviour if the number is so large
            // it cannot fit into int – just casting float to int might return junk (e.g. a negative number).
            // The original author notes that `Misc::codepoint_to_utf8` returns a replacement
            // character for values that large.
            $codepoint = (int) min(hexdec($digits), \PHP_INT_MAX);
        } else {
            // Casting string to int caps at PHP_INT_MAX automatically.
            $codepoint = (int) $digits;
        }

        if ($codepoint === 0) {
            // HTML: a reference to the null code point yields U+FFFD REPLACEMENT CHARACTER.
            $replacement = "\xEF\xBF\xBD";
        } elseif (isset($windows_1252_specials[$codepoint])) {
            $replacement = $windows_1252_specials[$codepoint];
        } else {
            $replacement = SimplePie_Misc::codepoint_to_utf8($codepoint);
        }

        // The terminating semicolon is optional; any other byte is not part of the reference.
        $terminator = $this->consume();
        if ($terminator !== ';' && $terminator !== false) {
            $this->unconsume();
        }

        $consumed_length = strlen($this->consumed);
        $this->data = substr_replace($this->data, $replacement, $this->position - $consumed_length, $consumed_length);
        // Continue right after the spliced-in replacement.
        $this->position += strlen($replacement) - $consumed_length;
    }

    /**
     * Decode a named character reference; "&" and the first byte of the
     * candidate name have already been consumed.
     *
     * Reads up to nine further bytes and keeps the longest prefix that is a
     * known entity name. Names listed without a trailing semicolon are the
     * ones accepted when the semicolon is missing.
     *
     * @return void
     */
    private function named_entity()
    {
        static $entities = [
            'Aacute' => "\xC3\x81",
            'aacute' => "\xC3\xA1",
            'Aacute;' => "\xC3\x81",
            'aacute;' => "\xC3\xA1",
            'Acirc' => "\xC3\x82",
            'acirc' => "\xC3\xA2",
            'Acirc;' => "\xC3\x82",
            'acirc;' => "\xC3\xA2",
            'acute' => "\xC2\xB4",
            'acute;' => "\xC2\xB4",
            'AElig' => "\xC3\x86",
            'aelig' => "\xC3\xA6",
            'AElig;' => "\xC3\x86",
            'aelig;' => "\xC3\xA6",
            'Agrave' => "\xC3\x80",
            'agrave' => "\xC3\xA0",
            'Agrave;' => "\xC3\x80",
            'agrave;' => "\xC3\xA0",
            'alefsym;' => "\xE2\x84\xB5",
            'Alpha;' => "\xCE\x91",
            'alpha;' => "\xCE\xB1",
            'AMP' => "\x26",
            'amp' => "\x26",
            'AMP;' => "\x26",
            'amp;' => "\x26",
            'and;' => "\xE2\x88\xA7",
            'ang;' => "\xE2\x88\xA0",
            'apos;' => "\x27",
            'Aring' => "\xC3\x85",
            'aring' => "\xC3\xA5",
            'Aring;' => "\xC3\x85",
            'aring;' => "\xC3\xA5",
            'asymp;' => "\xE2\x89\x88",
            'Atilde' => "\xC3\x83",
            'atilde' => "\xC3\xA3",
            'Atilde;' => "\xC3\x83",
            'atilde;' => "\xC3\xA3",
            'Auml' => "\xC3\x84",
            'auml' => "\xC3\xA4",
            'Auml;' => "\xC3\x84",
            'auml;' => "\xC3\xA4",
            'bdquo;' => "\xE2\x80\x9E",
            'Beta;' => "\xCE\x92",
            'beta;' => "\xCE\xB2",
            'brvbar' => "\xC2\xA6",
            'brvbar;' => "\xC2\xA6",
            'bull;' => "\xE2\x80\xA2",
            'cap;' => "\xE2\x88\xA9",
            'Ccedil' => "\xC3\x87",
            'ccedil' => "\xC3\xA7",
            'Ccedil;' => "\xC3\x87",
            'ccedil;' => "\xC3\xA7",
            'cedil' => "\xC2\xB8",
            'cedil;' => "\xC2\xB8",
            'cent' => "\xC2\xA2",
            'cent;' => "\xC2\xA2",
            'Chi;' => "\xCE\xA7",
            'chi;' => "\xCF\x87",
            'circ;' => "\xCB\x86",
            'clubs;' => "\xE2\x99\xA3",
            'cong;' => "\xE2\x89\x85",
            'COPY' => "\xC2\xA9",
            'copy' => "\xC2\xA9",
            'COPY;' => "\xC2\xA9",
            'copy;' => "\xC2\xA9",
            'crarr;' => "\xE2\x86\xB5",
            'cup;' => "\xE2\x88\xAA",
            'curren' => "\xC2\xA4",
            'curren;' => "\xC2\xA4",
            'Dagger;' => "\xE2\x80\xA1",
            'dagger;' => "\xE2\x80\xA0",
            'dArr;' => "\xE2\x87\x93",
            'darr;' => "\xE2\x86\x93",
            'deg' => "\xC2\xB0",
            'deg;' => "\xC2\xB0",
            'Delta;' => "\xCE\x94",
            'delta;' => "\xCE\xB4",
            'diams;' => "\xE2\x99\xA6",
            'divide' => "\xC3\xB7",
            'divide;' => "\xC3\xB7",
            'Eacute' => "\xC3\x89",
            'eacute' => "\xC3\xA9",
            'Eacute;' => "\xC3\x89",
            'eacute;' => "\xC3\xA9",
            'Ecirc' => "\xC3\x8A",
            'ecirc' => "\xC3\xAA",
            'Ecirc;' => "\xC3\x8A",
            'ecirc;' => "\xC3\xAA",
            'Egrave' => "\xC3\x88",
            'egrave' => "\xC3\xA8",
            'Egrave;' => "\xC3\x88",
            'egrave;' => "\xC3\xA8",
            'empty;' => "\xE2\x88\x85",
            'emsp;' => "\xE2\x80\x83",
            'ensp;' => "\xE2\x80\x82",
            'Epsilon;' => "\xCE\x95",
            'epsilon;' => "\xCE\xB5",
            'equiv;' => "\xE2\x89\xA1",
            'Eta;' => "\xCE\x97",
            'eta;' => "\xCE\xB7",
            'ETH' => "\xC3\x90",
            'eth' => "\xC3\xB0",
            'ETH;' => "\xC3\x90",
            'eth;' => "\xC3\xB0",
            'Euml' => "\xC3\x8B",
            'euml' => "\xC3\xAB",
            'Euml;' => "\xC3\x8B",
            'euml;' => "\xC3\xAB",
            'euro;' => "\xE2\x82\xAC",
            'exist;' => "\xE2\x88\x83",
            'fnof;' => "\xC6\x92",
            'forall;' => "\xE2\x88\x80",
            'frac12' => "\xC2\xBD",
            'frac12;' => "\xC2\xBD",
            'frac14' => "\xC2\xBC",
            'frac14;' => "\xC2\xBC",
            'frac34' => "\xC2\xBE",
            'frac34;' => "\xC2\xBE",
            'frasl;' => "\xE2\x81\x84",
            'Gamma;' => "\xCE\x93",
            'gamma;' => "\xCE\xB3",
            'ge;' => "\xE2\x89\xA5",
            'GT' => "\x3E",
            'gt' => "\x3E",
            'GT;' => "\x3E",
            'gt;' => "\x3E",
            'hArr;' => "\xE2\x87\x94",
            'harr;' => "\xE2\x86\x94",
            'hearts;' => "\xE2\x99\xA5",
            'hellip;' => "\xE2\x80\xA6",
            'Iacute' => "\xC3\x8D",
            'iacute' => "\xC3\xAD",
            'Iacute;' => "\xC3\x8D",
            'iacute;' => "\xC3\xAD",
            'Icirc' => "\xC3\x8E",
            'icirc' => "\xC3\xAE",
            'Icirc;' => "\xC3\x8E",
            'icirc;' => "\xC3\xAE",
            'iexcl' => "\xC2\xA1",
            'iexcl;' => "\xC2\xA1",
            'Igrave' => "\xC3\x8C",
            'igrave' => "\xC3\xAC",
            'Igrave;' => "\xC3\x8C",
            'igrave;' => "\xC3\xAC",
            'image;' => "\xE2\x84\x91",
            'infin;' => "\xE2\x88\x9E",
            'int;' => "\xE2\x88\xAB",
            'Iota;' => "\xCE\x99",
            'iota;' => "\xCE\xB9",
            'iquest' => "\xC2\xBF",
            'iquest;' => "\xC2\xBF",
            'isin;' => "\xE2\x88\x88",
            'Iuml' => "\xC3\x8F",
            'iuml' => "\xC3\xAF",
            'Iuml;' => "\xC3\x8F",
            'iuml;' => "\xC3\xAF",
            'Kappa;' => "\xCE\x9A",
            'kappa;' => "\xCE\xBA",
            'Lambda;' => "\xCE\x9B",
            'lambda;' => "\xCE\xBB",
            'lang;' => "\xE3\x80\x88",
            'laquo' => "\xC2\xAB",
            'laquo;' => "\xC2\xAB",
            'lArr;' => "\xE2\x87\x90",
            'larr;' => "\xE2\x86\x90",
            'lceil;' => "\xE2\x8C\x88",
            'ldquo;' => "\xE2\x80\x9C",
            'le;' => "\xE2\x89\xA4",
            'lfloor;' => "\xE2\x8C\x8A",
            'lowast;' => "\xE2\x88\x97",
            'loz;' => "\xE2\x97\x8A",
            'lrm;' => "\xE2\x80\x8E",
            'lsaquo;' => "\xE2\x80\xB9",
            'lsquo;' => "\xE2\x80\x98",
            'LT' => "\x3C",
            'lt' => "\x3C",
            'LT;' => "\x3C",
            'lt;' => "\x3C",
            'macr' => "\xC2\xAF",
            'macr;' => "\xC2\xAF",
            'mdash;' => "\xE2\x80\x94",
            'micro' => "\xC2\xB5",
            'micro;' => "\xC2\xB5",
            'middot' => "\xC2\xB7",
            'middot;' => "\xC2\xB7",
            'minus;' => "\xE2\x88\x92",
            'Mu;' => "\xCE\x9C",
            'mu;' => "\xCE\xBC",
            'nabla;' => "\xE2\x88\x87",
            'nbsp' => "\xC2\xA0",
            'nbsp;' => "\xC2\xA0",
            'ndash;' => "\xE2\x80\x93",
            'ne;' => "\xE2\x89\xA0",
            'ni;' => "\xE2\x88\x8B",
            'not' => "\xC2\xAC",
            'not;' => "\xC2\xAC",
            'notin;' => "\xE2\x88\x89",
            'nsub;' => "\xE2\x8A\x84",
            'Ntilde' => "\xC3\x91",
            'ntilde' => "\xC3\xB1",
            'Ntilde;' => "\xC3\x91",
            'ntilde;' => "\xC3\xB1",
            'Nu;' => "\xCE\x9D",
            'nu;' => "\xCE\xBD",
            'Oacute' => "\xC3\x93",
            'oacute' => "\xC3\xB3",
            'Oacute;' => "\xC3\x93",
            'oacute;' => "\xC3\xB3",
            'Ocirc' => "\xC3\x94",
            'ocirc' => "\xC3\xB4",
            'Ocirc;' => "\xC3\x94",
            'ocirc;' => "\xC3\xB4",
            'OElig;' => "\xC5\x92",
            'oelig;' => "\xC5\x93",
            'Ograve' => "\xC3\x92",
            'ograve' => "\xC3\xB2",
            'Ograve;' => "\xC3\x92",
            'ograve;' => "\xC3\xB2",
            'oline;' => "\xE2\x80\xBE",
            'Omega;' => "\xCE\xA9",
            'omega;' => "\xCF\x89",
            'Omicron;' => "\xCE\x9F",
            'omicron;' => "\xCE\xBF",
            'oplus;' => "\xE2\x8A\x95",
            'or;' => "\xE2\x88\xA8",
            'ordf' => "\xC2\xAA",
            'ordf;' => "\xC2\xAA",
            'ordm' => "\xC2\xBA",
            'ordm;' => "\xC2\xBA",
            'Oslash' => "\xC3\x98",
            'oslash' => "\xC3\xB8",
            'Oslash;' => "\xC3\x98",
            'oslash;' => "\xC3\xB8",
            'Otilde' => "\xC3\x95",
            'otilde' => "\xC3\xB5",
            'Otilde;' => "\xC3\x95",
            'otilde;' => "\xC3\xB5",
            'otimes;' => "\xE2\x8A\x97",
            'Ouml' => "\xC3\x96",
            'ouml' => "\xC3\xB6",
            'Ouml;' => "\xC3\x96",
            'ouml;' => "\xC3\xB6",
            'para' => "\xC2\xB6",
            'para;' => "\xC2\xB6",
            'part;' => "\xE2\x88\x82",
            'permil;' => "\xE2\x80\xB0",
            'perp;' => "\xE2\x8A\xA5",
            'Phi;' => "\xCE\xA6",
            'phi;' => "\xCF\x86",
            'Pi;' => "\xCE\xA0",
            'pi;' => "\xCF\x80",
            'piv;' => "\xCF\x96",
            'plusmn' => "\xC2\xB1",
            'plusmn;' => "\xC2\xB1",
            'pound' => "\xC2\xA3",
            'pound;' => "\xC2\xA3",
            'Prime;' => "\xE2\x80\xB3",
            'prime;' => "\xE2\x80\xB2",
            'prod;' => "\xE2\x88\x8F",
            'prop;' => "\xE2\x88\x9D",
            'Psi;' => "\xCE\xA8",
            'psi;' => "\xCF\x88",
            'QUOT' => "\x22",
            'quot' => "\x22",
            'QUOT;' => "\x22",
            'quot;' => "\x22",
            'radic;' => "\xE2\x88\x9A",
            'rang;' => "\xE3\x80\x89",
            'raquo' => "\xC2\xBB",
            'raquo;' => "\xC2\xBB",
            'rArr;' => "\xE2\x87\x92",
            'rarr;' => "\xE2\x86\x92",
            'rceil;' => "\xE2\x8C\x89",
            'rdquo;' => "\xE2\x80\x9D",
            'real;' => "\xE2\x84\x9C",
            'REG' => "\xC2\xAE",
            'reg' => "\xC2\xAE",
            'REG;' => "\xC2\xAE",
            'reg;' => "\xC2\xAE",
            'rfloor;' => "\xE2\x8C\x8B",
            'Rho;' => "\xCE\xA1",
            'rho;' => "\xCF\x81",
            'rlm;' => "\xE2\x80\x8F",
            'rsaquo;' => "\xE2\x80\xBA",
            'rsquo;' => "\xE2\x80\x99",
            'sbquo;' => "\xE2\x80\x9A",
            'Scaron;' => "\xC5\xA0",
            'scaron;' => "\xC5\xA1",
            'sdot;' => "\xE2\x8B\x85",
            'sect' => "\xC2\xA7",
            'sect;' => "\xC2\xA7",
            'shy' => "\xC2\xAD",
            'shy;' => "\xC2\xAD",
            'Sigma;' => "\xCE\xA3",
            'sigma;' => "\xCF\x83",
            'sigmaf;' => "\xCF\x82",
            'sim;' => "\xE2\x88\xBC",
            'spades;' => "\xE2\x99\xA0",
            'sub;' => "\xE2\x8A\x82",
            'sube;' => "\xE2\x8A\x86",
            'sum;' => "\xE2\x88\x91",
            'sup;' => "\xE2\x8A\x83",
            'sup1' => "\xC2\xB9",
            'sup1;' => "\xC2\xB9",
            'sup2' => "\xC2\xB2",
            'sup2;' => "\xC2\xB2",
            'sup3' => "\xC2\xB3",
            'sup3;' => "\xC2\xB3",
            'supe;' => "\xE2\x8A\x87",
            'szlig' => "\xC3\x9F",
            'szlig;' => "\xC3\x9F",
            'Tau;' => "\xCE\xA4",
            'tau;' => "\xCF\x84",
            'there4;' => "\xE2\x88\xB4",
            'Theta;' => "\xCE\x98",
            'theta;' => "\xCE\xB8",
            'thetasym;' => "\xCF\x91",
            'thinsp;' => "\xE2\x80\x89",
            'THORN' => "\xC3\x9E",
            'thorn' => "\xC3\xBE",
            'THORN;' => "\xC3\x9E",
            'thorn;' => "\xC3\xBE",
            'tilde;' => "\xCB\x9C",
            'times' => "\xC3\x97",
            'times;' => "\xC3\x97",
            'TRADE;' => "\xE2\x84\xA2",
            'trade;' => "\xE2\x84\xA2",
            'Uacute' => "\xC3\x9A",
            'uacute' => "\xC3\xBA",
            'Uacute;' => "\xC3\x9A",
            'uacute;' => "\xC3\xBA",
            'uArr;' => "\xE2\x87\x91",
            'uarr;' => "\xE2\x86\x91",
            'Ucirc' => "\xC3\x9B",
            'ucirc' => "\xC3\xBB",
            'Ucirc;' => "\xC3\x9B",
            'ucirc;' => "\xC3\xBB",
            'Ugrave' => "\xC3\x99",
            'ugrave' => "\xC3\xB9",
            'Ugrave;' => "\xC3\x99",
            'ugrave;' => "\xC3\xB9",
            'uml' => "\xC2\xA8",
            'uml;' => "\xC2\xA8",
            'upsih;' => "\xCF\x92",
            'Upsilon;' => "\xCE\xA5",
            'upsilon;' => "\xCF\x85",
            'Uuml' => "\xC3\x9C",
            'uuml' => "\xC3\xBC",
            'Uuml;' => "\xC3\x9C",
            'uuml;' => "\xC3\xBC",
            'weierp;' => "\xE2\x84\x98",
            'Xi;' => "\xCE\x9E",
            'xi;' => "\xCE\xBE",
            'Yacute' => "\xC3\x9D",
            'yacute' => "\xC3\xBD",
            'Yacute;' => "\xC3\x9D",
            'yacute;' => "\xC3\xBD",
            'yen' => "\xC2\xA5",
            'yen;' => "\xC2\xA5",
            'yuml' => "\xC3\xBF",
            'Yuml;' => "\xC5\xB8",
            'yuml;' => "\xC3\xBF",
            'Zeta;' => "\xCE\x96",
            'zeta;' => "\xCE\xB6",
            'zwj;' => "\xE2\x80\x8D",
            'zwnj;' => "\xE2\x80\x8C",
        ];

        // Offset of the ampersand: everything in $consumed was read from there on.
        $start = $this->position - strlen($this->consumed);

        // Greedy lookahead: the longest key is nine bytes after its first byte
        // ("thetasym;"), so nine further bytes are enough. Later matches overwrite
        // earlier ones, which yields the longest known name.
        $match = null;
        for ($i = 0; $i < 9 && $this->consume() !== false; $i++) {
            // Cast for PHPStan on PHP < 8.0: $consumed is non-empty here, so the offset is valid.
            $candidate = (string) substr($this->consumed, 1);
            if (isset($entities[$candidate])) {
                $match = $candidate;
            }
        }

        if ($match === null) {
            // Not a known reference. Rewind to just after the ampersand so that an
            // ampersand inside the lookahead window (e.g. "AT&T &amp; co") is still
            // found by parse() instead of being skipped.
            $this->position = $start + 1;
            $this->consumed = (string) substr($this->consumed, 0, 1);
            return;
        }

        // Replace "&" plus the matched name, then continue right after the replacement;
        // bytes read past the match are scanned again by parse().
        $this->data = substr_replace($this->data, $entities[$match], $start, strlen($match) + 1);
        $this->position = $start + strlen($entities[$match]);
    }
}