2026-03-24 10:00:08 +00:00

567 lines
22 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

<?php
// SPDX-FileCopyrightText: 2004-2023 Ryan Parman, Sam Sneddon, Ryan McCue
// SPDX-License-Identifier: BSD-3-Clause
declare(strict_types=1);
/**
* Decode HTML Entities
*
* This implements HTML5 as of revision 967 (2007-06-28)
*
* @deprecated Use DOMDocument instead!
*/
class SimplePie_Decode_HTML_Entities
{
/**
* Data to be parsed
*
* @access private
* @var string
*/
public $data = '';
/**
* Currently consumed bytes
*
* @access private
* @var string
*/
public $consumed = '';
/**
* Position of the current byte being parsed
*
* @access private
* @var int
*/
public $position = 0;
/**
* Create an instance of the class with the input data
*
* @access public
* @param string $data Input data
*/
public function __construct(string $data)
{
$this->data = $data;
}
/**
* Parse the input data
*
* @access public
* @return string Output data
*/
public function parse()
{
while (($position = strpos($this->data, '&', $this->position)) !== false) {
$this->position = $position;
$this->consume();
$this->entity();
$this->consumed = '';
}
return $this->data;
}
/**
* Consume the next byte
*
* @access private
* @return string|false The next byte, or false, if there is no more data
*/
public function consume()
{
if (isset($this->data[$this->position])) {
$this->consumed .= $this->data[$this->position];
return $this->data[$this->position++];
}
return false;
}
/**
* Consume a range of characters
*
* @access private
* @param string $chars Characters to consume
* @return string|false A series of characters that match the range, or false
*/
public function consume_range(string $chars)
{
if ($len = strspn($this->data, $chars, $this->position)) {
$data = substr($this->data, $this->position, $len);
$this->consumed .= $data;
$this->position += $len;
return $data;
}
return false;
}
/**
* Unconsume one byte
*
* @access private
* @return void
*/
public function unconsume()
{
$this->consumed = substr($this->consumed, 0, -1);
$this->position--;
}
/**
* Decode an entity
*
* @access private
* @return void
*/
public function entity()
{
switch ($this->consume()) {
case "\x09":
case "\x0A":
case "\x0B":
case "\x0C":
case "\x20":
case "\x3C":
case "\x26":
case false:
break;
case "\x23":
switch ($this->consume()) {
case "\x78":
case "\x58":
$range = '0123456789ABCDEFabcdef';
$hex = true;
break;
default:
$range = '0123456789';
$hex = false;
$this->unconsume();
break;
}
if ($codepoint = $this->consume_range($range)) {
static $windows_1252_specials = [0x0D => "\x0A", 0x80 => "\xE2\x82\xAC", 0x81 => "\xEF\xBF\xBD", 0x82 => "\xE2\x80\x9A", 0x83 => "\xC6\x92", 0x84 => "\xE2\x80\x9E", 0x85 => "\xE2\x80\xA6", 0x86 => "\xE2\x80\xA0", 0x87 => "\xE2\x80\xA1", 0x88 => "\xCB\x86", 0x89 => "\xE2\x80\xB0", 0x8A => "\xC5\xA0", 0x8B => "\xE2\x80\xB9", 0x8C => "\xC5\x92", 0x8D => "\xEF\xBF\xBD", 0x8E => "\xC5\xBD", 0x8F => "\xEF\xBF\xBD", 0x90 => "\xEF\xBF\xBD", 0x91 => "\xE2\x80\x98", 0x92 => "\xE2\x80\x99", 0x93 => "\xE2\x80\x9C", 0x94 => "\xE2\x80\x9D", 0x95 => "\xE2\x80\xA2", 0x96 => "\xE2\x80\x93", 0x97 => "\xE2\x80\x94", 0x98 => "\xCB\x9C", 0x99 => "\xE2\x84\xA2", 0x9A => "\xC5\xA1", 0x9B => "\xE2\x80\xBA", 0x9C => "\xC5\x93", 0x9D => "\xEF\xBF\xBD", 0x9E => "\xC5\xBE", 0x9F => "\xC5\xB8"];
if ($hex) {
// Cap to PHP_INT_MAX to ensure consistent behaviour if $codepoint is so large
// it cannot fit into int just casting float to int might return junk (e.g. a negative number).
// If it is so large, `Misc::codepoint_to_utf8` will just return a replacement character.
$codepoint = (int) min(hexdec($codepoint), \PHP_INT_MAX);
} else {
// Casting string to int caps at PHP_INT_MAX automatically.
$codepoint = (int) $codepoint;
}
if (isset($windows_1252_specials[$codepoint])) {
$replacement = $windows_1252_specials[$codepoint];
} else {
$replacement = SimplePie_Misc::codepoint_to_utf8($codepoint);
}
if (!in_array($this->consume(), [';', false], true)) {
$this->unconsume();
}
$consumed_length = strlen($this->consumed);
$this->data = substr_replace($this->data, $replacement, $this->position - $consumed_length, $consumed_length);
$this->position += strlen($replacement) - $consumed_length;
}
break;
default:
static $entities = [
'Aacute' => "\xC3\x81",
'aacute' => "\xC3\xA1",
'Aacute;' => "\xC3\x81",
'aacute;' => "\xC3\xA1",
'Acirc' => "\xC3\x82",
'acirc' => "\xC3\xA2",
'Acirc;' => "\xC3\x82",
'acirc;' => "\xC3\xA2",
'acute' => "\xC2\xB4",
'acute;' => "\xC2\xB4",
'AElig' => "\xC3\x86",
'aelig' => "\xC3\xA6",
'AElig;' => "\xC3\x86",
'aelig;' => "\xC3\xA6",
'Agrave' => "\xC3\x80",
'agrave' => "\xC3\xA0",
'Agrave;' => "\xC3\x80",
'agrave;' => "\xC3\xA0",
'alefsym;' => "\xE2\x84\xB5",
'Alpha;' => "\xCE\x91",
'alpha;' => "\xCE\xB1",
'AMP' => "\x26",
'amp' => "\x26",
'AMP;' => "\x26",
'amp;' => "\x26",
'and;' => "\xE2\x88\xA7",
'ang;' => "\xE2\x88\xA0",
'apos;' => "\x27",
'Aring' => "\xC3\x85",
'aring' => "\xC3\xA5",
'Aring;' => "\xC3\x85",
'aring;' => "\xC3\xA5",
'asymp;' => "\xE2\x89\x88",
'Atilde' => "\xC3\x83",
'atilde' => "\xC3\xA3",
'Atilde;' => "\xC3\x83",
'atilde;' => "\xC3\xA3",
'Auml' => "\xC3\x84",
'auml' => "\xC3\xA4",
'Auml;' => "\xC3\x84",
'auml;' => "\xC3\xA4",
'bdquo;' => "\xE2\x80\x9E",
'Beta;' => "\xCE\x92",
'beta;' => "\xCE\xB2",
'brvbar' => "\xC2\xA6",
'brvbar;' => "\xC2\xA6",
'bull;' => "\xE2\x80\xA2",
'cap;' => "\xE2\x88\xA9",
'Ccedil' => "\xC3\x87",
'ccedil' => "\xC3\xA7",
'Ccedil;' => "\xC3\x87",
'ccedil;' => "\xC3\xA7",
'cedil' => "\xC2\xB8",
'cedil;' => "\xC2\xB8",
'cent' => "\xC2\xA2",
'cent;' => "\xC2\xA2",
'Chi;' => "\xCE\xA7",
'chi;' => "\xCF\x87",
'circ;' => "\xCB\x86",
'clubs;' => "\xE2\x99\xA3",
'cong;' => "\xE2\x89\x85",
'COPY' => "\xC2\xA9",
'copy' => "\xC2\xA9",
'COPY;' => "\xC2\xA9",
'copy;' => "\xC2\xA9",
'crarr;' => "\xE2\x86\xB5",
'cup;' => "\xE2\x88\xAA",
'curren' => "\xC2\xA4",
'curren;' => "\xC2\xA4",
'Dagger;' => "\xE2\x80\xA1",
'dagger;' => "\xE2\x80\xA0",
'dArr;' => "\xE2\x87\x93",
'darr;' => "\xE2\x86\x93",
'deg' => "\xC2\xB0",
'deg;' => "\xC2\xB0",
'Delta;' => "\xCE\x94",
'delta;' => "\xCE\xB4",
'diams;' => "\xE2\x99\xA6",
'divide' => "\xC3\xB7",
'divide;' => "\xC3\xB7",
'Eacute' => "\xC3\x89",
'eacute' => "\xC3\xA9",
'Eacute;' => "\xC3\x89",
'eacute;' => "\xC3\xA9",
'Ecirc' => "\xC3\x8A",
'ecirc' => "\xC3\xAA",
'Ecirc;' => "\xC3\x8A",
'ecirc;' => "\xC3\xAA",
'Egrave' => "\xC3\x88",
'egrave' => "\xC3\xA8",
'Egrave;' => "\xC3\x88",
'egrave;' => "\xC3\xA8",
'empty;' => "\xE2\x88\x85",
'emsp;' => "\xE2\x80\x83",
'ensp;' => "\xE2\x80\x82",
'Epsilon;' => "\xCE\x95",
'epsilon;' => "\xCE\xB5",
'equiv;' => "\xE2\x89\xA1",
'Eta;' => "\xCE\x97",
'eta;' => "\xCE\xB7",
'ETH' => "\xC3\x90",
'eth' => "\xC3\xB0",
'ETH;' => "\xC3\x90",
'eth;' => "\xC3\xB0",
'Euml' => "\xC3\x8B",
'euml' => "\xC3\xAB",
'Euml;' => "\xC3\x8B",
'euml;' => "\xC3\xAB",
'euro;' => "\xE2\x82\xAC",
'exist;' => "\xE2\x88\x83",
'fnof;' => "\xC6\x92",
'forall;' => "\xE2\x88\x80",
'frac12' => "\xC2\xBD",
'frac12;' => "\xC2\xBD",
'frac14' => "\xC2\xBC",
'frac14;' => "\xC2\xBC",
'frac34' => "\xC2\xBE",
'frac34;' => "\xC2\xBE",
'frasl;' => "\xE2\x81\x84",
'Gamma;' => "\xCE\x93",
'gamma;' => "\xCE\xB3",
'ge;' => "\xE2\x89\xA5",
'GT' => "\x3E",
'gt' => "\x3E",
'GT;' => "\x3E",
'gt;' => "\x3E",
'hArr;' => "\xE2\x87\x94",
'harr;' => "\xE2\x86\x94",
'hearts;' => "\xE2\x99\xA5",
'hellip;' => "\xE2\x80\xA6",
'Iacute' => "\xC3\x8D",
'iacute' => "\xC3\xAD",
'Iacute;' => "\xC3\x8D",
'iacute;' => "\xC3\xAD",
'Icirc' => "\xC3\x8E",
'icirc' => "\xC3\xAE",
'Icirc;' => "\xC3\x8E",
'icirc;' => "\xC3\xAE",
'iexcl' => "\xC2\xA1",
'iexcl;' => "\xC2\xA1",
'Igrave' => "\xC3\x8C",
'igrave' => "\xC3\xAC",
'Igrave;' => "\xC3\x8C",
'igrave;' => "\xC3\xAC",
'image;' => "\xE2\x84\x91",
'infin;' => "\xE2\x88\x9E",
'int;' => "\xE2\x88\xAB",
'Iota;' => "\xCE\x99",
'iota;' => "\xCE\xB9",
'iquest' => "\xC2\xBF",
'iquest;' => "\xC2\xBF",
'isin;' => "\xE2\x88\x88",
'Iuml' => "\xC3\x8F",
'iuml' => "\xC3\xAF",
'Iuml;' => "\xC3\x8F",
'iuml;' => "\xC3\xAF",
'Kappa;' => "\xCE\x9A",
'kappa;' => "\xCE\xBA",
'Lambda;' => "\xCE\x9B",
'lambda;' => "\xCE\xBB",
'lang;' => "\xE3\x80\x88",
'laquo' => "\xC2\xAB",
'laquo;' => "\xC2\xAB",
'lArr;' => "\xE2\x87\x90",
'larr;' => "\xE2\x86\x90",
'lceil;' => "\xE2\x8C\x88",
'ldquo;' => "\xE2\x80\x9C",
'le;' => "\xE2\x89\xA4",
'lfloor;' => "\xE2\x8C\x8A",
'lowast;' => "\xE2\x88\x97",
'loz;' => "\xE2\x97\x8A",
'lrm;' => "\xE2\x80\x8E",
'lsaquo;' => "\xE2\x80\xB9",
'lsquo;' => "\xE2\x80\x98",
'LT' => "\x3C",
'lt' => "\x3C",
'LT;' => "\x3C",
'lt;' => "\x3C",
'macr' => "\xC2\xAF",
'macr;' => "\xC2\xAF",
'mdash;' => "\xE2\x80\x94",
'micro' => "\xC2\xB5",
'micro;' => "\xC2\xB5",
'middot' => "\xC2\xB7",
'middot;' => "\xC2\xB7",
'minus;' => "\xE2\x88\x92",
'Mu;' => "\xCE\x9C",
'mu;' => "\xCE\xBC",
'nabla;' => "\xE2\x88\x87",
'nbsp' => "\xC2\xA0",
'nbsp;' => "\xC2\xA0",
'ndash;' => "\xE2\x80\x93",
'ne;' => "\xE2\x89\xA0",
'ni;' => "\xE2\x88\x8B",
'not' => "\xC2\xAC",
'not;' => "\xC2\xAC",
'notin;' => "\xE2\x88\x89",
'nsub;' => "\xE2\x8A\x84",
'Ntilde' => "\xC3\x91",
'ntilde' => "\xC3\xB1",
'Ntilde;' => "\xC3\x91",
'ntilde;' => "\xC3\xB1",
'Nu;' => "\xCE\x9D",
'nu;' => "\xCE\xBD",
'Oacute' => "\xC3\x93",
'oacute' => "\xC3\xB3",
'Oacute;' => "\xC3\x93",
'oacute;' => "\xC3\xB3",
'Ocirc' => "\xC3\x94",
'ocirc' => "\xC3\xB4",
'Ocirc;' => "\xC3\x94",
'ocirc;' => "\xC3\xB4",
'OElig;' => "\xC5\x92",
'oelig;' => "\xC5\x93",
'Ograve' => "\xC3\x92",
'ograve' => "\xC3\xB2",
'Ograve;' => "\xC3\x92",
'ograve;' => "\xC3\xB2",
'oline;' => "\xE2\x80\xBE",
'Omega;' => "\xCE\xA9",
'omega;' => "\xCF\x89",
'Omicron;' => "\xCE\x9F",
'omicron;' => "\xCE\xBF",
'oplus;' => "\xE2\x8A\x95",
'or;' => "\xE2\x88\xA8",
'ordf' => "\xC2\xAA",
'ordf;' => "\xC2\xAA",
'ordm' => "\xC2\xBA",
'ordm;' => "\xC2\xBA",
'Oslash' => "\xC3\x98",
'oslash' => "\xC3\xB8",
'Oslash;' => "\xC3\x98",
'oslash;' => "\xC3\xB8",
'Otilde' => "\xC3\x95",
'otilde' => "\xC3\xB5",
'Otilde;' => "\xC3\x95",
'otilde;' => "\xC3\xB5",
'otimes;' => "\xE2\x8A\x97",
'Ouml' => "\xC3\x96",
'ouml' => "\xC3\xB6",
'Ouml;' => "\xC3\x96",
'ouml;' => "\xC3\xB6",
'para' => "\xC2\xB6",
'para;' => "\xC2\xB6",
'part;' => "\xE2\x88\x82",
'permil;' => "\xE2\x80\xB0",
'perp;' => "\xE2\x8A\xA5",
'Phi;' => "\xCE\xA6",
'phi;' => "\xCF\x86",
'Pi;' => "\xCE\xA0",
'pi;' => "\xCF\x80",
'piv;' => "\xCF\x96",
'plusmn' => "\xC2\xB1",
'plusmn;' => "\xC2\xB1",
'pound' => "\xC2\xA3",
'pound;' => "\xC2\xA3",
'Prime;' => "\xE2\x80\xB3",
'prime;' => "\xE2\x80\xB2",
'prod;' => "\xE2\x88\x8F",
'prop;' => "\xE2\x88\x9D",
'Psi;' => "\xCE\xA8",
'psi;' => "\xCF\x88",
'QUOT' => "\x22",
'quot' => "\x22",
'QUOT;' => "\x22",
'quot;' => "\x22",
'radic;' => "\xE2\x88\x9A",
'rang;' => "\xE3\x80\x89",
'raquo' => "\xC2\xBB",
'raquo;' => "\xC2\xBB",
'rArr;' => "\xE2\x87\x92",
'rarr;' => "\xE2\x86\x92",
'rceil;' => "\xE2\x8C\x89",
'rdquo;' => "\xE2\x80\x9D",
'real;' => "\xE2\x84\x9C",
'REG' => "\xC2\xAE",
'reg' => "\xC2\xAE",
'REG;' => "\xC2\xAE",
'reg;' => "\xC2\xAE",
'rfloor;' => "\xE2\x8C\x8B",
'Rho;' => "\xCE\xA1",
'rho;' => "\xCF\x81",
'rlm;' => "\xE2\x80\x8F",
'rsaquo;' => "\xE2\x80\xBA",
'rsquo;' => "\xE2\x80\x99",
'sbquo;' => "\xE2\x80\x9A",
'Scaron;' => "\xC5\xA0",
'scaron;' => "\xC5\xA1",
'sdot;' => "\xE2\x8B\x85",
'sect' => "\xC2\xA7",
'sect;' => "\xC2\xA7",
'shy' => "\xC2\xAD",
'shy;' => "\xC2\xAD",
'Sigma;' => "\xCE\xA3",
'sigma;' => "\xCF\x83",
'sigmaf;' => "\xCF\x82",
'sim;' => "\xE2\x88\xBC",
'spades;' => "\xE2\x99\xA0",
'sub;' => "\xE2\x8A\x82",
'sube;' => "\xE2\x8A\x86",
'sum;' => "\xE2\x88\x91",
'sup;' => "\xE2\x8A\x83",
'sup1' => "\xC2\xB9",
'sup1;' => "\xC2\xB9",
'sup2' => "\xC2\xB2",
'sup2;' => "\xC2\xB2",
'sup3' => "\xC2\xB3",
'sup3;' => "\xC2\xB3",
'supe;' => "\xE2\x8A\x87",
'szlig' => "\xC3\x9F",
'szlig;' => "\xC3\x9F",
'Tau;' => "\xCE\xA4",
'tau;' => "\xCF\x84",
'there4;' => "\xE2\x88\xB4",
'Theta;' => "\xCE\x98",
'theta;' => "\xCE\xB8",
'thetasym;' => "\xCF\x91",
'thinsp;' => "\xE2\x80\x89",
'THORN' => "\xC3\x9E",
'thorn' => "\xC3\xBE",
'THORN;' => "\xC3\x9E",
'thorn;' => "\xC3\xBE",
'tilde;' => "\xCB\x9C",
'times' => "\xC3\x97",
'times;' => "\xC3\x97",
'TRADE;' => "\xE2\x84\xA2",
'trade;' => "\xE2\x84\xA2",
'Uacute' => "\xC3\x9A",
'uacute' => "\xC3\xBA",
'Uacute;' => "\xC3\x9A",
'uacute;' => "\xC3\xBA",
'uArr;' => "\xE2\x87\x91",
'uarr;' => "\xE2\x86\x91",
'Ucirc' => "\xC3\x9B",
'ucirc' => "\xC3\xBB",
'Ucirc;' => "\xC3\x9B",
'ucirc;' => "\xC3\xBB",
'Ugrave' => "\xC3\x99",
'ugrave' => "\xC3\xB9",
'Ugrave;' => "\xC3\x99",
'ugrave;' => "\xC3\xB9",
'uml' => "\xC2\xA8",
'uml;' => "\xC2\xA8",
'upsih;' => "\xCF\x92",
'Upsilon;' => "\xCE\xA5",
'upsilon;' => "\xCF\x85",
'Uuml' => "\xC3\x9C",
'uuml' => "\xC3\xBC",
'Uuml;' => "\xC3\x9C",
'uuml;' => "\xC3\xBC",
'weierp;' => "\xE2\x84\x98",
'Xi;' => "\xCE\x9E",
'xi;' => "\xCE\xBE",
'Yacute' => "\xC3\x9D",
'yacute' => "\xC3\xBD",
'Yacute;' => "\xC3\x9D",
'yacute;' => "\xC3\xBD",
'yen' => "\xC2\xA5",
'yen;' => "\xC2\xA5",
'yuml' => "\xC3\xBF",
'Yuml;' => "\xC5\xB8",
'yuml;' => "\xC3\xBF",
'Zeta;' => "\xCE\x96",
'zeta;' => "\xCE\xB6",
'zwj;' => "\xE2\x80\x8D",
'zwnj;' => "\xE2\x80\x8C"
];
for ($i = 0, $match = null; $i < 9 && $this->consume() !== false; $i++) {
// Cast for PHPStan on PHP < 8.0: We consumed as per the loop condition,
// so `$this->consumed` is non-empty and the substr offset is valid.
$consumed = (string) substr($this->consumed, 1);
if (isset($entities[$consumed])) {
$match = $consumed;
}
}
if ($match !== null) {
$this->data = substr_replace($this->data, $entities[$match], $this->position - strlen($consumed) - 1, strlen($match) + 1);
$this->position += strlen($entities[$match]) - strlen($consumed) - 1;
}
break;
}
}
}