<?php
/**
 * Copyright (c) 2021. ComboStrap, Inc. and its affiliates. All Rights Reserved.
 *
 * This source code is licensed under the GPL license found in the
 * COPYING file in the root directory of this source tree.
 *
 * @license  GPL 3 (https://www.gnu.org/licenses/gpl-3.0.en.html)
 * @author   ComboStrap <support@combostrap.com>
 *
 */

namespace ComboStrap;

use DOMAttr;
use DOMDocument;
use DOMElement;
use DOMNodeList;
use DOMXPath;
use Exception;
use LibXMLError;


require_once(__DIR__ . '/File.php');

/**
 * A thin wrapper around a {@link DOMDocument} that loads XML or HTML markup
 * and offers helpers to query (xpath), modify (attributes, namespaces,
 * child nodes) and serialize the document.
 */
class XmlDocument
{
    const HTML_TYPE = "html";
    const XML_TYPE = "xml";
    /**
     * The libxml error messages that the HTML loading may return
     * and that are ignored (they are not considered as a loading failure)
     */
    const KNOWN_HTML_LOADING_ERRORS = [
        "Tag section invalid\n", // section is HTML5 tag
        "Tag footer invalid\n", // footer is HTML5 tag
        "error parsing attribute name\n", // name is an HTML5 attribute
        "Unexpected end tag : blockquote\n",
        "Tag bdi invalid\n",
        "Tag path invalid\n", // svg
        "Tag svg invalid\n", // svg
        "Unexpected end tag : a\n", // when the document is only a anchor
        "Unexpected end tag : p\n", // when the document is only a p
        "Unexpected end tag : button\n" // when the document is only a button

    ];

    const CANONICAL = "xml";

    /**
     * @var DOMDocument - stays null when the php xml extensions are not loaded
     */
    private $xmlDom = null;

    /**
     * XmlDocument constructor.
     *
     * Loads the markup into a {@link DOMDocument}. When the required php
     * extensions are missing, an error is logged and the internal dom stays null.
     *
     * @param $text - the markup to load
     * @param string $type - {@link XmlDocument::HTML_TYPE} or {@link XmlDocument::XML_TYPE}
     *                       (any value other than XML_TYPE is loaded as HTML)
     * @throws ExceptionCombo - if the load fails with an error that is not in {@link XmlDocument::KNOWN_HTML_LOADING_ERRORS}
     *
     * Getting the width of an error HTML document if the file was downloaded
     * from a server has no use at all
     */
    public function __construct($text, string $type = self::XML_TYPE)
    {

        if (!$this->isXmlExtensionLoaded()) {
            /**
             * If the XML module is not present
             */
            LogUtility::msg("The php `libxml` module was not found on your installation, the xml/svg file could not be modified / instantiated", LogUtility::LVL_MSG_ERROR, "support");
            return;
        }

        // https://www.php.net/manual/en/libxml.constants.php
        $options = LIBXML_NOCDATA
            // | LIBXML_NOBLANKS // removes blank nodes (same effect as preserveWhiteSpace=false), see mandatoryFormatConfigBeforeLoading
            | LIBXML_NOXMLDECL // Drop the XML declaration when saving a document
            | LIBXML_NONET // No network during load
            | LIBXML_NSCLEAN // Remove redundant namespace declarations
        ;

        // HTML
        if ($type === self::HTML_TYPE) {

            // Options that cause the process to hang if this is not for a html file
            // Empty tag option may also be used only on save
            // at https://www.php.net/manual/en/domdocument.save.php
            // and https://www.php.net/manual/en/domdocument.savexml.php
            $options = $options
                // | LIBXML_NOEMPTYTAG // Expand empty tags (e.g. <br/> to <br></br>)
                | LIBXML_HTML_NODEFDTD // No doctype
                | LIBXML_HTML_NOIMPLIED;

        }

        /**
         * No warning reporting
         * Load XML issue E_STRICT warning seen in the log
         */
        $errorReportingChanged = !defined('DOKU_UNITTEST');
        $oldLevel = null;
        if ($errorReportingChanged) {
            $oldLevel = error_reporting(E_ERROR);
        }

        try {

            $this->xmlDom = new DOMDocument('1.0', 'UTF-8');

            $this->mandatoryFormatConfigBeforeLoading();

            $text = $this->processTextBeforeLoading($text);

            /**
             * Because the load does handle HTML5 tag as error
             * (ie section for instance)
             * We take over the errors and handle them after the below load
             *
             * https://www.php.net/manual/en/function.libxml-use-internal-errors.php
             *
             * NOTE(review): the previous setting is not restored after the load,
             * other code may rely on it - to be confirmed before changing it
             *
             * @noinspection PhpComposerExtensionStubsInspection
             */
            libxml_use_internal_errors(true);

            if ($type === self::XML_TYPE) {

                $result = $this->xmlDom->loadXML($text, $options);

            } else {

                /**
                 * Unlike loading XML, HTML does not have to be well-formed to load.
                 * While malformed HTML should load successfully, this function may generate E_WARNING errors
                 * @deprecated as we try to be XHTML compliant but yeah this is not always possible
                 */

                /**
                 * Bug: Even if we set that the document is an UTF-8
                 * loadHTML treat the string as being in ISO-8859-1 if without any heading
                 * (ie <xml encoding="utf-8"..>
                 * https://stackoverflow.com/questions/8218230/php-domdocument-loadhtml-not-encoding-utf-8-correctly
                 * Otherwise French and other language are not well loaded
                 *
                 * We use the trick to transform the non-ASCII UTF-8 characters to numeric HTML entities.
                 * (`mb_convert_encoding` with the `HTML-ENTITIES` encoding is deprecated since php 8.2)
                 */
                $htmlEntityEncoded = mb_encode_numericentity($text, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
                $result = $this->xmlDom->loadHTML($htmlEntityEncoded, $options);

            }

            if ($result === false) {

                /** @noinspection PhpComposerExtensionStubsInspection */
                $errors = libxml_get_errors();

                foreach ($errors as $error) {

                    /* @var LibXMLError $error
                     * @noinspection PhpComposerExtensionStubsInspection
                     *
                     * Section is an html5 tag (and is invalid for libxml)
                     */
                    if (in_array($error->message, self::KNOWN_HTML_LOADING_ERRORS, true)) {
                        continue;
                    }

                    /**
                     * This error is an XML and HTML error
                     */
                    $isLonelyAmpersand = strpos($error->message, "htmlParseEntityRef: expecting ';' in Entity") !== false
                        || $error->message === "EntityRef: expecting ';'\n";
                    if ($isLonelyAmpersand) {
                        $message = "There is big probability that there is an ampersand alone `&`. ie You forgot to call html/Xml entities in a `src` or `url` attribute.";
                    } else {
                        $typeLabel = $type === self::XML_TYPE ? "XML" : "HTML";
                        $message = "Error while loading $typeLabel.";
                    }
                    $message .= " Error: " . $error->message . ", Loaded text: " . $text;

                    // The xml dom object is unusable, we would get NULL pointer exception everywhere
                    // just throw, the code will see it
                    // (the libxml errors are cleaned in the `finally` block below)
                    throw new ExceptionCombo($message, self::CANONICAL);

                }
            }

        } finally {

            /**
             * We clean the errors (known or not), otherwise they are added in a queue
             * and in a test series, they failed the next test
             *
             * @noinspection PhpComposerExtensionStubsInspection
             */
            libxml_clear_errors();

            /**
             * Error reporting back (also when the load has thrown)
             */
            if ($errorReportingChanged) {
                error_reporting($oldLevel);
            }

        }

        // namespace error : Namespace prefix dc on format is not defined
        // missing the ns declaration in the file. example:
        // xmlns:dc="http://purl.org/dc/elements/1.1/"

    }

    /**
     * To not have a collision with {@link SvgDocument::createSvgDocumentFromPath()}
     *
     * The document is loaded as HTML when the path extension is `html` or `htm`,
     * as XML otherwise.
     *
     * @param Path $path
     * @return XmlDocument
     */
    public static function createXmlDocFromPath(Path $path): XmlDocument
    {
        $mime = XmlDocument::XML_TYPE;
        if (in_array($path->getExtension(), ["html", "htm"])) {
            $mime = XmlDocument::HTML_TYPE;
        }
        $content = FileSystems::getContent($path);
        return new XmlDocument($content, $mime);
    }

    /**
     * @param $string - the markup
     * @param bool $asHtml - when true, the markup is loaded as HTML, otherwise as XML
     * @return XmlDocument
     * @throws ExceptionCombo
     */
    public static function createXmlDocFromMarkup($string, $asHtml = false): XmlDocument
    {

        $mime = XmlDocument::XML_TYPE;
        if ($asHtml) {
            $mime = XmlDocument::HTML_TYPE;
        }
        return new XmlDocument($string, $mime);
    }

    /**
     * @param $markup - the HTML markup
     * @return XmlDocument
     * @throws ExceptionCombo
     */
    public static function createHtmlDocFromMarkup($markup): XmlDocument
    {
        return self::createXmlDocFromMarkup($markup, true);
    }

    /**
     * @return DOMDocument|null the wrapped dom (null if the php xml extensions were not loaded at construction)
     */
    public function &getXmlDom()
    {
        return $this->xmlDom;
    }

    /**
     * Set an attribute on the root element
     * @param $name
     * @param $value
     */
    public function setRootAttribute($name, $value)
    {
        if ($this->isXmlExtensionLoaded()) {
            $this->xmlDom->documentElement->setAttribute($name, $value);
        }
    }

    /**
     * @param $name
     * @return string null if not found, if empty or if the document has no root element
     */
    public function getRootAttributeValue($name): ?string
    {
        if ($this->xmlDom === null || $this->xmlDom->documentElement === null) {
            return null;
        }
        $value = $this->xmlDom->documentElement->getAttribute($name);
        if ($value === "") {
            return null;
        }
        return $value;
    }

    /**
     * @return string the serialization of the root element, trimmed,
     * without xml declaration and without a leading doctype
     */
    public function getXmlText()
    {

        $xmlText = $this->getXmlDom()->saveXML(
            $this->getXmlDom()->documentElement,
            LIBXML_NOXMLDECL // no xml declaration
        );
        // Delete doctype (for svg optimization)
        // php has only doctype manipulation for HTML
        $xmlText = preg_replace('/^<!DOCTYPE.+?>/', '', $xmlText);
        return trim($xmlText);

    }

    /**
     * https://www.php.net/manual/en/dom.installation.php
     *
     * Check it with
     * ```
     * php -m
     * ```
     * Install with
     * ```
     * sudo apt-get install php-xml
     * ```
     * @return bool true if the `libxml`, `xml` and `dom` extensions are loaded.
     * A message is logged for the first missing extension.
     */
    public function isXmlExtensionLoaded()
    {
        // A suffix used in the bad message
        $suffixBadMessage = "php extension is not installed. To install it, you need to install xml. Example: `sudo apt-get install php-xml`, `yum install php-xml`";

        // https://www.php.net/manual/en/dom.requirements.php
        foreach (["libxml", "xml", "dom"] as $extension) {
            if (extension_loaded($extension) === false) {
                LogUtility::msg("The {$extension} {$suffixBadMessage}");
                return false;
            }
        }
        return true;
    }

    /**
     * Remove the elements, the attributes and the namespace declarations
     * of the given namespace
     *
     * https://stackoverflow.com/questions/30257438/how-to-completely-remove-a-namespace-using-domdocument
     * @param $namespaceUri
     */
    public function removeNamespace($namespaceUri)
    {
        if (empty($namespaceUri)) {
            throw new \RuntimeException("The namespace is empty and should be specified");
        }

        if (strpos($namespaceUri, "http") === false) {
            LogUtility::msg("Internal warning: The namespaceURI ($namespaceUri) does not seems to be an URI", LogUtility::LVL_MSG_WARNING, "support");
        }

        /**
         * All elements of the namespace
         * @var DOMNodeList $nodes
         */
        $nodes = $this->xpath("//*[namespace-uri()='$namespaceUri']");
        foreach ($nodes as $node) {
            /** @var DOMElement $node */
            $node->parentNode->removeChild($node);
        }

        /**
         * All attributes of the namespace
         */
        $nodes = $this->xpath("//@*[namespace-uri()='$namespaceUri']");
        foreach ($nodes as $node) {
            /** @var DOMAttr $node */
            /** @var DOMElement $DOMNode */
            $DOMNode = $node->parentNode;
            $DOMNode->removeAttributeNode($node);
        }

        // The namespace nodes are selected from the root element
        // (the former code passed `ownerDocument`, which is always null on a document:
        // php then evaluates the query relative to the root element)
        $xpath = new DOMXPath($this->getXmlDom());
        $DOMNodeList = $xpath->query("namespace::*", $this->getXmlDom()->documentElement);
        foreach ($DOMNodeList as $node) {
            $namespaceURI = $node->namespaceURI;
            if ($namespaceURI == $namespaceUri) {
                $parentNode = $node->parentNode;
                $parentNode->removeAttributeNS($namespaceUri, $node->localName);
            }
        }


    }

    /**
     * @return array the namespaces of the root element as `prefix => namespace uri`
     */
    public function getDocNamespaces()
    {
        $xpath = new DOMXPath($this->getXmlDom());
        // `namespace::*` means selects all the namespace attribute of the context node
        // namespace is an axes
        // See https://www.w3.org/TR/1999/REC-xpath-19991116/#axes
        // the namespace axis contains the namespace nodes of the context node; the axis will be empty unless the context node is an element
        $DOMNodeList = $xpath->query('namespace::*', $this->getXmlDom()->documentElement);
        $nameSpace = [];
        foreach ($DOMNodeList as $node) {

            $namespaceURI = $node->namespaceURI;
            $localName = $node->prefix;
            if ($namespaceURI != null) {
                $nameSpace[$localName] = $namespaceURI;
            }
        }
        return $nameSpace;
    }

    /**
     * A wrapper that register the namespaces of the document for the query
     * with their defined prefix
     * See comment:
     * https://www.php.net/manual/en/domxpath.registernamespace.php#51480
     * @param $query - the xpath expression
     * @return DOMNodeList|false
     *
     * Note that this is possible to do evaluation to return a string instead
     * https://www.php.net/manual/en/domxpath.evaluate.php
     */
    public function xpath($query)
    {
        $xpath = new DOMXPath($this->getXmlDom());

        /**
         * Prefix mapping
         * It is necessary to use xpath to handle documents which have default namespaces.
         * The xpath expression will search for items with no namespace by default.
         */
        foreach ($this->getDocNamespaces() as $prefix => $namespaceUri) {
            /**
             * You can't register an empty prefix
             * Default namespace (without a prefix) can only be accessed by the local-name() and namespace-uri() attributes.
             */
            if (empty($prefix)) {
                continue;
            }
            $result = $xpath->registerNamespace($prefix, $namespaceUri);
            if (!$result) {
                LogUtility::msg("Not able to register the prefix ($prefix) for the namespace uri ($namespaceUri)");
            }
        }

        return $xpath->query($query);

    }


    /**
     * Remove all attributes of the root element that have the given name
     * @param $attribute - the attribute name
     */
    public function removeRootAttribute($attribute)
    {

        // This function does not work
        // $result = $this->getXmlDom()->documentElement->removeAttribute($attribute);

        $root = $this->getXmlDom()->documentElement;
        $attributes = $root->attributes;

        // The attribute map is live: we iterate backwards, otherwise the attribute
        // that follows a removed one would be skipped.
        // There is no break here because you may find multiple version attribute for instance
        for ($i = $attributes->length - 1; $i >= 0; $i--) {
            $attr = $attributes->item($i);
            if (!($attr instanceof DOMAttr) || $attr->name != $attribute) {
                continue;
            }
            $result = $root->removeAttributeNode($attr);
            if ($result === false) {
                throw new \RuntimeException("Not able to delete the $attribute");
            }
        }

    }

    /**
     * Remove the first child of the root element that has the given node name
     * @param $nodeName
     */
    public function removeRootChildNode($nodeName)
    {
        $root = $this->getXmlDom()->documentElement;
        foreach ($root->childNodes as $childNode) {
            if ($childNode->nodeName != $nodeName) {
                continue;
            }
            $result = $root->removeChild($childNode);
            if ($result == false) {
                throw new \RuntimeException("Not able to delete the child node $nodeName");
            }
            // only the first one (the child list is live, we should not continue to iterate)
            break;
        }
    }

    /**
     *
     * Add a value to an attribute value
     * Example
     * <a class="actual">
     *
     * if you add "new"
     * <a class="actual new">
     *
     * The value is not added if it is already present.
     *
     * @param $attName
     * @param $attValue
     * @param DOMElement $xml
     */
    public function addAttributeValue($attName, $attValue, $xml)
    {

        /**
         * An empty string test is better than {@link DOMElement::hasAttribute()}
         * because even if the dom element has the attribute, the value
         * may be empty.
         * (`empty` is not used because the value `0` is a valid value)
         */
        $actualAttValue = (string)$xml->getAttribute($attName);
        if ($actualAttValue === "") {
            $xml->setAttribute($attName, $attValue);
            return;
        }

        $actualValues = explode(" ", $actualAttValue);
        if (!in_array($attValue, $actualValues)) {
            $xml->setAttribute($attName, $actualAttValue . " $attValue");
        }

    }

    /**
     * @param XmlDocument $rightDocument - the document to compare with
     * @return string the error variable filled by {@link XmlUtility::diffNode()} (initialized empty)
     */
    public function diff(XmlDocument $rightDocument)
    {
        $error = "";
        XmlUtility::diffNode($this->getXmlDom(), $rightDocument->getXmlDom(), $error);
        return $error;
    }

    /**
     * @return string a XML formatted
     *
     * !!!! The parameter preserveWhiteSpace should have been set to false before loading
     * https://www.php.net/manual/en/class.domdocument.php#domdocument.props.formatoutput
     * $this->xmlDom->preserveWhiteSpace = false;
     *
     * We do it with the function {@link XmlDocument::mandatoryFormatConfigBeforeLoading()}
     *
     */
    public function getXmlTextFormatted()
    {

        $this->xmlDom->formatOutput = true;
        return $this->getXmlText();

    }

    /**
     * @return string that can be diff
     *   * EOL diff are not seen
     *   * space are
     *
     * See also {@link XmlDocument::processTextBeforeLoading()}
     * that is needed before loading
     */
    public function getXmlTextNormalized()
    {

        // Note: a former (disabled) implementation serialized the children of the `body`
        // element when the text was a list of siblings without parent.

        $this->xmlDom->documentElement->normalize();
        return $this->getXmlTextFormatted();
    }

    /**
     * Not really conventional but
     * to be able to {@link getXmlTextNormalized}
     * the EOL should be deleted
     * We do it before loading and not with a XML documentation
     *
     * @param $text
     * @return string|string[]|null the text without the `DOKU_LF` character and with the blank lines collapsed
     */
    private function processTextBeforeLoading($text)
    {
        $text = str_replace(DOKU_LF, "", $text);
        $text = preg_replace("/\r\n\s*\r\n/", "\r\n", $text);
        $text = preg_replace("/\n\s*\n/", "\n", $text);
        $text = preg_replace("/\n\n/", "\n", $text);
        return $text;

    }


    /**
     * This function is called just before loading
     * in order to be able to {@link XmlDocument::getXmlTextFormatted() format the output }
     * https://www.php.net/manual/en/class.domdocument.php#domdocument.props.formatoutput
     * Mandatory for a good formatting before loading
     *
     */
    private function mandatoryFormatConfigBeforeLoading()
    {
        // note that
        // the loading option: LIBXML_NOBLANKS (remove blank nodes)
        // has the same effect as $this->xmlDom->preserveWhiteSpace = false;
        $this->xmlDom->preserveWhiteSpace = false;
    }

    /**
     * Remove the attribute from the element (nothing happens if the attribute does not exist)
     * @param string $attributeName
     * @param DOMElement $nodeElement
     */
    public function removeAttributeValue(string $attributeName, DOMElement $nodeElement)
    {
        $attr = $nodeElement->getAttributeNode($attributeName);
        if (!($attr instanceof DOMAttr)) {
            return;
        }
        $result = $nodeElement->removeAttributeNode($attr);
        if ($result === false) {
            // The node name is used: a DOMElement can not be converted to a string
            LogUtility::msg("Not able to delete the attribute $attributeName of the node element {$nodeElement->nodeName} in the Xml document");
        }
    }

    /**
     * @param string $string - the xpath expression
     * @return DOMElement|null the first selected element or null if none
     * @throws ExceptionCombo - if the expression is bad or selects a node that is not an element
     */
    public function queryXpath(string $string): ?DOMElement
    {

        $elements = $this->queryXpaths($string);
        if ($elements === null || count($elements) === 0) {
            return null;
        }
        return $elements[0];
    }

    /**
     * @param string $string - the xpath expression
     * @return null|DOMElement[] the selected elements or null if none
     * @throws ExceptionCombo - if the expression is bad or selects a node that is not an element
     */
    public function queryXpaths(string $string): ?array
    {
        $nodes = $this->xpath($string);
        if ($nodes === false) {
            throw new ExceptionCombo("Bad xpath expression ($string)");
        }
        if ($nodes->count() === 0) {
            return null;
        }
        $elements = [];
        foreach ($nodes as $element) {
            if (!($element instanceof DOMElement)) {
                throw new ExceptionCombo("The xpath expression has selected a Node that is not an element");
            }
            $elements[] = $element;
        }

        return $elements;
    }


}