* */

namespace ComboStrap;

use DOMAttr;
use DOMDocument;
use DOMElement;
use DOMNodeList;
use DOMXPath;
use Exception;
use LibXMLError;

require_once(__DIR__ . '/File.php');

/**
 * A thin wrapper around {@link DOMDocument} that loads XML or HTML markup
 * and exposes helpers to query it (xpath), to modify it (attributes,
 * namespaces, child nodes) and to serialize it back to text.
 */
class XmlDocument
{
    const HTML_TYPE = "html";
    const XML_TYPE = "xml";

    /**
     * The libxml error messages that the HTML loading may return
     * and that are ignored (mostly HTML5 / SVG tags unknown to libxml)
     */
    const KNOWN_HTML_LOADING_ERRORS = [
        "Tag section invalid\n", // section is HTML5 tag
        "Tag footer invalid\n", // footer is HTML5 tag
        "error parsing attribute name\n", // name is an HTML5 attribute
        "Unexpected end tag : blockquote\n",
        "Tag bdi invalid\n",
        "Tag path invalid\n", // svg
        "Tag svg invalid\n", // svg
        "Unexpected end tag : a\n", // when the document is only a anchor
        "Unexpected end tag : p\n", // when the document is only a p
        "Unexpected end tag : button\n" // when the document is only a button
    ];

    const CANONICAL = "xml";

    /**
     * @var DOMDocument|null - stays null when the php xml extensions are not loaded
     */
    private $xmlDom = null;

    /**
     * XmlDocument constructor.
     *
     * Loads the markup into a {@link DOMDocument}. When the xml php extensions
     * are missing, an error is logged and the internal DOM stays null.
     *
     * @param $text - the markup to load
     * @param string $type - {@link XmlDocument::XML_TYPE} to load as XML, any other value loads as HTML
     * @throws ExceptionCombo - if the load fails with an error that is not in {@link XmlDocument::KNOWN_HTML_LOADING_ERRORS}
     */
    public function __construct($text, string $type = self::XML_TYPE)
    {
        if (!$this->isXmlExtensionLoaded()) {
            /**
             * If the XML module is not present
             */
            LogUtility::msg("The php `libxml` module was not found on your installation, the xml/svg file could not be modified / instantiated", LogUtility::LVL_MSG_ERROR, "support");
            return;
        }

        // https://www.php.net/manual/en/libxml.constants.php
        $options = LIBXML_NOCDATA
            // | LIBXML_NOBLANKS // drops blank nodes, not set to be able to format the output
            | LIBXML_NOXMLDECL // Drop the XML declaration when saving a document
            | LIBXML_NONET // No network during load
            | LIBXML_NSCLEAN; // Remove redundant namespace declarations

        if ($type === self::HTML_TYPE) {
            // Options that cause the process to hang if this is not for a html file
            // Empty tag option may also be used only on save
            // at https://www.php.net/manual/en/domdocument.save.php
            // and https://www.php.net/manual/en/domdocument.savexml.php
            $options = $options
                // | LIBXML_NOEMPTYTAG // Expand empty tags (e.g. <br/> to <br></br>)
                | LIBXML_HTML_NODEFDTD // No doctype
                | LIBXML_HTML_NOIMPLIED;
        }

        /**
         * No warning reporting
         * Load XML issue E_STRICT warning seen in the log
         */
        $lowerErrorReporting = !defined('DOKU_UNITTEST');
        $oldLevel = null;
        if ($lowerErrorReporting) {
            $oldLevel = error_reporting(E_ERROR);
        }

        try {

            $this->xmlDom = new DOMDocument('1.0', 'UTF-8');
            $this->mandatoryFormatConfigBeforeLoading();
            $text = $this->processTextBeforeLoading($text);

            /**
             * Because the load does handle HTML5 tag as error
             * (ie section for instance)
             * We take over the errors and handle them after the below load
             *
             * https://www.php.net/manual/en/function.libxml-use-internal-errors.php
             *
             * @noinspection PhpComposerExtensionStubsInspection
             */
            libxml_use_internal_errors(true);

            if ($type === self::XML_TYPE) {

                $result = $this->xmlDom->loadXML($text, $options);

            } else {

                /**
                 * Unlike loading XML, HTML does not have to be well-formed to load.
                 * While malformed HTML should load successfully, this function may generate E_WARNING errors
                 *
                 * Bug: Even if we set that the document is an UTF-8
                 * loadHTML treats the string as being in ISO-8859-1 if without any heading
                 * https://stackoverflow.com/questions/8218230/php-domdocument-loadhtml-not-encoding-utf-8-correctly
                 * Otherwise French and other language are not well loaded
                 *
                 * We use the trick to transform every non-ASCII character into a numeric HTML entity.
                 * (`mb_convert_encoding` with `HTML-ENTITIES` is deprecated since PHP 8.2)
                 */
                $htmlEntityEncoded = mb_encode_numericentity($text, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
                $result = $this->xmlDom->loadHTML($htmlEntityEncoded, $options);

            }

            if ($result === false) {

                /** @noinspection PhpComposerExtensionStubsInspection */
                $errors = libxml_get_errors();

                foreach ($errors as $error) {

                    /* @var LibXMLError $error
                     * @noinspection PhpComposerExtensionStubsInspection
                     *
                     * Section is an html5 tag (and is invalid for libxml)
                     */
                    if (in_array($error->message, self::KNOWN_HTML_LOADING_ERRORS, true)) {
                        continue;
                    }

                    /**
                     * This error is an XML and HTML error
                     */
                    if (
                        strpos($error->message, "htmlParseEntityRef: expecting ';' in Entity") !== false
                        || $error->message == "EntityRef: expecting ';'\n"
                    ) {
                        $message = "There is big probability that there is an ampersand alone `&`. ie You forgot to call html/Xml entities in a `src` or `url` attribute.";
                    } else {
                        $message = "Error while loading HTML.";
                    }
                    $message .= " Error: " . $error->message . ", Loaded text: " . $text;

                    // The xml dom object is unusable, we would get NULL pointer exception everywhere
                    // just throw, the code will see it
                    // (the libxml errors are cleared and the error reporting restored in the `finally` block)
                    throw new ExceptionCombo($message, self::CANONICAL);

                }
            }

        } finally {

            /**
             * We clean the errors (otherwise they are added in a queue
             * and in a test series, they fail the next test)
             *
             * @noinspection PhpComposerExtensionStubsInspection
             */
            libxml_clear_errors();

            /**
             * Error reporting back, also when an exception is thrown
             */
            if ($lowerErrorReporting) {
                error_reporting($oldLevel);
            }

        }

        // namespace error : Namespace prefix dc on format is not defined
        // missing the ns declaration in the file. example:
        // xmlns:dc="http://purl.org/dc/elements/1.1/"
    }

    /**
     * To not have a collusion with {@link SvgDocument::createSvgDocumentFromPath()}
     *
     * The content is loaded as HTML when the extension of the path is `html` or `htm`,
     * as XML otherwise.
     *
     * @param Path $path
     * @return XmlDocument
     * @throws ExceptionCombo - if the content could not be loaded
     */
    public static function createXmlDocFromPath(Path $path): XmlDocument
    {
        $mime = XmlDocument::XML_TYPE;
        if (in_array($path->getExtension(), ["html", "htm"], true)) {
            $mime = XmlDocument::HTML_TYPE;
        }
        $content = FileSystems::getContent($path);
        return new XmlDocument($content, $mime);
    }

    /**
     * @param $string - the markup
     * @param bool $asHtml - true to load the markup as HTML, false (default) as XML
     * @return XmlDocument
     * @throws ExceptionCombo
     */
    public static function createXmlDocFromMarkup($string, $asHtml = false): XmlDocument
    {
        $mime = $asHtml ? XmlDocument::HTML_TYPE : XmlDocument::XML_TYPE;
        return new XmlDocument($string, $mime);
    }

    /**
     * Shortcut of {@link XmlDocument::createXmlDocFromMarkup()} with the HTML type
     *
     * @param $markup
     * @return XmlDocument
     * @throws ExceptionCombo
     */
    public static function createHtmlDocFromMarkup($markup): XmlDocument
    {
        return self::createXmlDocFromMarkup($markup, true);
    }

    /**
     * @return DOMDocument|null the underlying DOM (returned by reference)
     */
    public function &getXmlDom()
    {
        return $this->xmlDom;
    }

    /**
     * Set an attribute on the root element
     * (does nothing if the xml extensions are not loaded)
     *
     * @param $name
     * @param $value
     */
    public function setRootAttribute($name, $value)
    {
        if ($this->isXmlExtensionLoaded()) {
            $this->xmlDom->documentElement->setAttribute($name, $value);
        }
    }

    /**
     * @param $name
     * @return string|null the value of the root attribute, null if not found or empty
     */
    public function getRootAttributeValue($name): ?string
    {
        $value = $this->xmlDom->documentElement->getAttribute($name);
        if ($value === "") {
            return null;
        }
        return $value;
    }

    /**
     * @return string the serialization of the root element (trimmed)
     */
    public function getXmlText()
    {
        $xmlText = $this->getXmlDom()->saveXML(
            $this->getXmlDom()->documentElement,
            LIBXML_NOXMLDECL // no xml declaration
        );
        // Delete doctype (for svg optimization)
        // php has only doctype manipulation for HTML
        $xmlText = preg_replace('/^<!DOCTYPE.+?>/', '', $xmlText);
        return trim($xmlText);
    }

    /**
     * https://www.php.net/manual/en/dom.installation.php
     *
     * Check it with
     * ```
     * php -m
     * ```
     * Install with
     * ```
     * sudo apt-get install php-xml
     * ```
     * @return bool true if the `libxml`, `xml` and `dom` extensions are all loaded.
     * A message is logged for the first missing one.
     */
    public function isXmlExtensionLoaded()
    {
        // A suffix used in the bad message
        $suffixBadMessage = "php extension is not installed. To install it, you need to install xml. Example: `sudo apt-get install php-xml`, `yum install php-xml`";

        // https://www.php.net/manual/en/dom.requirements.php
        foreach (["libxml", "xml", "dom"] as $extension) {
            if (extension_loaded($extension) === false) {
                LogUtility::msg("The {$extension} {$suffixBadMessage}");
                return false;
            }
        }
        return true;
    }

    /**
     * Remove the elements, the attributes and the declarations of a namespace
     *
     * https://stackoverflow.com/questions/30257438/how-to-completely-remove-a-namespace-using-domdocument
     * @param $namespaceUri
     */
    public function removeNamespace($namespaceUri)
    {
        if (empty($namespaceUri)) {
            throw new \RuntimeException("The namespace is empty and should be specified");
        }

        if (strpos($namespaceUri, "http") === false) {
            LogUtility::msg("Internal warning: The namespaceURI ($namespaceUri) does not seems to be an URI", LogUtility::LVL_MSG_WARNING, "support");
        }

        /**
         * The elements of the namespace
         * @var DOMNodeList $nodes
         */
        $nodes = $this->xpath("//*[namespace-uri()='$namespaceUri']");
        foreach ($nodes as $node) {
            /** @var DOMElement $node */
            $node->parentNode->removeChild($node);
        }

        /**
         * The attributes of the namespace
         */
        $nodes = $this->xpath("//@*[namespace-uri()='$namespaceUri']");
        foreach ($nodes as $node) {
            /** @var DOMAttr $node */
            /** @var DOMElement $ownerElement */
            $ownerElement = $node->parentNode;
            $ownerElement->removeAttributeNode($node);
        }

        /**
         * The namespace declarations.
         * The namespace axis is empty unless the context node is an element:
         * the root element is the context node
         */
        $xpath = new DOMXPath($this->getXmlDom());
        $namespaceNodes = $xpath->query("namespace::*", $this->getXmlDom()->documentElement);
        foreach ($namespaceNodes as $node) {
            if ($node->namespaceURI == $namespaceUri) {
                $node->parentNode->removeAttributeNS($namespaceUri, $node->localName);
            }
        }
    }

    /**
     * @return array the namespaces in scope on the root element (prefix as key, namespace uri as value)
     */
    public function getDocNamespaces()
    {
        $xpath = new DOMXPath($this->getXmlDom());
        // `namespace::*` means selects all the namespace attribute of the context node
        // namespace is an axes
        // See https://www.w3.org/TR/1999/REC-xpath-19991116/#axes
        // the namespace axis contains the namespace nodes of the context node;
        // the axis will be empty unless the context node is an element: the root element is the context node
        $namespaceNodes = $xpath->query('namespace::*', $this->getXmlDom()->documentElement);
        $nameSpace = [];
        foreach ($namespaceNodes as $node) {
            $namespaceURI = $node->namespaceURI;
            if ($namespaceURI !== null && $namespaceURI !== '') {
                $nameSpace[$node->prefix] = $namespaceURI;
            }
        }
        return $nameSpace;
    }

    /**
     * A wrapper that register the namespaces of the document for the query
     * with their defined prefix
     * See comment:
     * https://www.php.net/manual/en/domxpath.registernamespace.php#51480
     * @param $query - the xpath expression
     * @return DOMNodeList|false
     *
     * Note that this is possible to do evaluation to return a string instead
     * https://www.php.net/manual/en/domxpath.evaluate.php
     */
    public function xpath($query)
    {
        $xpath = new DOMXPath($this->getXmlDom());

        /**
         * Prefix mapping
         * It is necessary to use xpath to handle documents which have default namespaces.
         * The xpath expression will search for items with no namespace by default.
         */
        foreach ($this->getDocNamespaces() as $prefix => $namespaceUri) {
            /**
             * You can't register an empty prefix
             * Default namespace (without a prefix) can only be accessed by the local-name() and namespace-uri() attributes.
             */
            if (empty($prefix)) {
                continue;
            }
            $result = $xpath->registerNamespace($prefix, $namespaceUri);
            if (!$result) {
                LogUtility::msg("Not able to register the prefix ($prefix) for the namespace uri ($namespaceUri)");
            }
        }

        return $xpath->query($query);
    }

    /**
     * Remove all the attributes of the root element that have the given name
     *
     * @param $attribute - the attribute name
     */
    public function removeRootAttribute($attribute)
    {
        // This function does not work
        // $result = $this->getXmlDom()->documentElement->removeAttribute($attribute);

        $root = $this->getXmlDom()->documentElement;
        $attributes = $root->attributes;

        // The map is live: we iterate backward in order to not skip
        // the attribute that takes the place of a removed one
        for ($i = $attributes->length - 1; $i >= 0; $i--) {
            $attributeNode = $attributes->item($i);
            if ($attributeNode === null || $attributeNode->name != $attribute) {
                continue;
            }
            $result = $root->removeAttributeNode($attributeNode);
            if ($result === false) {
                throw new \RuntimeException("Not able to delete the $attribute");
            }
            // There is no break here because you may find multiple version attribute for instance
        }
    }

    /**
     * Remove the first child of the root element that has the given node name
     *
     * @param $nodeName
     */
    public function removeRootChildNode($nodeName)
    {
        $root = $this->getXmlDom()->documentElement;
        $childNodes = $root->childNodes;
        for ($i = 0; $i < $childNodes->length; $i++) {
            $childNode = $childNodes->item($i);
            if ($childNode->nodeName != $nodeName) {
                continue;
            }
            $result = $root->removeChild($childNode);
            if ($result === false) {
                throw new \RuntimeException("Not able to delete the child node $nodeName");
            }
            break;
        }
    }

    /**
     * Add a value to a space separated attribute value
     * (if the value is not already present)
     *
     * Example: with `<a class="actual">`
     * if you add "new" to the `class` attribute,
     * you get `<a class="actual new">`
     *
     * @param $attName
     * @param $attValue
     * @param DOMElement $xml
     */
    public function addAttributeValue($attName, $attValue, $xml)
    {
        /**
         * The value is tested and not {@link DOMElement::hasAttribute()}
         * because even if the dom element has the attribute, the value
         * may be empty (`getAttribute` returns an empty string in both cases)
         */
        $actualAttValue = $xml->getAttribute($attName);
        if ($actualAttValue === "") {
            $xml->setAttribute($attName, $attValue);
            return;
        }

        $actualValues = explode(" ", $actualAttValue);
        if (!in_array((string)$attValue, $actualValues, true)) {
            $xml->setAttribute($attName, $actualAttValue . " $attValue");
        }
    }

    /**
     * @param XmlDocument $rightDocument - the document to compare with
     * @return string the error reported by {@link XmlUtility::diffNode()}, the empty string if it did not report any
     */
    public function diff(XmlDocument $rightDocument)
    {
        $error = "";
        XmlUtility::diffNode($this->getXmlDom(), $rightDocument->getXmlDom(), $error);
        return $error;
    }

    /**
     * @return string a XML formatted
     *
     * !!!! The parameter preserveWhiteSpace should have been set to false before loading
     * https://www.php.net/manual/en/class.domdocument.php#domdocument.props.formatoutput
     * $this->xmlDom->preserveWhiteSpace = false;
     *
     * We do it with the function {@link XmlDocument::mandatoryFormatConfigBeforeLoading()}
     */
    public function getXmlTextFormatted()
    {
        $this->xmlDom->formatOutput = true;
        return $this->getXmlText();
    }

    /**
     * @return string that can be diff: the root element is normalized
     * and the output is formatted
     *
     * See also {@link XmlDocument::processTextBeforeLoading()}
     * that is needed before loading
     */
    public function getXmlTextNormalized()
    {
        $this->xmlDom->documentElement->normalize();
        return $this->getXmlTextFormatted();
    }

    /**
     * Not really conventional but
     * to be able to {@link getXmlTextNormalized}
     * the EOL should be deleted
     * We do it before loading and not with a XML documentation
     *
     * @param $text
     * @return string the text without `DOKU_LF` and without blank lines
     */
    private function processTextBeforeLoading($text)
    {
        $text = str_replace(DOKU_LF, "", $text);
        $text = preg_replace("/\r\n\s*\r\n/", "\r\n", $text);
        $text = preg_replace("/\n\s*\n/", "\n", $text);
        $text = preg_replace("/\n\n/", "\n", $text);
        return $text;
    }

    /**
     * This function is called just before loading
     * in order to be able to {@link XmlDocument::getXmlTextFormatted() format the output }
     * https://www.php.net/manual/en/class.domdocument.php#domdocument.props.formatoutput
     * Mandatory for a good formatting before loading
     */
    private function mandatoryFormatConfigBeforeLoading(): void
    {
        // note that the loading option LIBXML_NOBLANKS also removes the blank nodes
        // (it's not set in the load options, we set the below property instead)
        $this->xmlDom->preserveWhiteSpace = false;
    }

    /**
     * Remove an attribute from an element
     * (does nothing if the element does not have the attribute)
     *
     * @param string $attributeName
     * @param DOMElement $nodeElement
     */
    public function removeAttributeValue(string $attributeName, DOMElement $nodeElement)
    {
        $attr = $nodeElement->getAttributeNode($attributeName);
        if ($attr === false || $attr === null) {
            return;
        }
        $result = $nodeElement->removeAttributeNode($attr);
        if ($result === false) {
            // The node name is used: a DOMElement can not be converted to a string
            LogUtility::msg("Not able to delete the attribute $attributeName of the node element {$nodeElement->nodeName} in the Xml document");
        }
    }

    /**
     * @param string $string - the xpath expression
     * @return DOMElement|null the first selected element, null if there is none
     * @throws ExceptionCombo - if the expression is bad or selects a node that is not an element
     */
    public function queryXpath(string $string): ?DOMElement
    {
        $elements = $this->queryXpaths($string);
        if ($elements === null || count($elements) === 0) {
            return null;
        }
        return $elements[0];
    }

    /**
     * @param string $string - the xpath expression
     * @return null|DOMElement[] the selected elements, null if there is none
     * @throws ExceptionCombo - if the expression is bad or selects a node that is not an element
     */
    public function queryXpaths(string $string): ?array
    {
        $nodes = $this->xpath($string);
        if ($nodes === false) {
            throw new ExceptionCombo("Bad xpath expression ($string)");
        }
        if ($nodes->count() === 0) {
            return null;
        }
        $elements = [];
        foreach ($nodes as $element) {
            if (!($element instanceof DOMElement)) {
                throw new ExceptionCombo("The xpath expression has selected a Node that is not an element");
            }
            $elements[] = $element;
        }
        return $elements;
    }
}