* */

namespace ComboStrap\Xml;

use ComboStrap\ExceptionBadState;
use ComboStrap\ExceptionBadSyntax;
use ComboStrap\ExceptionNotFound;
use ComboStrap\FileSystems;
use ComboStrap\LogUtility;
use ComboStrap\Path;
use ComboStrap\PluginUtility;
use DOMAttr;
use DOMDocument;
use DOMElement;
use DOMNodeList;
use DOMXPath;
use LibXMLError;
use PhpCss;

/**
 * A xml document that follows the Web Api interface.
 *
 * Note Dokuwiki now uses since [jack_jackrum](https://www.dokuwiki.org/changes#release_2023-04-04_jack_jackrum):
 *   * the [dom-wrapper](https://github.com/scotteh/php-dom-wrapper)
 *     that follows the Jquery API and uses [css-selector](https://symfony.com/doc/current/components/css_selector.html)
 *     to get an Xpath expression from a Css selector
 */
class XmlDocument
{

    public const HTML_TYPE = "html";
    public const XML_TYPE = "xml";

    /**
     * The libxml error messages that the HTML loading may return
     * and that are ignored (mostly HTML5 / SVG tags that libxml does not know)
     */
    public const KNOWN_HTML_LOADING_ERRORS = [
        "Tag section invalid\n", // section is an HTML5 tag
        "Tag footer invalid\n", // footer is an HTML5 tag
        "error parsing attribute name\n", // name is an HTML5 attribute
        "Unexpected end tag : blockquote\n", // stray closing tag
        "Tag bdi invalid\n",
        "Tag path invalid\n", // svg
        "Tag svg invalid\n", // svg
        "Unexpected end tag : a\n", // when the document is only an anchor
        "Unexpected end tag : p\n", // when the document is only a p
        "Unexpected end tag : button\n", // when the document is only a button
    ];

    public const CANONICAL = "xml";

    /**
     * The wrapped DOM document (created and loaded in the constructor)
     */
    private DOMDocument $domDocument;

    /**
     * The xpath engine, lazily created by {@link XmlDocument::xpath()}
     */
    private DOMXPath $domXpath;

    /**
     * XmlFile constructor.
     *
     * @param $text - the markup to load
     * @param string $type - {@link XmlDocument::XML_TYPE} loads the text as XML, any other value loads it as HTML
     * @throws ExceptionBadSyntax - if the text is empty, if the document is not valid or if the xml libs are not available
     *
     * Getting the width of an error HTML document if the file was downloaded
     * from a server has no use at all
     */
    public function __construct($text, string $type = self::XML_TYPE)
    {

        if (empty($text)) {
            throw new ExceptionBadSyntax("The xml text markup should not be empty.", self::CANONICAL);
        }

        if (!$this->isXmlExtensionLoaded()) {
            /**
             * If the XML module is not present
             */
            throw new ExceptionBadSyntax("The php `libxml` module was not found on your installation, the xml/svg file could not be modified / instantiated", self::CANONICAL);
        }

        // https://www.php.net/manual/en/libxml.constants.php
        $options = LIBXML_NOCDATA
            // | LIBXML_NOBLANKS // same as preserveWhiteSpace=true, not set to be able to format the output
            | LIBXML_NOXMLDECL // Drop the XML declaration when saving a document
            | LIBXML_NONET // No network during load
            | LIBXML_NSCLEAN; // Remove redundant namespace declarations - for whatever reason, the formatting does not work if this is set

        // HTML
        if ($type == self::HTML_TYPE) {

            // Options that cause the process to hang if this is not for a html file
            // Empty tag option may also be used only on save
            // at https://www.php.net/manual/en/domdocument.save.php
            // and https://www.php.net/manual/en/domdocument.savexml.php
            $options = $options
                // | LIBXML_NOEMPTYTAG // Expand empty tags (e.g. <br/> to <br></br>)
                | LIBXML_HTML_NODEFDTD // No doctype
                | LIBXML_HTML_NOIMPLIED;

        }

        /**
         * No warning reporting
         * Load XML issue E_STRICT warning seen in the log
         */
        $oldLevel = null;
        if (!PluginUtility::isTest()) {
            $oldLevel = error_reporting(E_ERROR);
        }

        // try/finally: the error reporting level and the libxml error queue
        // must be reset even when the loading throws, otherwise the rest of
        // the request would run with a modified error level
        try {

            $this->domDocument = new DOMDocument('1.0', 'UTF-8');

            $this->mandatoryFormatConfigBeforeLoading();

            $text = $this->processTextBeforeLoading($text);

            /**
             * Because the load does handle HTML5 tag as error
             * (ie section for instance)
             * We take over the errors and handle them after the below load
             *
             * https://www.php.net/manual/en/function.libxml-use-internal-errors.php
             */
            libxml_use_internal_errors(true);

            if ($type == self::XML_TYPE) {

                $result = $this->domDocument->loadXML($text, $options);

            } else {

                /**
                 * Unlike loading XML, HTML does not have to be well-formed to load.
                 * While malformed HTML should load successfully, this function may generate E_WARNING errors
                 * @deprecated as we try to be XHTML compliant but yeah this is not always possible
                 */

                /**
                 * Bug: Even if we set that the document is an UTF-8
                 * loadHTML treat the string as being in ISO-8859-1 if without any heading
                 * (ie a meta charset declaration)
                 * https://stackoverflow.com/questions/8218230/php-domdocument-loadhtml-not-encoding-utf-8-correctly
                 * Otherwise French and other language are not well loaded
                 *
                 * We use the trick to transform the non-ASCII UTF-8 characters to HTML numeric entities.
                 * (The `HTML-ENTITIES` encoding of mb_convert_encoding is deprecated since PHP 8.2,
                 * mb_encode_numericentity is the documented replacement and leaves the ASCII
                 * markup characters `<`, `>`, `&` untouched the same way)
                 */
                $htmlEntityEncoded = mb_encode_numericentity($text, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
                $result = $this->domDocument->loadHTML($htmlEntityEncoded, $options);

            }

            if ($result === false) {

                /**
                 * Error
                 */
                foreach (libxml_get_errors() as $error) {

                    /* @var LibXMLError $error
                     * @noinspection PhpComposerExtensionStubsInspection
                     *
                     * Section is an html5 tag (and is invalid for libxml)
                     */
                    if (in_array($error->message, self::KNOWN_HTML_LOADING_ERRORS, true)) {
                        continue;
                    }

                    // The xml dom object is null, we got NULL pointer exception everywhere
                    // just throw, the code will see it
                    throw new ExceptionBadSyntax($this->buildLoadingErrorMessage($error, $text), self::CANONICAL);

                }
            }

        } finally {

            /**
             * We clean the errors (known or not), otherwise they are added in a queue
             * and in a test series, they failed the next test
             */
            libxml_clear_errors();

            /**
             * Error reporting back
             */
            if ($oldLevel !== null) {
                error_reporting($oldLevel);
            }

        }

        // namespace error : Namespace prefix dc on format is not defined
        // missing the ns declaration in the file. example:
        // xmlns:dc="http://purl.org/dc/elements/1.1/"

    }

    /**
     * Build the exception message for a libxml loading error that is not known/ignored
     *
     * @param LibXMLError $error - the libxml error
     * @param $text - the text that was loaded (added to the message)
     * @return string
     */
    private function buildLoadingErrorMessage(LibXMLError $error, $text): string
    {
        /**
         * This error is an XML and HTML error
         */
        $isLonelyAmpersand = strpos($error->message, "htmlParseEntityRef: expecting ';' in Entity") !== false
            || $error->message == "EntityRef: expecting ';'\n";
        if ($isLonelyAmpersand) {
            $message = "There is big probability that there is an ampersand alone `&`. ie You forgot to call html/Xml entities in a `src` or `url` attribute.";
        } else {
            $message = "Error while loading HTML";
        }

        /**
         * Boolean attribute XML loading error
         */
        if (strpos($error->message, "Specification mandates value for attribute") !== false) {
            $message = "Xml does not allow boolean attribute (ie without any value). If you skip this error, you will get a general attribute constructing error as next error. Load as HTML.";
        }

        return $message . "Error: " . $error->message . ", Loaded text: " . $text;
    }

    /**
     * To not have a collusion with {@link FetcherSvg::createFetchImageSvgFromPath()}
     *
     * The document is loaded as HTML if the extension of the path is `html` or `htm`, as XML otherwise
     *
     * @param Path $path
     * @return XmlDocument
     * @throws ExceptionNotFound - if the file does not exist
     * @throws ExceptionBadSyntax - if the content is not valid
     */
    public static function createXmlDocFromPath(Path $path): XmlDocument
    {
        $isHtml = in_array($path->getExtension(), ["html", "htm"], true);
        $mime = $isHtml ? XmlDocument::HTML_TYPE : XmlDocument::XML_TYPE;
        $content = FileSystems::getContent($path);
        return (new XmlDocument($content, $mime));
    }

    /**
     * @param $string - the markup
     * @param bool $asHtml - if true, the markup is loaded as HTML (as XML otherwise)
     * @return XmlDocument
     * @throws ExceptionBadSyntax
     */
    public static function createXmlDocFromMarkup($string, $asHtml = false): XmlDocument
    {
        $mime = $asHtml ? XmlDocument::HTML_TYPE : XmlDocument::XML_TYPE;
        return new XmlDocument($string, $mime);
    }

    /**
     * HTML loading is more permissive
     *
     * For instance, you would not get an error on boolean attribute
     * ```
     * Error while loading HTMLError: Specification mandates value for attribute defer
     * ```
     * In Xml, it's mandatory but not in HTML, they are known as:
     * https://html.spec.whatwg.org/multipage/common-microsyntaxes.html#boolean-attribute
     *
     * @throws ExceptionBadSyntax
     */
    public static function createHtmlDocFromMarkup($markup): XmlDocument
    {
        return self::createXmlDocFromMarkup($markup, true);
    }

    /**
     * @return DOMDocument the wrapped DOM document (returned by reference)
     */
    public function &getDomDocument(): DOMDocument
    {
        return $this->domDocument;
    }

    /**
     * Set an attribute on the root element
     * (nothing is done if the xml extensions are not loaded)
     *
     * @param $name
     * @param $value
     * @return void
     * @deprecated use {@link XmlDocument::getElement()} instead
     */
    public function setRootAttribute($name, $value)
    {
        if ($this->isXmlExtensionLoaded()) {
            $this->domDocument->documentElement->setAttribute($name, $value);
        }
    }

    /**
     * @param $name
     * @return string|null the value of the root attribute, null if not found (or empty)
     * @deprecated uses {@link XmlElement::getAttribute()} of {@link self::getElement()}
     */
    public function getRootAttributeValue($name): ?string
    {
        $value = $this->domDocument->documentElement->getAttribute($name);
        if ($value === "") {
            return null;
        }
        return $value;
    }

    /**
     * Alias of {@link XmlDocument::toXml()}
     */
    public function toXhtml(?DOMElement $element = null): string
    {
        return $this->toXml($element);
    }

    /**
     * @param DOMElement|null $element - the element to serialize, the root element if null
     * @return string the trimmed XML serialization
     */
    public function toXml(?DOMElement $element = null): string
    {

        if ($element === null) {
            $element = $this->getDomDocument()->documentElement;
        }
        /**
         * LIBXML_NOXMLDECL (no xml declaration) does not work because only empty tag is recognized
         * https://www.php.net/manual/en/domdocument.savexml.php
         */
        $xmlText = $this->getDomDocument()->saveXML(
            $element,
            LIBXML_NOXMLDECL
        );
        // Delete doctype (for svg optimization)
        // php has only doctype manipulation for HTML
        // (the pattern was an empty `/^/` that deleted nothing)
        $xmlText = preg_replace('/^<!DOCTYPE.+?>/', '', $xmlText);
        return trim($xmlText);

    }

    /**
     * https://www.php.net/manual/en/dom.installation.php
     *
     * Check it with
     * ```
     * php -m
     * ```
     * Install with
     * ```
     * sudo apt-get install php-xml
     * ```
     * @return bool true if the `libxml`, `xml` and `dom` extensions are all loaded
     *              (a message is logged for the first one that is missing)
     */
    public function isXmlExtensionLoaded(): bool
    {
        // A suffix used in the bad message
        $suffixBadMessage = "php extension is not installed. To install it, you need to install xml. Example: `sudo apt-get install php-xml`, `yum install php-xml`";

        // https://www.php.net/manual/en/dom.requirements.php
        if (extension_loaded("libxml") === false) {
            LogUtility::msg("The libxml {$suffixBadMessage}");
            return false;
        }
        if (extension_loaded("xml") === false) {
            LogUtility::msg("The xml {$suffixBadMessage}");
            return false;
        }
        if (extension_loaded("dom") === false) {
            LogUtility::msg("The dom {$suffixBadMessage}");
            return false;
        }
        return true;
    }

    /**
     * Remove the elements, the attributes and the namespace declarations of a namespace
     *
     * https://stackoverflow.com/questions/30257438/how-to-completely-remove-a-namespace-using-domdocument
     * @param $namespaceUri - the URI of the namespace to remove
     */
    public function removeNamespace($namespaceUri)
    {
        if (empty($namespaceUri)) {
            throw new \RuntimeException("The namespace is empty and should be specified");
        }

        if (strpos($namespaceUri, "http") === false) {
            LogUtility::msg("Internal warning: The namespaceURI ($namespaceUri) does not seems to be an URI", LogUtility::LVL_MSG_WARNING, "support");
        }

        /**
         * The elements of the namespace
         * @var DOMNodeList $nodes
         */
        try {
            $nodes = $this->xpath("//*[namespace-uri()='$namespaceUri']");
            foreach ($nodes as $node) {
                /** @var DOMElement $node */
                $node->parentNode->removeChild($node);
            }
        } catch (ExceptionBadSyntax $e) {
            LogUtility::error("Internal Error on xpath: {$e->getMessage()}");
        }

        /**
         * The attributes of the namespace
         */
        try {
            $nodes = $this->xpath("//@*[namespace-uri()='$namespaceUri']");
            foreach ($nodes as $node) {
                /** @var DOMAttr $node */
                /** @var DOMElement $DOMNode */
                $DOMNode = $node->parentNode;
                $DOMNode->removeAttributeNode($node);
            }
        } catch (ExceptionBadSyntax $e) {
            LogUtility::error("Internal Error on xpath: {$e->getMessage()}");
        }

        // Node namespace can be select only from the document
        // NOTE(review): `ownerDocument` of a DOMDocument is null, the query then
        // presumably runs from the default context node - confirm
        $xpath = new DOMXPath($this->getDomDocument());
        $DOMNodeList = $xpath->query("namespace::*", $this->getDomDocument()->ownerDocument);
        foreach ($DOMNodeList as $node) {
            $namespaceURI = $node->namespaceURI;
            if ($namespaceURI == $namespaceUri) {
                $parentNode = $node->parentNode;
                $parentNode->removeAttributeNS($namespaceUri, $node->localName);
            }
        }

    }

    /**
     * @return array the namespaces found with the `namespace::*` query (prefix as key, namespace URI as value)
     */
    public function getNamespaces(): array
    {
        /**
         * We can't query with the library {@link XmlDocument::xpath()} function because
         * we register in the xpath the namespace
         */
        $xpath = new DOMXPath($this->getDomDocument());
        // `namespace::*` means selects all the namespace attribute of the context node
        // namespace is an axes
        // See https://www.w3.org/TR/1999/REC-xpath-19991116/#axes
        // the namespace axis contains the namespace nodes of the context node; the axis will be empty unless the context node is an element
        // NOTE(review): `ownerDocument` of a DOMDocument is null, the query then
        // presumably runs from the default context node - confirm
        $DOMNodeList = $xpath->query('namespace::*', $this->getDomDocument()->ownerDocument);
        $nameSpace = [];
        foreach ($DOMNodeList as $node) {
            /** @var DOMElement $node */
            $namespaceURI = $node->namespaceURI;
            if ($namespaceURI === null || $namespaceURI === "") {
                continue;
            }
            $nameSpace[$node->prefix] = $namespaceURI;
        }
        return $nameSpace;
    }

    /**
     * A wrapper that register namespace for the query
     * with the defined prefix
     * See comment:
     * https://www.php.net/manual/en/domxpath.registernamespace.php#51480
     * @param $query - the xpath expression
     * @param DOMElement|null $contextNode - the context node, the document if null
     * @return DOMNodeList
     *
     * Note that this is possible to do evaluation to return a string instead
     * https://www.php.net/manual/en/domxpath.evaluate.php
     * @throws ExceptionBadSyntax - if the query is invalid
     */
    public function xpath($query, ?DOMElement $contextNode = null): DOMNodeList
    {
        if (!isset($this->domXpath)) {

            $this->domXpath = new DOMXPath($this->getDomDocument());

            /**
             * Prefix mapping
             * It is necessary to use xpath to handle documents which have default namespaces.
             * The xpath expression will search for items with no namespace by default.
             */
            foreach ($this->getNamespaces() as $prefix => $namespaceUri) {
                /**
                 * You can't register an empty prefix
                 * Default namespace (without a prefix) can only be accessed by the local-name() and namespace-uri() attributes.
                 */
                if (empty($prefix)) {
                    continue;
                }
                $registered = $this->domXpath->registerNamespace($prefix, $namespaceUri);
                if (!$registered) {
                    LogUtility::msg("Not able to register the prefix ($prefix) for the namespace uri ($namespaceUri)");
                }
            }

        }

        // Without context node, the query is evaluated against the document
        $context = $contextNode ?? $this->domDocument;
        $domList = $this->domXpath->query($query, $context);
        if ($domList === false) {
            throw new ExceptionBadSyntax("The query expression ($query) may be malformed");
        }
        return $domList;

    }

    /**
     * Remove all the attributes of the root element that have the given name
     *
     * @param $attribute - the attribute name
     */
    public function removeRootAttribute($attribute)
    {

        // This function does not work
        // $result = $this->getXmlDom()->documentElement->removeAttribute($attribute);

        $rootElement = $this->getDomDocument()->documentElement;
        $attributes = $rootElement->attributes;
        // The attribute map is live: we iterate backwards, otherwise the attribute
        // that follows a deleted one would take its index and be skipped
        for ($i = $attributes->length - 1; $i >= 0; $i--) {
            $domAttr = $attributes->item($i);
            if ($domAttr === null || $domAttr->name !== (string)$attribute) {
                continue;
            }
            $result = $rootElement->removeAttributeNode($domAttr);
            if ($result === false) {
                throw new \RuntimeException("Not able to delete the $attribute");
            }
            // There is no break here because you may find multiple version attribute for instance
        }

    }

    /**
     * Remove the first child node of the root element that has the given node name
     *
     * @param $nodeName
     */
    public function removeRootChildNode($nodeName)
    {
        $rootElement = $this->getDomDocument()->documentElement;
        foreach ($rootElement->childNodes as $childNode) {
            if ($childNode->nodeName != $nodeName) {
                continue;
            }
            $result = $rootElement->removeChild($childNode);
            if ($result === false) {
                throw new \RuntimeException("Not able to delete the child node $nodeName");
            }
            // only the first one (we stop right after the deletion, the live list is not read anymore)
            break;
        }
    }

    /**
     *
     * Add a value to an attribute value
     * Example: with the attribute
     * ```
     * class="actual"
     * ```
     * if you add "new", you get
     * ```
     * class="actual new"
     * ```
     * The value is not added if it's already one of the space separated values.
     *
     * @param $attName
     * @param $attValue
     * @param DOMElement $xml
     */
    public function addAttributeValue($attName, $attValue, $xml)
    {

        /**
         * The value is tested and not {@link DOMElement::hasAttribute()}
         * because even if the dom element has the attribute, the value
         * may be empty
         *
         * We compare with the empty string and don't use `empty`
         * because an actual value of `0` would be overwritten
         */
        $actualAttValue = $xml->getAttribute($attName);
        if ($actualAttValue === null || $actualAttValue === "") {
            $xml->setAttribute($attName, $attValue);
            return;
        }

        $actualValues = explode(" ", $actualAttValue);
        if (!in_array((string)$attValue, $actualValues, true)) {
            $xml->setAttribute($attName, (string)$actualAttValue . " $attValue");
        }

    }

    /**
     * @param XmlDocument $rightDocument - the document to compare with
     * @return string the differences reported by {@link XmlSystems::diffNode()} (empty string if none was reported)
     */
    public function diff(XmlDocument $rightDocument): string
    {
        $error = "";
        XmlSystems::diffNode($this->getDomDocument(), $rightDocument->getDomDocument(), $error);
        return $error;
    }

    /**
     * @param DOMElement|null $element - the element to serialize, the root element if null
     * @return string a XML formatted
     *
     * !!!! The parameter preserveWhiteSpace should have been set to false before loading
     * https://www.php.net/manual/en/class.domdocument.php#domdocument.props.formatoutput
     * $this->xmlDom->preserveWhiteSpace = false;
     *
     * We do it with the function {@link XmlDocument::mandatoryFormatConfigBeforeLoading()}
     *
     */
    public function toXmlFormatted(?DOMElement $element = null): string
    {
        $this->domDocument->formatOutput = true;
        return $this->toXml($element);
    }

    /**
     * @param DOMElement|null $element - the element to serialize, the root element if null
     * @return string that can be diff
     *   * EOL diff are not seen
     *   * space are
     *
     * See also {@link XmlDocument::processTextBeforeLoading()}
     * that is needed before loading
     */
    public function toXmlNormalized(?DOMElement $element = null): string
    {

        // Note: a former (commented-out) implementation serialized the children
        // of the `body` element when the text was a list of sibling without parent.
        // It was kept only until the tests pass and is no more needed here.

        if ($element === null) {
            $element = $this->domDocument->documentElement;
        }
        $element->normalize();
        return $this->toXmlFormatted($element);
    }

    /**
     * Not really conventional but
     * to be able to {@link toXmlNormalized}
     * the EOL should be deleted
     * We do it before loading and not with a XML documentation
     *
     * @param $text - the text to load
     * @return string|string[]|null the text without the `DOKU_LF` characters and without empty lines
     */
    private function processTextBeforeLoading($text)
    {
        // NOTE(review): if DOKU_LF is the line feed character, the replacements below find nothing - confirm
        $text = str_replace(DOKU_LF, "", $text);
        $text = preg_replace("/\r\n\s*\r\n/", "\r\n", $text);
        $text = preg_replace("/\n\s*\n/", "\n", $text);
        $text = preg_replace("/\n\n/", "\n", $text);
        return $text;

    }

    /**
     * This function is called just before loading
     * in order to be able to {@link XmlDocument::toXmlFormatted() format the output }
     * https://www.php.net/manual/en/class.domdocument.php#domdocument.props.formatoutput
     * Mandatory for a good formatting before loading
     *
     */
    private function mandatoryFormatConfigBeforeLoading()
    {
        // The blank nodes should not be preserved,
        // otherwise the output formatting has no effect
        $this->domDocument->preserveWhiteSpace = false;
    }

    /**
     * Remove an attribute from an element
     * (nothing is done if the element does not have the attribute)
     *
     * @param string $attributeName
     * @param DOMElement $nodeElement
     * @return void
     * @deprecated use the {@link XmlElement::removeAttribute()} if possible
     */
    public function removeAttributeValue(string $attributeName, DOMElement $nodeElement)
    {
        $attr = $nodeElement->getAttributeNode($attributeName);
        if (!$attr) {
            return;
        }
        $result = $nodeElement->removeAttributeNode($attr);
        if ($result === false) {
            LogUtility::msg("Not able to delete the attribute $attributeName of the node element $nodeElement->tagName in the Xml document");
        }
    }

    /**
     * Query via a CSS selector
     * (not that it will not work with other namespace than the default one, ie xmlns will not work)
     *
     * @return XmlElement the first element selected
     * @throws ExceptionBadSyntax - if the selector is not valid
     * @throws ExceptionNotFound - if the selector selects nothing
     */
    public function querySelector(string $selector): XmlElement
    {
        $xmlElements = $this->querySelectorAll($selector);
        if (count($xmlElements) === 0) {
            throw new ExceptionNotFound("No element was found with the selector $selector");
        }
        return $xmlElements[0];
    }

    /**
     * @return XmlElement[] the elements selected by the CSS selector (the nodes that are not elements are skipped)
     * @throws ExceptionBadSyntax - if the selector is not valid
     */
    public function querySelectorAll(string $selector): array
    {
        $xpath = $this->cssSelectorToXpath($selector);
        $xmlElements = [];
        foreach ($this->xpath($xpath) as $domNode) {
            if ($domNode instanceof DOMElement) {
                $xmlElements[] = new XmlElement($domNode, $this);
            }
        }
        return $xmlElements;
    }

    /**
     * @return string the xpath expression of a CSS selector
     * @throws ExceptionBadSyntax - if the selector is not valid
     */
    public function cssSelectorToXpath(string $selector): string
    {
        try {
            return PhpCss::toXpath($selector);
        } catch (PhpCss\Exception\ParserException $e) {
            throw new ExceptionBadSyntax("The selector ($selector) is not valid. Error: {$e->getMessage()}");
        }
    }

    /**
     * An utility function to know how to remove a node
     * @param \DOMNode $nodeElement
     * @deprecated use {@link XmlElement::remove} instead
     */
    public function removeNode(\DOMNode $nodeElement)
    {
        $nodeElement->parentNode->removeChild($nodeElement);
    }

    /**
     * @return XmlElement the root element
     */
    public function getElement(): XmlElement
    {
        return XmlElement::create($this->getDomDocument()->documentElement, $this);
    }

    /**
     * @return string|false the result of {@link DOMDocument::saveHTML()} on the whole document
     */
    public function toHtml()
    {
        return $this->domDocument->saveHTML();
    }

    /**
     * Create an element owned by this document (the element is not inserted)
     *
     * @throws \DOMException - if invalid local name
     */
    public function createElement(string $localName): XmlElement
    {
        $element = $this->domDocument->createElement($localName);
        return XmlElement::create($element, $this);
    }

    /**
     * @return DOMElement the first node selected by the xpath expression
     * @throws ExceptionBadSyntax - if the xpath expression is not valid
     * @throws ExceptionBadState - if the first node is not an element (or if there is no node)
     */
    public function xpathFirstDomElement(string $xpath): DOMElement
    {
        $firstNode = $this->xpath($xpath)->item(0);
        if (!($firstNode instanceof DOMElement)) {
            throw new ExceptionBadState("The first DOM node is not a DOM element");
        }
        return $firstNode;
    }

}