1<?php
2/**
3 * Copyright (c) 2021. ComboStrap, Inc. and its affiliates. All Rights Reserved.
4 *
5 * This source code is licensed under the GPL license found in the
6 * COPYING  file in the root directory of this source tree.
7 *
8 * @license  GPL 3 (https://www.gnu.org/licenses/gpl-3.0.en.html)
9 * @author   ComboStrap <support@combostrap.com>
10 *
11 */
12
13namespace ComboStrap;
14
15use DOMAttr;
16use DOMDocument;
17use DOMElement;
18use DOMNodeList;
19use DOMXPath;
20use Exception;
21use LibXMLError;
22
23
24require_once(__DIR__ . '/File.php');
25
26class XmlDocument
27{
    /**
     * Document type: the markup is loaded with the HTML loader
     */
    const HTML_TYPE = "html";
    /**
     * Document type: the markup is loaded with the XML loader
     * (default type of the constructor)
     */
    const XML_TYPE = "xml";
    /**
     * The libxml error messages that the loading may return
     * and that the constructor tolerates (ie no exception is thrown for them)
     */
    const KNOWN_HTML_LOADING_ERRORS = [
        "Tag section invalid\n", // section is an HTML5 tag
        "Tag footer invalid\n", // footer is an HTML5 tag
        "error parsing attribute name\n", // name is an HTML5 attribute
        "Unexpected end tag : blockquote\n", // NOTE(review): presumably when the document is only a blockquote - to confirm
        "Tag bdi invalid\n",
        "Tag path invalid\n", // svg
        "Tag svg invalid\n", // svg
        "Unexpected end tag : a\n", // when the document is only an anchor
        "Unexpected end tag : p\n", // when the document is only a p
        "Unexpected end tag : button\n" // when the document is only a button

    ];

    /**
     * The canonical value passed to the exception thrown by the constructor
     */
    const CANONICAL = "xml";

    /**
     * The loaded DOM document
     * (stays null when the constructor finds that the php xml extensions are missing)
     * @var DOMDocument
     */
    private $xmlDom = null;
54
55    /**
56     * XmlFile constructor.
57     * @param $text
58     * @param string $type - HTML or not
59     * @throws ExceptionCombo - if the file does not exist or is not valid
60     *
61     * Getting the width of an error HTML document if the file was downloaded
62     * from a server has no use at all
63     */
64    public function __construct($text, string $type = self::XML_TYPE)
65    {
66
67
68        if ($this->isXmlExtensionLoaded()) {
69
70            // https://www.php.net/manual/en/libxml.constants.php
71            $options = LIBXML_NOCDATA
72                // | LIBXML_NOBLANKS // same as preserveWhiteSpace=true, not set to be able to format the output
73                | LIBXML_NOXMLDECL // Drop the XML declaration when saving a document
74                | LIBXML_NONET // No network during load
75                | LIBXML_NSCLEAN // Remove redundant namespace declarations - for whatever reason, the formatting does not work if this is set
76            ;
77
78            // HTML
79            if ($type == self::HTML_TYPE) {
80
81                // Options that cause the processus to hang if this is not for a html file
82                // Empty tag option may also be used only on save
83                //   at https://www.php.net/manual/en/domdocument.save.php
84                //   and https://www.php.net/manual/en/domdocument.savexml.php
85                $options = $options
86                    // | LIBXML_NOEMPTYTAG // Expand empty tags (e.g. <br/> to <br></br>)
87                    | LIBXML_HTML_NODEFDTD // No doctype
88                    | LIBXML_HTML_NOIMPLIED;
89
90
91            }
92
93            /**
94             * No warning reporting
95             * Load XML issue E_STRICT warning seen in the log
96             */
97            if (!defined('DOKU_UNITTEST')) {
98                $oldLevel = error_reporting(E_ERROR);
99            }
100
101            $this->xmlDom = new DOMDocument('1.0', 'UTF-8');
102
103            $this->mandatoryFormatConfigBeforeLoading();
104
105
106            $text = $this->processTextBeforeLoading($text);
107
108            /**
109             * Because the load does handle HTML5tag as error
110             * (ie section for instance)
111             * We take over the errors and handle them after the below load
112             *
113             * https://www.php.net/manual/en/function.libxml-use-internal-errors.php
114             *
115             * @noinspection PhpComposerExtensionStubsInspection
116             */
117            libxml_use_internal_errors(true);
118
119            if ($type == self::XML_TYPE) {
120
121                $result = $this->xmlDom->loadXML($text, $options);
122
123            } else {
124
125                /**
126                 * Unlike loading XML, HTML does not have to be well-formed to load.
127                 * While malformed HTML should load successfully, this function may generate E_WARNING errors
128                 * @deprecated as we try to be XHTML compliantXML but yeah this is not always possible
129                 */
130
131                /**
132                 * Bug: Even if we set that the document is an UTF-8
133                 * loadHTML treat the string as being in ISO-8859-1 if without any heading
134                 * (ie <xml encoding="utf-8"..>
135                 * https://stackoverflow.com/questions/8218230/php-domdocument-loadhtml-not-encoding-utf-8-correctly
136                 * Otherwise French and other language are not well loaded
137                 *
138                 * We use the trick to transform UTF-8 to HTML
139                 */
140                $htmlEntityEncoded = mb_convert_encoding($text, 'HTML-ENTITIES', 'UTF-8');
141                $result = $this->xmlDom->loadHTML($htmlEntityEncoded, $options);
142
143            }
144            if ($result === false) {
145
146                /**
147                 * Error
148                 */
149                /** @noinspection PhpComposerExtensionStubsInspection */
150                $errors = libxml_get_errors();
151
152                foreach ($errors as $error) {
153
154                    /* @var LibXMLError
155                     * @noinspection PhpComposerExtensionStubsInspection
156                     *
157                     * Section is an html5 tag (and is invalid for libxml)
158                     */
159                    if (!in_array($error->message, self::KNOWN_HTML_LOADING_ERRORS)) {
160                        /**
161                         * This error is an XML and HTML error
162                         */
163                        if (
164                            strpos($error->message, "htmlParseEntityRef: expecting ';' in Entity") !== false
165                            ||
166                            $error->message == "EntityRef: expecting ';'\n"
167                        ) {
168                            $message = "There is big probability that there is an ampersand alone `&`. ie You forgot to call html/Xml entities in a `src` or `url` attribute.";
169                        } else {
170                            $message = "Error while loading HTML";
171                        }
172                        $message .= "Error: " . $error->message . ", Loaded text: " . $text;
173
174                        /**
175                         * We clean the errors, otherwise
176                         * in a test series, they failed the next test
177                         *
178                         * @noinspection PhpComposerExtensionStubsInspection
179                         */
180                        libxml_clear_errors();
181
182                        // The xml dom object is null, we got NULL pointer exception everywhere
183                        // just throw, the code will see it
184                        throw new ExceptionCombo($message, self::CANONICAL);
185
186                    }
187
188                }
189            }
190
191            /**
192             * We clean the known errors (otherwise they are added in a queue)
193             * @noinspection PhpComposerExtensionStubsInspection
194             */
195            libxml_clear_errors();
196
197            /**
198             * Error reporting back
199             */
200            if (!defined('DOKU_UNITTEST')) {
201                error_reporting($oldLevel);
202            }
203
204            // namespace error : Namespace prefix dc on format is not defined
205            // missing the ns declaration in the file. example:
206            // xmlns:dc="http://purl.org/dc/elements/1.1/"
207
208
209        } else {
210
211            /**
212             * If the XML module is not present
213             */
214            LogUtility::msg("The php `libxml` module was not found on your installation, the xml/svg file could not be modified / instantiated", LogUtility::LVL_MSG_ERROR, "support");
215
216
217        }
218
219    }
220
221    /**
222     * To not have a collusion with {@link SvgDocument::createSvgDocumentFromPath()}
223     * @param Path $path
224     * @return XmlDocument
225     */
226    public
227    static function createXmlDocFromPath(Path $path): XmlDocument
228    {
229        $mime = XmlDocument::XML_TYPE;
230        if (in_array($path->getExtension(), ["html", "htm"])) {
231            $mime = XmlDocument::HTML_TYPE;
232        }
233        $content = FileSystems::getContent($path);
234        return new XmlDocument($content, $mime);
235    }
236
237    /**
238     * @throws ExceptionCombo
239     */
240    public
241    static function createXmlDocFromMarkup($string, $asHtml = false): XmlDocument
242    {
243
244        $mime = XmlDocument::XML_TYPE;
245        if ($asHtml) {
246            $mime = XmlDocument::HTML_TYPE;
247        }
248        return new XmlDocument($string, $mime);
249    }
250
251    /**
252     * @throws ExceptionCombo
253     */
254    public static function createHtmlDocFromMarkup($markup): XmlDocument
255    {
256        return self::createXmlDocFromMarkup($markup, true);
257    }
258
259    public
260    function &getXmlDom()
261    {
262        return $this->xmlDom;
263    }
264
265    public
266    function setRootAttribute($name, $value)
267    {
268        if ($this->isXmlExtensionLoaded()) {
269            $this->xmlDom->documentElement->setAttribute($name, $value);
270        }
271    }
272
273    /**
274     * @param $name
275     * @return string null if not found
276     */
277    public function getRootAttributeValue($name): ?string
278    {
279        $value = $this->xmlDom->documentElement->getAttribute($name);
280        if ($value === "") {
281            return null;
282        }
283        return $value;
284    }
285
286    public function getXmlText()
287    {
288
289        $xmlText = $this->getXmlDom()->saveXML(
290            $this->getXmlDom()->documentElement,
291            LIBXML_NOXMLDECL // no xml declaration
292        );
293        // Delete doctype (for svg optimization)
294        // php has only doctype manipulation for HTML
295        $xmlText = preg_replace('/^<!DOCTYPE.+?>/', '', $xmlText);
296        return trim($xmlText);
297
298    }
299
300    /**
301     * https://www.php.net/manual/en/dom.installation.php
302     *
303     * Check it with
304     * ```
305     * php -m
306     * ```
307     * Install with
308     * ```
309     * sudo apt-get install php-xml
310     * ```
311     * @return bool
312     */
313    public
314    function isXmlExtensionLoaded()
315    {
316        // A suffix used in the bad message
317        $suffixBadMessage = "php extension is not installed. To install it, you need to install xml. Example: `sudo apt-get install php-xml`, `yum install php-xml`";
318
319        // https://www.php.net/manual/en/dom.requirements.php
320        $loaded = extension_loaded("libxml");
321        if ($loaded === false) {
322            LogUtility::msg("The libxml {$suffixBadMessage}");
323        } else {
324            $loaded = extension_loaded("xml");
325            if ($loaded === false) {
326                LogUtility::msg("The xml {$suffixBadMessage}");
327            } else {
328                $loaded = extension_loaded("dom");
329                if ($loaded === false) {
330                    LogUtility::msg("The dom {$suffixBadMessage}");
331                }
332            }
333        }
334        return $loaded;
335    }
336
337    /**
338     * https://stackoverflow.com/questions/30257438/how-to-completely-remove-a-namespace-using-domdocument
339     * @param $namespaceUri
340     */
341    function removeNamespace($namespaceUri)
342    {
343        if (empty($namespaceUri)) {
344            throw new \RuntimeException("The namespace is empty and should be specified");
345        }
346
347        if (strpos($namespaceUri, "http") === false) {
348            LogUtility::msg("Internal warning: The namespaceURI ($namespaceUri) does not seems to be an URI", LogUtility::LVL_MSG_WARNING, "support");
349        }
350
351        /**
352         * @var DOMNodeList $nodes
353         * finds all nodes that have a namespace node called $ns where their parent node doesn't also have the same namespace.
354         * @var DOMNodeList $nodes
355         */
356        $nodes = $this->xpath("//*[namespace-uri()='$namespaceUri']");
357        foreach ($nodes as $node) {
358            /** @var DOMElement $node */
359            $node->parentNode->removeChild($node);
360        }
361
362        $nodes = $this->xpath("//@*[namespace-uri()='$namespaceUri']");
363        foreach ($nodes as $node) {
364            /** @var DOMAttr $node */
365            /** @var DOMElement $DOMNode */
366            $DOMNode = $node->parentNode;
367            $DOMNode->removeAttributeNode($node);
368        }
369
370        //Node namespace can be select only from the document
371        $xpath = new DOMXPath($this->getXmlDom());
372        $DOMNodeList = $xpath->query("namespace::*", $this->getXmlDom()->ownerDocument);
373        foreach ($DOMNodeList as $node) {
374            $namespaceURI = $node->namespaceURI;
375            if ($namespaceURI == $namespaceUri) {
376                $parentNode = $node->parentNode;
377                $parentNode->removeAttributeNS($namespaceUri, $node->localName);
378            }
379        }
380
381
382    }
383
384    public
385    function getDocNamespaces()
386    {
387        $xpath = new DOMXPath($this->getXmlDom());
388        // `namespace::*` means selects all the namespace attribute of the context node
389        // namespace is an axes
390        // See https://www.w3.org/TR/1999/REC-xpath-19991116/#axes
391        // the namespace axis contains the namespace nodes of the context node; the axis will be empty unless the context node is an element
392        $DOMNodeList = $xpath->query('namespace::*', $this->getXmlDom()->ownerDocument);
393        $nameSpace = array();
394        foreach ($DOMNodeList as $node) {
395            /** @var DOMElement $node */
396
397            $namespaceURI = $node->namespaceURI;
398            $localName = $node->prefix;
399            if ($namespaceURI != null) {
400                $nameSpace[$localName] = $namespaceURI;
401            }
402        }
403        return $nameSpace;
404    }
405
406    /**
407     * A wrapper that register namespace for the query
408     * with the defined prefix
409     * See comment:
410     * https://www.php.net/manual/en/domxpath.registernamespace.php#51480
411     * @param $query
412     * @param string $defaultNamespace
413     * @return DOMNodeList|false
414     *
415     * Note that this is possible to do evaluation to return a string instead
416     * https://www.php.net/manual/en/domxpath.evaluate.php
417     */
418    public
419    function xpath($query)
420    {
421        $xpath = new DOMXPath($this->getXmlDom());
422
423        /**
424         * Prefix mapping
425         * It is necessary to use xpath to handle documents which have default namespaces.
426         * The xpath expression will search for items with no namespace by default.
427         */
428        foreach ($this->getDocNamespaces() as $prefix => $namespaceUri) {
429            /**
430             * You can't register an empty prefix
431             * Default namespace (without a prefix) can only be accessed by the local-name() and namespace-uri() attributes.
432             */
433            if (!empty($prefix)) {
434                $result = $xpath->registerNamespace($prefix, $namespaceUri);
435                if (!$result) {
436                    LogUtility::msg("Not able to register the prefix ($prefix) for the namespace uri ($namespaceUri)");
437                }
438            }
439        }
440
441        return $xpath->query($query);
442
443    }
444
445
446    public
447    function removeRootAttribute($attribute)
448    {
449
450        // This function does not work
451        // $result = $this->getXmlDom()->documentElement->removeAttribute($attribute);
452
453        for ($i = 0; $i < $this->getXmlDom()->documentElement->attributes->length; $i++) {
454            if ($this->getXmlDom()->documentElement->attributes[$i]->name == $attribute) {
455                $result = $this->getXmlDom()->documentElement->removeAttributeNode($this->getXmlDom()->documentElement->attributes[$i]);
456                if ($result === false) {
457                    throw new \RuntimeException("Not able to delete the $attribute");
458                }
459                // There is no break here because you may find multiple version attribute for instance
460            }
461        }
462
463    }
464
465    public
466    function removeRootChildNode($nodeName)
467    {
468        for ($i = 0; $i < $this->getXmlDom()->documentElement->childNodes->length; $i++) {
469            $childNode = &$this->getXmlDom()->documentElement->childNodes[$i];
470            if ($childNode->nodeName == $nodeName) {
471                $result = $this->getXmlDom()->documentElement->removeChild($childNode);
472                if ($result == false) {
473                    throw new \RuntimeException("Not able to delete the child node $nodeName");
474                }
475                break;
476            }
477        }
478    }
479
480    /**
481     *
482     * Add a value to an attribute value
483     * Example
484     * <a class="actual">
485     *
486     * if you add "new"
487     * <a class="actual new">
488     *
489     * @param $attName
490     * @param $attValue
491     * @param DOMElement $xml
492     */
493    public
494    function addAttributeValue($attName, $attValue, $xml)
495    {
496
497        /**
498         * Empty condition is better than {@link DOMElement::hasAttribute()}
499         * because even if the dom element has the attribute, the value
500         * may be empty
501         */
502        $value = $xml->getAttribute($attName);
503        if (empty($value)) {
504            $xml->setAttribute($attName, $attValue);
505        } else {
506            $actualAttValue = $xml->getAttribute($attName);
507            $explodeArray = explode(" ", $actualAttValue);
508            if (!in_array($attValue, $explodeArray)) {
509                $xml->setAttribute($attName, (string)$actualAttValue . " $attValue");
510            }
511        }
512
513    }
514
515    public function diff(XmlDocument $rightDocument)
516    {
517        $error = "";
518        XmlUtility::diffNode($this->getXmlDom(), $rightDocument->getXmlDom(), $error);
519        return $error;
520    }
521
522    /**
523     * @return string a XML formatted
524     *
525     * !!!! The parameter preserveWhiteSpace should have been set to false before loading
526     * https://www.php.net/manual/en/class.domdocument.php#domdocument.props.formatoutput
527     * $this->xmlDom->preserveWhiteSpace = false;
528     *
529     * We do it with the function {@link XmlDocument::mandatoryFormatConfigBeforeLoading()}
530     *
531     */
532    public function getXmlTextFormatted()
533    {
534
535        $this->xmlDom->formatOutput = true;
536        return $this->getXmlText();
537
538    }
539
540    /**
541     * @return string that can be diff
542     *   * EOL diff are not seen
543     *   * space are
544     *
545     * See also {@link XmlDocument::processTextBeforeLoading()}
546     * that is needed before loading
547     */
548    public function getXmlTextNormalized()
549    {
550
551        /**
552         * If the text was a list
553         * of sibling text without parent
554         * We may get a body
555         * @deprecated letting the code until
556         * TODO: delete this code when the test pass
557         */
558//        $body = $doc->getElementsByTagName("body");
559//        if ($body->length != 0) {
560//            $DOMNodeList = $body->item(0)->childNodes;
561//            $output = "";
562//            foreach ($DOMNodeList as $value) {
563//                $output .= $doc->saveXML($value) . DOKU_LF;
564//            }
565//        }
566
567        $this->xmlDom->documentElement->normalize();
568        return $this->getXmlTextFormatted();
569    }
570
571    /**
572     * Not really conventional but
573     * to be able to {@link getXmlTextNormalized}
574     * the EOL should be deleted
575     * We do it before loading and not with a XML documentation
576     */
577    private function processTextBeforeLoading($text)
578    {
579        $text = str_replace(DOKU_LF, "", $text);
580        $text = preg_replace("/\r\n\s*\r\n/", "\r\n", $text);
581        $text = preg_replace("/\n\s*\n/", "\n", $text);
582        $text = preg_replace("/\n\n/", "\n", $text);
583        return $text;
584
585    }
586
587
    /**
     * This function is called by the constructor just before loading
     * in order to be able to {@link XmlDocument::getXmlTextFormatted() format the output }
     * https://www.php.net/manual/en/class.domdocument.php#domdocument.props.formatoutput
     * Mandatory for a good formatting: it must be set before the loading
     *
     */
    private function mandatoryFormatConfigBeforeLoading()
    {
        // Note that, according to the php documentation of preserveWhiteSpace,
        // the loading option LIBXML_NOBLANKS
        // has the same effect as $this->xmlDom->preserveWhiteSpace = false;
        $this->xmlDom->preserveWhiteSpace = false;
    }
602
603    public function removeAttributeValue(string $attributeName, DOMElement $nodeElement)
604    {
605        $attr = $nodeElement->getAttributeNode($attributeName);
606        if ($attr == false) {
607            return;
608        }
609        $result = $nodeElement->removeAttributeNode($attr);
610        if ($result === false) {
611            LogUtility::msg("Not able to delete the attribute $attributeName of the node element $nodeElement in the Xml document $this");
612        }
613    }
614
615    /**
616     * @throws ExceptionCombo
617     */
618    public function queryXpath(string $string): ?DOMElement
619    {
620
621        $elements = $this->queryXpaths($string);
622        if ($elements !== null && sizeof($elements) > 0) {
623            return $elements[0];
624        }
625        return null;
626    }
627
628    /**
629     * @return null|DOMElement[]
630     * @throws ExceptionCombo
631     */
632    public function queryXpaths(string $string): ?array
633    {
634        $nodes = $this->xpath($string);
635        if ($nodes === false) {
636            throw new ExceptionCombo("Bad xpath expression ($string)");
637        }
638        if ($nodes->count() === 0) {
639            return null;
640        }
641        $elements = null;
642        for ($i = 0; $i < $nodes->count(); $i++) {
643            $element = $nodes->item($i);
644            if (!($element instanceof DOMElement)) {
645                throw new ExceptionCombo("The xpath expression has selected a Node that is not an element");
646            }
647            $elements[] = $element;
648
649        }
650
651        return $elements;
652    }
653
654
655}
656