1<?php
2/**
3 * Copyright (c) 2021. ComboStrap, Inc. and its affiliates. All Rights Reserved.
4 *
5 * This source code is licensed under the GPL license found in the
6 * COPYING  file in the root directory of this source tree.
7 *
8 * @license  GPL 3 (https://www.gnu.org/licenses/gpl-3.0.en.html)
9 * @author   ComboStrap <support@combostrap.com>
10 *
11 */
12
13namespace ComboStrap;
14
15use DOMAttr;
16use DOMDocument;
17use DOMElement;
18use DOMNodeList;
19use DOMXPath;
20use Exception;
21use LibXMLError;
22
23
24require_once(__DIR__ . '/File.php');
25
26class XmlDocument
27{
28    const HTML_TYPE = "html";
29    const XML_TYPE = "xml";
    /**
     * The libxml error messages that are expected when loading a document
     * and that the constructor therefore does not report.
     *
     * The messages are compared as a whole (trailing end of line included).
     */
    const KNOWN_HTML_LOADING_ERRORS = [
        "Tag section invalid\n", // section is an HTML5 tag
        "Tag footer invalid\n", // footer is an HTML5 tag
        "error parsing attribute name\n", // name is an HTML5 attribute
        "Unexpected end tag : blockquote\n", // presumably when the document is only a blockquote (to be confirmed)
        "Tag bdi invalid\n", // bdi is an HTML5 tag
        "Tag path invalid\n", // svg
        "Tag svg invalid\n", // svg
        "Unexpected end tag : a\n", // when the document is only an anchor
        "Unexpected end tag : p\n", // when the document is only a p
        "Unexpected end tag : button\n" // when the document is only a button

    ];
47
48    const CANONICAL = "xml";
49
50    /**
51     * @var DOMDocument
52     */
53    private $xmlDom = null;
54
55    /**
56     * XmlFile constructor.
57     * @param $text
58     * @param $type - HTML or not
59     */
60    public function __construct($text, $type = self::XML_TYPE)
61    {
62
63
64        if ($this->isXmlExtensionLoaded()) {
65
66            // https://www.php.net/manual/en/libxml.constants.php
67            $options = LIBXML_NOCDATA
68                // | LIBXML_NOBLANKS // same as preserveWhiteSpace=true, not set to be able to format the output
69                | LIBXML_NOXMLDECL // Drop the XML declaration when saving a document
70                | LIBXML_NONET // No network during load
71                | LIBXML_NSCLEAN // Remove redundant namespace declarations - for whatever reason, the formatting does not work if this is set
72            ;
73
74            // HTML
75            if ($type == self::HTML_TYPE) {
76
77                // Options that cause the processus to hang if this is not for a html file
78                // Empty tag option may also be used only on save
79                //   at https://www.php.net/manual/en/domdocument.save.php
80                //   and https://www.php.net/manual/en/domdocument.savexml.php
81                $options = $options
82                    // | LIBXML_NOEMPTYTAG // Expand empty tags (e.g. <br/> to <br></br>)
83                    | LIBXML_HTML_NODEFDTD // No doctype
84                    | LIBXML_HTML_NOIMPLIED;
85
86
87            }
88
89            /**
90             * No warning reporting
91             * Load XML issue E_STRICT warning seen in the log
92             */
93            if (!defined('DOKU_UNITTEST')) {
94                $oldLevel = error_reporting(E_ERROR);
95            }
96
97            $this->xmlDom = new DOMDocument('1.0', 'UTF-8');
98
99            $this->mandatoryFormatConfigBeforeLoading();
100
101
102            $text = $this->processTextBeforeLoading($text);
103
104            /**
105             * Because the load does handle HTML5tag as error
106             * (ie section for instance)
107             * We take over the errors and handle them after the below load
108             *
109             * https://www.php.net/manual/en/function.libxml-use-internal-errors.php
110             *
111             * @noinspection PhpComposerExtensionStubsInspection
112             */
113            libxml_use_internal_errors(true);
114
115            if ($type == self::XML_TYPE) {
116
117                $result = $this->xmlDom->loadXML($text, $options);
118
119            } else {
120
121                /**
122                 * Unlike loading XML, HTML does not have to be well-formed to load.
123                 * While malformed HTML should load successfully, this function may generate E_WARNING errors
124                 * @deprecated as we try to be XHTML compliantXML but yeah this is not always possible
125                 */
126
127                /**
128                 * Bug: Even if we set that the document is an UTF-8
129                 * loadHTML treat the string as being in ISO-8859-1 if without any heading
130                 * (ie <xml encoding="utf-8"..>
131                 * https://stackoverflow.com/questions/8218230/php-domdocument-loadhtml-not-encoding-utf-8-correctly
132                 * Otherwise French and other language are not well loaded
133                 *
134                 * We use the trick to transform UTF-8 to HTML
135                 */
136                $htmlEntityEncoded = mb_convert_encoding($text, 'HTML-ENTITIES', 'UTF-8');
137                $result = $this->xmlDom->loadHTML($htmlEntityEncoded, $options);
138
139            }
140            if ($result === false) {
141
142                /**
143                 * Error
144                 */
145                /** @noinspection PhpComposerExtensionStubsInspection */
146                $errors = libxml_get_errors();
147
148                foreach ($errors as $error) {
149
150                    /* @var LibXMLError
151                     * @noinspection PhpComposerExtensionStubsInspection
152                     *
153                     * Section is an html5 tag (and is invalid for libxml)
154                     */
155                    if (!in_array($error->message, self::KNOWN_HTML_LOADING_ERRORS)) {
156                        /**
157                         * This error is an XML and HTML error
158                         */
159                        if (
160                            strpos($error->message, "htmlParseEntityRef: expecting ';' in Entity") !== false
161                            ||
162                            $error->message == "EntityRef: expecting ';'\n"
163                        ) {
164                            $message = "There is big probability that there is an ampersand alone `&`. ie You forgot to call html/Xml entities in a `src` or `url` attribute.";
165                        } else {
166                            $message = "Error while loading HTML";
167                        }
168                        $message .= "Error: " . $error->message . ", Loaded text: " . $text;
169
170                        /**
171                         * We clean the errors, otherwise
172                         * in a test series, they failed the next test
173                         *
174                         * @noinspection PhpComposerExtensionStubsInspection
175                         */
176                        libxml_clear_errors();
177
178                        // In test, this will send a exception
179                        LogUtility::msg($message, LogUtility::LVL_MSG_ERROR, self::CANONICAL);
180
181                    }
182
183                }
184            }
185
186            /**
187             * We clean the known errors (otherwise they are added in a queue)
188             * @noinspection PhpComposerExtensionStubsInspection
189             */
190            libxml_clear_errors();
191
192            /**
193             * Error reporting back
194             */
195            if (!defined('DOKU_UNITTEST')) {
196                error_reporting($oldLevel);
197            }
198
199            // namespace error : Namespace prefix dc on format is not defined
200            // missing the ns declaration in the file. example:
201            // xmlns:dc="http://purl.org/dc/elements/1.1/"
202
203
204        } else {
205
206            /**
207             * If the XML module is not present
208             */
209            LogUtility::msg("The php `libxml` module was not found on your installation, the xml/svg file could not be modified / instantiated", LogUtility::LVL_MSG_ERROR, "support");
210
211
212        }
213
214    }
215
216    /**
217     * To not have a collusion with {@link SvgDocument::createSvgDocumentFromPath()}
218     * @param Path $path
219     * @return XmlDocument
220     */
221    public
222    static function createXmlDocFromPath(Path $path): XmlDocument
223    {
224        $mime = XmlDocument::XML_TYPE;
225        if (in_array($path->getExtension(), ["html", "htm"])) {
226            $mime = XmlDocument::HTML_TYPE;
227        }
228        $content = FileSystems::getContent($path);
229        return new XmlDocument($content, $mime);
230    }
231
232    public
233    static function createXmlDocFromMarkup($string): XmlDocument
234    {
235        $mime = XmlDocument::XML_TYPE;
236        return new XmlDocument($string, $mime);
237    }
238
239    public
240    function &getXmlDom()
241    {
242        return $this->xmlDom;
243    }
244
245    public
246    function setRootAttribute($string, $name)
247    {
248        if ($this->isXmlExtensionLoaded()) {
249            $this->xmlDom->documentElement->setAttribute($string, $name);
250        }
251    }
252
253    public function getXmlText()
254    {
255
256        $xmlText = $this->getXmlDom()->saveXML(
257            $this->getXmlDom()->documentElement,
258            LIBXML_NOXMLDECL // no xml declaration
259        );
260        // Delete doctype (for svg optimization)
261        // php has only doctype manipulation for HTML
262        $xmlText = preg_replace('/^<!DOCTYPE.+?>/', '', $xmlText);
263        return trim($xmlText);
264
265    }
266
267    /**
268     * https://www.php.net/manual/en/dom.installation.php
269     *
270     * Check it with
271     * ```
272     * php -m
273     * ```
274     * Install with
275     * ```
276     * sudo apt-get install php-xml
277     * ```
278     * @return bool
279     */
280    public
281    function isXmlExtensionLoaded()
282    {
283        // A suffix used in the bad message
284        $suffixBadMessage = "php extension is not installed. To install it, you need to install xml. Example: `sudo apt-get install php-xml`, `yum install php-xml`";
285
286        // https://www.php.net/manual/en/dom.requirements.php
287        $loaded = extension_loaded("libxml");
288        if ($loaded === false) {
289            LogUtility::msg("The libxml {$suffixBadMessage}");
290        } else {
291            $loaded = extension_loaded("xml");
292            if ($loaded === false) {
293                LogUtility::msg("The xml {$suffixBadMessage}");
294            } else {
295                $loaded = extension_loaded("dom");
296                if ($loaded === false) {
297                    LogUtility::msg("The dom {$suffixBadMessage}");
298                }
299            }
300        }
301        return $loaded;
302    }
303
304    /**
305     * https://stackoverflow.com/questions/30257438/how-to-completely-remove-a-namespace-using-domdocument
306     * @param $namespaceUri
307     */
308    function removeNamespace($namespaceUri)
309    {
310        if (empty($namespaceUri)) {
311            throw new \RuntimeException("The namespace is empty and should be specified");
312        }
313
314        if (strpos($namespaceUri, "http") === false) {
315            LogUtility::msg("Internal warning: The namespaceURI ($namespaceUri) does not seems to be an URI", LogUtility::LVL_MSG_WARNING, "support");
316        }
317
318        /**
319         * @var DOMNodeList $nodes
320         * finds all nodes that have a namespace node called $ns where their parent node doesn't also have the same namespace.
321         * @var DOMNodeList $nodes
322         */
323        $nodes = $this->xpath("//*[namespace-uri()='$namespaceUri']");
324        foreach ($nodes as $node) {
325            /** @var DOMElement $node */
326            $node->parentNode->removeChild($node);
327        }
328
329        $nodes = $this->xpath("//@*[namespace-uri()='$namespaceUri']");
330        foreach ($nodes as $node) {
331            /** @var DOMAttr $node */
332            /** @var DOMElement $DOMNode */
333            $DOMNode = $node->parentNode;
334            $DOMNode->removeAttributeNode($node);
335        }
336
337        //Node namespace can be select only from the document
338        $xpath = new DOMXPath($this->getXmlDom());
339        $DOMNodeList = $xpath->query("namespace::*", $this->getXmlDom()->ownerDocument);
340        foreach ($DOMNodeList as $node) {
341            $namespaceURI = $node->namespaceURI;
342            if ($namespaceURI == $namespaceUri) {
343                $parentNode = $node->parentNode;
344                $parentNode->removeAttributeNS($namespaceUri, $node->localName);
345            }
346        }
347
348
349    }
350
351    public
352    function getDocNamespaces()
353    {
354        $xpath = new DOMXPath($this->getXmlDom());
355        // `namespace::*` means selects all the namespace attribute of the context node
356        // namespace is an axes
357        // See https://www.w3.org/TR/1999/REC-xpath-19991116/#axes
358        // the namespace axis contains the namespace nodes of the context node; the axis will be empty unless the context node is an element
359        $DOMNodeList = $xpath->query('namespace::*', $this->getXmlDom()->ownerDocument);
360        $nameSpace = array();
361        foreach ($DOMNodeList as $node) {
362            /** @var DOMElement $node */
363
364            $namespaceURI = $node->namespaceURI;
365            $localName = $node->prefix;
366            if ($namespaceURI != null) {
367                $nameSpace[$localName] = $namespaceURI;
368            }
369        }
370        return $nameSpace;
371    }
372
373    /**
374     * A wrapper that register namespace for the query
375     * with the defined prefix
376     * See comment:
377     * https://www.php.net/manual/en/domxpath.registernamespace.php#51480
378     * @param $query
379     * @param string $defaultNamespace
380     * @return DOMNodeList|false
381     */
382    public
383    function xpath($query)
384    {
385        $xpath = new DOMXPath($this->getXmlDom());
386        foreach ($this->getDocNamespaces() as $prefix => $namespaceUri) {
387            /**
388             * You can't register an empty prefix
389             * Default namespace (without a prefix) can only be accessed by the local-name() and namespace-uri() attributes.
390             */
391            if (!empty($prefix)) {
392                $result = $xpath->registerNamespace($prefix, $namespaceUri);
393                if (!$result) {
394                    LogUtility::msg("Not able to register the prefix ($prefix) for the namespace uri ($namespaceUri)");
395                }
396            }
397        }
398
399        return $xpath->query($query);
400
401    }
402
403
404    public
405    function removeRootAttribute($attribute)
406    {
407
408        // This function does not work
409        // $result = $this->getXmlDom()->documentElement->removeAttribute($attribute);
410
411        for ($i = 0; $i < $this->getXmlDom()->documentElement->attributes->length; $i++) {
412            if ($this->getXmlDom()->documentElement->attributes[$i]->name == $attribute) {
413                $result = $this->getXmlDom()->documentElement->removeAttributeNode($this->getXmlDom()->documentElement->attributes[$i]);
414                if ($result === false) {
415                    throw new \RuntimeException("Not able to delete the $attribute");
416                }
417                // There is no break here because you may find multiple version attribute for instance
418            }
419        }
420
421    }
422
423    public
424    function removeRootChildNode($nodeName)
425    {
426        for ($i = 0; $i < $this->getXmlDom()->documentElement->childNodes->length; $i++) {
427            $childNode = &$this->getXmlDom()->documentElement->childNodes[$i];
428            if ($childNode->nodeName == $nodeName) {
429                $result = $this->getXmlDom()->documentElement->removeChild($childNode);
430                if ($result === false) {
431                    throw new \RuntimeException("Not able to delete the child node $nodeName");
432                }
433                break;
434            }
435        }
436    }
437
438    /**
439     *
440     * Add a value to an attribute value
441     * Example
442     * <a class="actual">
443     *
444     * if you add "new"
445     * <a class="actual new">
446     *
447     * @param $attName
448     * @param $attValue
449     * @param DOMElement $xml
450     */
451    public
452    function addAttributeValue($attName, $attValue, $xml)
453    {
454
455        /**
456         * Empty condition is better than {@link DOMElement::hasAttribute()}
457         * because even if the dom element has the attribute, the value
458         * may be empty
459         */
460        $value = $xml->getAttribute($attName);
461        if (empty($value)) {
462            $xml->setAttribute($attName, $attValue);
463        } else {
464            $actualAttValue = $xml->getAttribute($attName);
465            $explodeArray = explode(" ", $actualAttValue);
466            if (!in_array($attValue, $explodeArray)) {
467                $xml->setAttribute($attName, (string)$actualAttValue . " $attValue");
468            }
469        }
470
471    }
472
473    public function diff(XmlDocument $rightDocument)
474    {
475        $error = "";
476        XmlUtility::diffNode($this->getXmlDom(), $rightDocument->getXmlDom(), $error);
477        return $error;
478    }
479
480    /**
481     * @return string a XML formatted
482     *
483     * !!!! The parameter preserveWhiteSpace should have been set to false before loading
484     * https://www.php.net/manual/en/class.domdocument.php#domdocument.props.formatoutput
485     * $this->xmlDom->preserveWhiteSpace = false;
486     *
487     * We do it with the function {@link XmlDocument::mandatoryFormatConfigBeforeLoading()}
488     *
489     */
490    public function getXmlTextFormatted()
491    {
492
493        $this->xmlDom->formatOutput = true;
494        return $this->getXmlText();
495
496    }
497
498    /**
499     * @return string that can be diff
500     *   * EOL diff are not seen
501     *   * space are
502     *
503     * See also {@link XmlDocument::processTextBeforeLoading()}
504     * that is needed before loading
505     */
506    public function getXmlTextNormalized()
507    {
508
509        /**
510         * If the text was a list
511         * of sibling text without parent
512         * We may get a body
513         * @deprecated letting the code until
514         * TODO: delete this code when the test pass
515         */
516//        $body = $doc->getElementsByTagName("body");
517//        if ($body->length != 0) {
518//            $DOMNodeList = $body->item(0)->childNodes;
519//            $output = "";
520//            foreach ($DOMNodeList as $value) {
521//                $output .= $doc->saveXML($value) . DOKU_LF;
522//            }
523//        }
524
525        $this->xmlDom->documentElement->normalize();
526        return $this->getXmlTextFormatted();
527    }
528
529    /**
530     * Not really conventional but
531     * to be able to {@link getXmlTextNormalized}
532     * the EOL should be deleted
533     * We do it before loading and not with a XML documentation
534     */
535    private function processTextBeforeLoading($text)
536    {
537        $text = str_replace(DOKU_LF, "", $text);
538        $text = preg_replace("/\r\n\s*\r\n/", "\r\n", $text);
539        $text = preg_replace("/\n\s*\n/", "\n", $text);
540        $text = preg_replace("/\n\n/", "\n", $text);
541        return $text;
542
543    }
544
545
546    /**
547     * This function is called just before loading
548     * in order to be able to {@link XmlDocument::getXmlTextFormatted() format the output }
549     * https://www.php.net/manual/en/class.domdocument.php#domdocument.props.formatoutput
550     * Mandatory for a a good formatting before loading
551     *
552     */
553    private function mandatoryFormatConfigBeforeLoading()
554    {
555        // not that
556        // the loading option: LIBXML_NOBLANKS
557        // is equivalent to $this->xmlDom->preserveWhiteSpace = true;
558        $this->xmlDom->preserveWhiteSpace = false;
559    }
560
561
562}
563