Re: Regression with large XML data input - Mailing list pgsql-hackers

From Tom Lane
Subject Re: Regression with large XML data input
Date
Msg-id 628881.1753733767@sss.pgh.pa.us
Whole thread Raw
In response to Re: Regression with large XML data input  (Tom Lane <tgl@sss.pgh.pa.us>)
Responses Re: Regression with large XML data input
Re: Regression with large XML data input
List pgsql-hackers
I wrote:
> I've not looked at the details of the proposed patches, but will
> do so now that the direction to go in is apparent.

Erik's v2 is slightly wrong as to the save-and-restore logic for
the KeepBlanks setting: we need to restore in the error path too,
and we'd better mark the save variable volatile since it's modified
inside the PG_TRY.  I made some other cosmetic changes, mainly to
avoid calculating "options" when it won't be used.  I tested the
attached v3 against RHEL8's libxml2-2.9.7, as well as against today's
libxml2 git master, and it accepts the problematic input on both.

            regards, tom lane

diff --git a/src/backend/utils/adt/xml.c b/src/backend/utils/adt/xml.c
index f7b731825fc..3379d392260 100644
--- a/src/backend/utils/adt/xml.c
+++ b/src/backend/utils/adt/xml.c
@@ -1769,7 +1769,7 @@ xml_doctype_in_content(const xmlChar *str)
  * xmloption_arg, but a DOCTYPE node in the input can force DOCUMENT mode).
  *
  * If parsed_nodes isn't NULL and we parse in CONTENT mode, the list
- * of parsed nodes from the xmlParseInNodeContext call will be returned
+ * of parsed nodes from the xmlParseBalancedChunkMemory call will be returned
  * to *parsed_nodes.  (It is caller's responsibility to free that.)
  *
  * Errors normally result in ereport(ERROR), but if escontext is an
@@ -1795,6 +1795,7 @@ xml_parse(text *data, XmlOptionType xmloption_arg,
     PgXmlErrorContext *xmlerrcxt;
     volatile xmlParserCtxtPtr ctxt = NULL;
     volatile xmlDocPtr doc = NULL;
+    volatile int save_keep_blanks = -1;
 
     /*
      * This step looks annoyingly redundant, but we must do it to have a
@@ -1822,7 +1823,6 @@ xml_parse(text *data, XmlOptionType xmloption_arg,
     PG_TRY();
     {
         bool        parse_as_document = false;
-        int            options;
         int            res_code;
         size_t        count = 0;
         xmlChar    *version = NULL;
@@ -1853,18 +1853,6 @@ xml_parse(text *data, XmlOptionType xmloption_arg,
                 parse_as_document = true;
         }
 
-        /*
-         * Select parse options.
-         *
-         * Note that here we try to apply DTD defaults (XML_PARSE_DTDATTR)
-         * according to SQL/XML:2008 GR 10.16.7.d: 'Default values defined by
-         * internal DTD are applied'.  As for external DTDs, we try to support
-         * them too (see SQL/XML:2008 GR 10.16.7.e), but that doesn't really
-         * happen because xmlPgEntityLoader prevents it.
-         */
-        options = XML_PARSE_NOENT | XML_PARSE_DTDATTR
-            | (preserve_whitespace ? 0 : XML_PARSE_NOBLANKS);
-
         /* initialize output parameters */
         if (parsed_xmloptiontype != NULL)
             *parsed_xmloptiontype = parse_as_document ? XMLOPTION_DOCUMENT :
@@ -1874,11 +1862,26 @@ xml_parse(text *data, XmlOptionType xmloption_arg,
 
         if (parse_as_document)
         {
+            int            options;
+
+            /* set up parser context used by xmlCtxtReadDoc */
             ctxt = xmlNewParserCtxt();
             if (ctxt == NULL || xmlerrcxt->err_occurred)
                 xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY,
                             "could not allocate parser context");
 
+            /*
+             * Select parse options.
+             *
+             * Note that here we try to apply DTD defaults (XML_PARSE_DTDATTR)
+             * according to SQL/XML:2008 GR 10.16.7.d: 'Default values defined
+             * by internal DTD are applied'.  As for external DTDs, we try to
+             * support them too (see SQL/XML:2008 GR 10.16.7.e), but that
+             * doesn't really happen because xmlPgEntityLoader prevents it.
+             */
+            options = XML_PARSE_NOENT | XML_PARSE_DTDATTR
+                | (preserve_whitespace ? 0 : XML_PARSE_NOBLANKS);
+
             doc = xmlCtxtReadDoc(ctxt, utf8string,
                                  NULL,    /* no URL */
                                  "UTF-8",
@@ -1900,10 +1903,7 @@ xml_parse(text *data, XmlOptionType xmloption_arg,
         }
         else
         {
-            xmlNodePtr    root;
-            xmlNodePtr    oldroot PG_USED_FOR_ASSERTS_ONLY;
-
-            /* set up document with empty root node to be the context node */
+            /* set up document that xmlParseBalancedChunkMemory will add to */
             doc = xmlNewDoc(version);
             if (doc == NULL || xmlerrcxt->err_occurred)
                 xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY,
@@ -1916,36 +1916,23 @@ xml_parse(text *data, XmlOptionType xmloption_arg,
                             "could not allocate XML document");
             doc->standalone = standalone;
 
-            root = xmlNewNode(NULL, (const xmlChar *) "content-root");
-            if (root == NULL || xmlerrcxt->err_occurred)
-                xml_ereport(xmlerrcxt, ERROR, ERRCODE_OUT_OF_MEMORY,
-                            "could not allocate xml node");
-
-            /*
-             * This attaches root to doc, so we need not free it separately;
-             * and there can't yet be any old root to free.
-             */
-            oldroot = xmlDocSetRootElement(doc, root);
-            Assert(oldroot == NULL);
+            /* set parse options --- have to do this the ugly way */
+            save_keep_blanks = xmlKeepBlanksDefault(preserve_whitespace ? 1 : 0);
 
             /* allow empty content */
             if (*(utf8string + count))
             {
                 xmlNodePtr    node_list = NULL;
-                xmlParserErrors res;
-
-                res = xmlParseInNodeContext(root,
-                                            (char *) utf8string + count,
-                                            strlen((char *) utf8string + count),
-                                            options,
-                                            &node_list);
 
-                if (res != XML_ERR_OK || xmlerrcxt->err_occurred)
+                res_code = xmlParseBalancedChunkMemory(doc, NULL, NULL, 0,
+                                                       utf8string + count,
+                                                       &node_list);
+                if (res_code != 0 || xmlerrcxt->err_occurred)
                 {
-                    xmlFreeNodeList(node_list);
                     xml_errsave(escontext, xmlerrcxt,
                                 ERRCODE_INVALID_XML_CONTENT,
                                 "invalid XML content");
+                    xmlFreeNodeList(node_list);
                     goto fail;
                 }
 
@@ -1961,6 +1948,8 @@ fail:
     }
     PG_CATCH();
     {
+        if (save_keep_blanks != -1)
+            xmlKeepBlanksDefault(save_keep_blanks);
         if (doc != NULL)
             xmlFreeDoc(doc);
         if (ctxt != NULL)
@@ -1972,6 +1961,9 @@ fail:
     }
     PG_END_TRY();
 
+    if (save_keep_blanks != -1)
+        xmlKeepBlanksDefault(save_keep_blanks);
+
     if (ctxt != NULL)
         xmlFreeParserCtxt(ctxt);


pgsql-hackers by date:

Previous
From: Tom Lane
Date:
Subject: Re: Making pgfdw_report_error statically analyzable
Next
From: Dimitrios Apostolou
Date:
Subject: Re: [PING] fallocate() causes btrfs to never compress postgresql files