Re: Adding xpath_exists function - Mailing list pgsql-hackers

From Mike Fowler
Subject Re: Adding xpath_exists function
Date
Msg-id 4BE921A7.7060401@mlfowler.com
Whole thread Raw
In response to Re: Adding xpath_exists function  (Robert Haas <robertmhaas@gmail.com>)
Responses Re: Adding xpath_exists function
List pgsql-hackers
Robert Haas wrote:
> Please email your patch to the list (replying to this email is fine)
> and add it here:
> https://commitfest.postgresql.org/action/commitfest_view/open
>
Here's my patch, developed against HEAD, that adds the function
'xpath_exists'. The function is a lot simpler than originally thought,
so none of the string manipulation previously discussed was required.
I've also included some regression tests that test the function with and
without xml namespaces. I should  note that before I added my tests all
existing tests passed.

One observation that can be made is that I've largely copied the
existing xpath function and altered it to use a different method from
the libxml API. I've done it to save me redoing all the namespace
handling, however it's apparent to me that if we wanted to expose more
of the libxml api we will quickly start having a lot of duplicate code.
I notice that refactoring existing code whilst adding new code is
generally frowned upon, so once this patch is accepted I will look to
refactor the xpath and xpath_exists function. I could even add an
xpath_count method at the same time ;) .

Thanks in advance for any and all feedback,

--
Mike Fowler
Registered Linux user: 379787

"I could be a genius if I just put my mind to it, and I,
I could do anything, if only I could get 'round to it"
-PULP 'Glory Days'

Index: src/backend/utils/adt/xml.c
===================================================================
RCS file: /home/mfowler/cvsrepo/pgrepo/pgsql/src/backend/utils/adt/xml.c,v
retrieving revision 1.97
diff -c -r1.97 xml.c
*** src/backend/utils/adt/xml.c    3 Mar 2010 17:29:45 -0000    1.97
--- src/backend/utils/adt/xml.c    11 May 2010 07:54:53 -0000
***************
*** 3495,3497 ****
--- 3495,3670 ----
      return 0;
  #endif
  }
+
+ /*
+  * Determines if the node specified by the supplied XPath exists
+  * in a given XML document, returning a boolean.
+  *
+  * It is up to the user to ensure that the XML passed is in fact
+  * an XML document - XPath doesn't work easily on fragments without
+  * a context node being known.
+  */
+ Datum
+ xpath_exists(PG_FUNCTION_ARGS)
+ {
+ #ifdef USE_LIBXML
+     text       *xpath_expr_text = PG_GETARG_TEXT_P(0);
+     xmltype    *data = PG_GETARG_XML_P(1);
+     ArrayType  *namespaces = PG_GETARG_ARRAYTYPE_P(2);
+     ArrayBuildState *astate = NULL;
+     xmlParserCtxtPtr ctxt = NULL;
+     xmlDocPtr    doc = NULL;
+     xmlXPathContextPtr xpathctx = NULL;
+     xmlXPathCompExprPtr xpathcomp = NULL;
+     char       *datastr;
+     int32        len;
+     int32        xpath_len;
+     xmlChar    *string;
+     xmlChar    *xpath_expr;
+     int            i;
+     int            res_nitems;
+     int            ndim;
+     Datum       *ns_names_uris;
+     bool       *ns_names_uris_nulls;
+     int            ns_count;
+     int            result;
+
+     /*
+      * Namespace mappings are passed as text[].  If an empty array is passed
+      * (ndim = 0, "0-dimensional"), then there are no namespace mappings.
+      * Else, a 2-dimensional array with length of the second axis being equal
+      * to 2 should be passed, i.e., every subarray contains 2 elements, the
+      * first element defining the name, the second one the URI.  Example:
+      * ARRAY[ARRAY['myns', 'http://example.com'], ARRAY['myns2',
+      * 'http://example2.com']].
+      */
+     ndim = ARR_NDIM(namespaces);
+     if (ndim != 0)
+     {
+         int           *dims;
+
+         dims = ARR_DIMS(namespaces);
+
+         if (ndim != 2 || dims[1] != 2)
+             ereport(ERROR,
+                     (errcode(ERRCODE_DATA_EXCEPTION),
+                      errmsg("invalid array for XML namespace mapping"),
+                      errdetail("The array must be two-dimensional with length of the second axis equal to 2.")));
+
+         Assert(ARR_ELEMTYPE(namespaces) == TEXTOID);
+
+         deconstruct_array(namespaces, TEXTOID, -1, false, 'i',
+                           &ns_names_uris, &ns_names_uris_nulls,
+                           &ns_count);
+
+         Assert((ns_count % 2) == 0);    /* checked above */
+         ns_count /= 2;            /* count pairs only */
+     }
+     else
+     {
+         ns_names_uris = NULL;
+         ns_names_uris_nulls = NULL;
+         ns_count = 0;
+     }
+
+     datastr = VARDATA(data);
+     len = VARSIZE(data) - VARHDRSZ;
+     xpath_len = VARSIZE(xpath_expr_text) - VARHDRSZ;
+     if (xpath_len == 0)
+         ereport(ERROR,
+                 (errcode(ERRCODE_DATA_EXCEPTION),
+                  errmsg("empty XPath expression")));
+
+     string = (xmlChar *) palloc((len + 1) * sizeof(xmlChar));
+     memcpy(string, datastr, len);
+     string[len] = '\0';
+
+     xpath_expr = (xmlChar *) palloc((xpath_len + 1) * sizeof(xmlChar));
+     memcpy(xpath_expr, VARDATA(xpath_expr_text), xpath_len);
+     xpath_expr[xpath_len] = '\0';
+
+     pg_xml_init();
+     xmlInitParser();
+
+     PG_TRY();
+     {
+         /*
+          * redundant XML parsing (two parsings for the same value during one
+          * command execution are possible)
+          */
+         ctxt = xmlNewParserCtxt();
+         if (ctxt == NULL)
+             xml_ereport(ERROR, ERRCODE_OUT_OF_MEMORY,
+                         "could not allocate parser context");
+         doc = xmlCtxtReadMemory(ctxt, (char *) string, len, NULL, NULL, 0);
+         if (doc == NULL)
+             xml_ereport(ERROR, ERRCODE_INVALID_XML_DOCUMENT,
+                         "could not parse XML document");
+         xpathctx = xmlXPathNewContext(doc);
+         if (xpathctx == NULL)
+             xml_ereport(ERROR, ERRCODE_OUT_OF_MEMORY,
+                         "could not allocate XPath context");
+         xpathctx->node = xmlDocGetRootElement(doc);
+         if (xpathctx->node == NULL)
+             xml_ereport(ERROR, ERRCODE_INTERNAL_ERROR,
+                         "could not find root XML element");
+
+         /* register namespaces, if any */
+         if (ns_count > 0)
+         {
+             for (i = 0; i < ns_count; i++)
+             {
+                 char       *ns_name;
+                 char       *ns_uri;
+
+                 if (ns_names_uris_nulls[i * 2] ||
+                     ns_names_uris_nulls[i * 2 + 1])
+                     ereport(ERROR,
+                             (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+                       errmsg("neither namespace name nor URI may be null")));
+                 ns_name = TextDatumGetCString(ns_names_uris[i * 2]);
+                 ns_uri = TextDatumGetCString(ns_names_uris[i * 2 + 1]);
+                 if (xmlXPathRegisterNs(xpathctx,
+                                        (xmlChar *) ns_name,
+                                        (xmlChar *) ns_uri) != 0)
+                     ereport(ERROR,        /* is this an internal error??? */
+                             (errmsg("could not register XML namespace with name \"%s\" and URI \"%s\"",
+                                     ns_name, ns_uri)));
+             }
+         }
+
+         xpathcomp = xmlXPathCompile(xpath_expr);
+         if (xpathcomp == NULL)    /* TODO: show proper XPath error details */
+             xml_ereport(ERROR, ERRCODE_INTERNAL_ERROR,
+                         "invalid XPath expression");
+
+         result = xmlXPathCompiledEvalToBoolean(xpathcomp, xpathctx);
+         if (result == -1)    /* TODO: reason? */
+             xml_ereport(ERROR, ERRCODE_INTERNAL_ERROR,
+                         "could not create XPath object");
+     }
+     PG_CATCH();
+     {
+         if (xpathcomp)
+             xmlXPathFreeCompExpr(xpathcomp);
+         if (xpathctx)
+             xmlXPathFreeContext(xpathctx);
+         if (doc)
+             xmlFreeDoc(doc);
+         if (ctxt)
+             xmlFreeParserCtxt(ctxt);
+         PG_RE_THROW();
+     }
+     PG_END_TRY();
+
+     xmlXPathFreeCompExpr(xpathcomp);
+     xmlXPathFreeContext(xpathctx);
+     xmlFreeDoc(doc);
+     xmlFreeParserCtxt(ctxt);
+
+     PG_RETURN_BOOL(result);
+ #else
+     NO_XML_SUPPORT();
+     return 0;
+ #endif
+ }
Index: src/include/catalog/pg_proc.h
===================================================================
RCS file: /home/mfowler/cvsrepo/pgrepo/pgsql/src/include/catalog/pg_proc.h,v
retrieving revision 1.570
diff -c -r1.570 pg_proc.h
*** src/include/catalog/pg_proc.h    26 Feb 2010 02:01:21 -0000    1.570
--- src/include/catalog/pg_proc.h    11 May 2010 07:46:09 -0000
***************
*** 4385,4390 ****
--- 4385,4395 ----
  DATA(insert OID = 2932 (  xpath         PGNSP PGUID 14 1 0 0 f f f t f i 2 0 143 "25 142" _null_ _null_ _null_ _null_
"selectpg_catalog.xpath($1, $2, ''{}''::pg_catalog.text[])" _null_ _null_ _null_ )); 
  DESCR("evaluate XPath expression");

+ DATA(insert OID = 3037 (  xpath_exists     PGNSP PGUID 12 1 0 0 f f f t f i 3 0 16 "25 142 1009" _null_ _null_ _null_
_null_xpath_exists _null_ _null_ _null_ )); 
+ DESCR("evaluate XPath expression in a boolean context, with namespaces support");
+ DATA(insert OID = 3038 (  xpath_exists     PGNSP PGUID 14 1 0 0 f f f t f i 2 0 16 "25 142" _null_ _null_ _null_
_null_"select pg_catalog.xpath_exists($1, $2, ''{}''::pg_catalog.text[])" _null_ _null_ _null_ )); 
+ DESCR("evaluate XPath expression in a boolean context");
+
  /* uuid */
  DATA(insert OID = 2952 (  uuid_in           PGNSP PGUID 12 1 0 0 f f f t f i 1 0 2950 "2275" _null_ _null_ _null_
_null_uuid_in _null_ _null_ _null_ )); 
  DESCR("I/O");
Index: src/include/utils/xml.h
===================================================================
RCS file: /home/mfowler/cvsrepo/pgrepo/pgsql/src/include/utils/xml.h,v
retrieving revision 1.31
diff -c -r1.31 xml.h
*** src/include/utils/xml.h    3 Mar 2010 17:29:45 -0000    1.31
--- src/include/utils/xml.h    10 May 2010 21:28:48 -0000
***************
*** 37,42 ****
--- 37,43 ----
  extern Datum xmltotext(PG_FUNCTION_ARGS);
  extern Datum xmlvalidate(PG_FUNCTION_ARGS);
  extern Datum xpath(PG_FUNCTION_ARGS);
+ extern Datum xpath_exists(PG_FUNCTION_ARGS);

  extern Datum table_to_xml(PG_FUNCTION_ARGS);
  extern Datum query_to_xml(PG_FUNCTION_ARGS);
Index: src/test/regress/expected/xml.out
===================================================================
RCS file: /home/mfowler/cvsrepo/pgrepo/pgsql/src/test/regress/expected/xml.out,v
retrieving revision 1.25
diff -c -r1.25 xml.out
*** src/test/regress/expected/xml.out    9 Jun 2009 22:00:57 -0000    1.25
--- src/test/regress/expected/xml.out    11 May 2010 09:04:20 -0000
***************
*** 502,504 ****
--- 502,545 ----
   {<b>two</b>,<b>etc</b>}
  (1 row)

+ -- Text xpath_exists evalutation
+ INSERT INTO xmltest VALUES (4,
'<menu><beers><name>Budvar</name><cost>free</cost><name>Carling</name><cost>lots</cost></beers></menu>'::xml);
+ INSERT INTO xmltest VALUES (5,
'<menu><beers><name>Molson</name><cost>free</cost><name>Carling</name><cost>lots</cost></beers></menu>'::xml);
+ INSERT INTO xmltest VALUES (6, '<myns:menu
xmlns:myns="http://myns.com"><myns:beers><myns:name>Budvar</myns:name><myns:cost>free</myns:cost><myns:name>Carling</myns:name><myns:cost>lots</myns:cost></myns:beers></myns:menu>'::xml);
+ INSERT INTO xmltest VALUES (7, '<myns:menu
xmlns:myns="http://myns.com"><myns:beers><myns:name>Molson</myns:name><myns:cost>free</myns:cost><myns:name>Carling</myns:name><myns:cost>lots</myns:cost></myns:beers></myns:menu>'::xml);
+ SELECT COUNT(id) FROM xmltest WHERE xpath_exists('/menu/beer',data);
+  count
+ -------
+      0
+ (1 row)
+
+ SELECT COUNT(id) FROM xmltest WHERE xpath_exists('/menu/beers',data);
+  count
+ -------
+      2
+ (1 row)
+
+ SELECT COUNT(id) FROM xmltest WHERE xpath_exists('/menu/beers/name[text() = ''Molson'']',data);
+  count
+ -------
+      1
+ (1 row)
+
+ SELECT COUNT(id) FROM xmltest WHERE xpath_exists('/myns:menu/myns:beer',data,ARRAY[ARRAY['myns','http://myns.com']]);
+  count
+ -------
+      0
+ (1 row)
+
+ SELECT COUNT(id) FROM xmltest WHERE
xpath_exists('/myns:menu/myns:beers',data,ARRAY[ARRAY['myns','http://myns.com']]);
+  count
+ -------
+      2
+ (1 row)
+
+ SELECT COUNT(id) FROM xmltest WHERE xpath_exists('/myns:menu/myns:beers/myns:name[text() =
''Molson'']',data,ARRAY[ARRAY['myns','http://myns.com']]);
+  count
+ -------
+      1
+ (1 row)
+
Index: src/test/regress/sql/xml.sql
===================================================================
RCS file: /home/mfowler/cvsrepo/pgrepo/pgsql/src/test/regress/sql/xml.sql,v
retrieving revision 1.19
diff -c -r1.19 xml.sql
*** src/test/regress/sql/xml.sql    9 Jun 2009 22:00:57 -0000    1.19
--- src/test/regress/sql/xml.sql    11 May 2010 08:18:52 -0000
***************
*** 163,165 ****
--- 163,178 ----
  SELECT xpath('//text()', '<local:data xmlns:local="http://127.0.0.1"><local:piece id="1">number
one</local:piece><local:pieceid="2" /></local:data>'); 
  SELECT xpath('//loc:piece/@id', '<local:data xmlns:local="http://127.0.0.1"><local:piece id="1">number
one</local:piece><local:pieceid="2" /></local:data>', ARRAY[ARRAY['loc', 'http://127.0.0.1']]); 
  SELECT xpath('//b', '<a>one <b>two</b> three <b>etc</b></a>');
+
+ -- Text xpath_exists evalutation
+ INSERT INTO xmltest VALUES (4,
'<menu><beers><name>Budvar</name><cost>free</cost><name>Carling</name><cost>lots</cost></beers></menu>'::xml);
+ INSERT INTO xmltest VALUES (5,
'<menu><beers><name>Molson</name><cost>free</cost><name>Carling</name><cost>lots</cost></beers></menu>'::xml);
+ INSERT INTO xmltest VALUES (6, '<myns:menu
xmlns:myns="http://myns.com"><myns:beers><myns:name>Budvar</myns:name><myns:cost>free</myns:cost><myns:name>Carling</myns:name><myns:cost>lots</myns:cost></myns:beers></myns:menu>'::xml);
+ INSERT INTO xmltest VALUES (7, '<myns:menu
xmlns:myns="http://myns.com"><myns:beers><myns:name>Molson</myns:name><myns:cost>free</myns:cost><myns:name>Carling</myns:name><myns:cost>lots</myns:cost></myns:beers></myns:menu>'::xml);
+
+ SELECT COUNT(id) FROM xmltest WHERE xpath_exists('/menu/beer',data);
+ SELECT COUNT(id) FROM xmltest WHERE xpath_exists('/menu/beers',data);
+ SELECT COUNT(id) FROM xmltest WHERE xpath_exists('/menu/beers/name[text() = ''Molson'']',data);
+ SELECT COUNT(id) FROM xmltest WHERE xpath_exists('/myns:menu/myns:beer',data,ARRAY[ARRAY['myns','http://myns.com']]);
+ SELECT COUNT(id) FROM xmltest WHERE
xpath_exists('/myns:menu/myns:beers',data,ARRAY[ARRAY['myns','http://myns.com']]);
+ SELECT COUNT(id) FROM xmltest WHERE xpath_exists('/myns:menu/myns:beers/myns:name[text() =
''Molson'']',data,ARRAY[ARRAY['myns','http://myns.com']]);
\ No newline at end of file

pgsql-hackers by date:

Previous
From: Heikki Linnakangas
Date:
Subject: Re: Patch for PKST timezone
Next
From: Robert Haas
Date:
Subject: Re: Partitioning/inherited tables vs FKs