[PATCH] Re: Issue: Deprecation of the XML2 module 'xml_is_well_formed' function - Mailing list pgsql-hackers

From Mike Fowler
Subject [PATCH] Re: Issue: Deprecation of the XML2 module 'xml_is_well_formed' function
Date
Msg-id 4C349F44.9020505@mlfowler.com
Whole thread Raw
In response to Re: Issue: Deprecation of the XML2 module 'xml_is_well_formed' function  (Peter Eisentraut <peter_e@gmx.net>)
Responses Re: [PATCH] Re: Issue: Deprecation of the XML2 module 'xml_is_well_formed' function
List pgsql-hackers
Peter Eisentraut wrote:
> On lör, 2010-07-03 at 09:26 +0100, Mike Fowler wrote:
>
>> What I will do
>> instead is implement the xml_is_well_formed function and get a patch
>> out in the next day or two.
>>
>
> That sounds very useful.
>
Here's the patch to add the 'xml_is_well_formed' function. Paraphrasing
the SGML the syntax is:

|xml_is_well_formed|(/text/)

The function |xml_is_well_formed| evaluates whether the /text/ is well
formed XML content, returning a boolean. I've done some tests (included
in the patch) with tables containing a mixture of well formed documents
and content and the function is happily returning the expected result.
Combining with IS (NOT) DOCUMENT is working nicely for pulling out
content or documents from a table of text.

Unless I missed something in the original correspondence, I think this
patch will solve the issue.

Regards,

--
Mike Fowler
Registered Linux user: 379787

*** a/doc/src/sgml/func.sgml
--- b/doc/src/sgml/func.sgml
***************
*** 8554,8562 **** SELECT xmlagg(x) FROM (SELECT * FROM test ORDER BY y DESC) AS tab;
  ]]></screen>
      </para>
     </sect3>

     <sect3>
!     <title>XML Predicates</title>

      <indexterm>
       <primary>IS DOCUMENT</primary>
--- 8554,8566 ----
  ]]></screen>
      </para>
     </sect3>
+   </sect2>
+
+   <sect2>
+    <title>XML Predicates</title>

     <sect3>
!     <title>IS DOCUMENT</title>

      <indexterm>
       <primary>IS DOCUMENT</primary>
***************
*** 8574,8579 **** SELECT xmlagg(x) FROM (SELECT * FROM test ORDER BY y DESC) AS tab;
--- 8578,8653 ----
       between documents and content fragments.
      </para>
     </sect3>
+
+    <sect3>
+     <title>xml_is_well_formed</title>
+
+     <indexterm>
+      <primary>xml_is_well_formed</primary>
+      <secondary>well formed</secondary>
+     </indexterm>
+
+ <synopsis>
+ <function>xml_is_well_formed</function>(<replaceable>text</replaceable>)
+ </synopsis>
+
+     <para>
+      The function <function>xml_is_well_formed</function> evaluates whether
+      the <replaceable>text</replaceable> is well formed XML content, returning
+      a boolean.
+     </para>
+     <para>
+     Example:
+ <screen><![CDATA[
+ SELECT xml_is_well_formed('<foo>bar</foo>');
+  xml_is_well_formed
+ --------------------
+  t
+ (1 row)
+
+ SELECT xml_is_well_formed('<foo>bar</foo');
+  xml_is_well_formed
+ --------------------
+  f
+ (1 row)
+ ]]></screen>
+     </para>
+     <para>
+     This function can be combined with the IS DOCUMENT predicate to prevent
+     invalid XML content errors from occuring in queries. For example, given a
+     table that may have rows with invalid XML mixed in with rows of valid
+     XML, <function>xml_is_well_formed</function> can be used to filter out all
+     the invalid rows.
+     </para>
+     <para>
+     Example:
+ <screen><![CDATA[
+ SELECT * FROM mixed;
+              data
+ ------------------------------
+  <foo>bar</foo>
+  <foo>bar</foo
+  <foo>bar</foo><bar>foo</bar>
+  <foo>bar</foo><bar>foo</bar
+ (4 rows)
+
+ SELECT COUNT(data) FROM mixed WHERE data::xml IS DOCUMENT;
+ ERROR:  invalid XML content
+ DETAIL:  Entity: line 1: parser error : expected '>'
+ <foo>bar</foo
+              ^
+ Entity: line 1: parser error : chunk is not well balanced
+ <foo>bar</foo
+              ^
+
+ SELECT COUNT(data) FROM mixed WHERE xml_is_well_formed(data) AND data::xml IS DOCUMENT;
+  count
+ -------
+      1
+ (1 row)
+ ]]></screen>
+     </para>
+    </sect3>
    </sect2>

    <sect2 id="functions-xml-processing">
*** a/src/backend/utils/adt/xml.c
--- b/src/backend/utils/adt/xml.c
***************
*** 3293,3298 **** xml_xmlnodetoxmltype(xmlNodePtr cur)
--- 3293,3365 ----
  }
  #endif

+ Datum
+ xml_is_well_formed(PG_FUNCTION_ARGS)
+ {
+ #ifdef USE_LIBXML
+     text                *data = PG_GETARG_TEXT_P(0);
+     bool                result;
+     int                    res_code;
+     int32                len;
+     const xmlChar        *string;
+     xmlParserCtxtPtr    ctxt;
+     xmlDocPtr            doc = NULL;
+
+     len = VARSIZE(data) - VARHDRSZ;
+     string = xml_text2xmlChar(data);
+
+     /* Start up libxml and its parser (no-ops if already done) */
+     pg_xml_init();
+     xmlInitParser();
+
+     ctxt = xmlNewParserCtxt();
+     if (ctxt == NULL)
+         xml_ereport(ERROR, ERRCODE_OUT_OF_MEMORY,
+                     "could not allocate parser context");
+
+     PG_TRY();
+     {
+         size_t        count;
+         xmlChar    *version = NULL;
+         int            standalone = -1;
+
+         res_code = parse_xml_decl(string, &count, &version, NULL, &standalone);
+         if (res_code != 0)
+             xml_ereport_by_code(ERROR, ERRCODE_INVALID_XML_CONTENT,
+                           "invalid XML content: invalid XML declaration",
+                             res_code);
+
+         doc = xmlNewDoc(version);
+         doc->encoding = xmlStrdup((const xmlChar *) "UTF-8");
+         doc->standalone = 1;
+
+         res_code = xmlParseBalancedChunkMemory(doc, NULL, NULL, 0, string + count, NULL);
+
+         result = !res_code;
+     }
+     PG_CATCH();
+     {
+         if (doc)
+             xmlFreeDoc(doc);
+         if (ctxt)
+             xmlFreeParserCtxt(ctxt);
+
+         PG_RE_THROW();
+     }
+     PG_END_TRY();
+
+     if (doc)
+         xmlFreeDoc(doc);
+     if (ctxt)
+         xmlFreeParserCtxt(ctxt);
+
+     return result;
+ #else
+     NO_XML_SUPPORT();
+     return 0;
+ #endif
+ }
+

  /*
   * Evaluate XPath expression and return array of XML values.
*** a/src/include/catalog/pg_proc.h
--- b/src/include/catalog/pg_proc.h
***************
*** 4385,4390 **** DESCR("evaluate XPath expression, with namespaces support");
--- 4385,4393 ----
  DATA(insert OID = 2932 (  xpath         PGNSP PGUID 14 1 0 0 f f f t f i 2 0 143 "25 142" _null_ _null_ _null_ _null_
"selectpg_catalog.xpath($1, $2, ''{}''::pg_catalog.text[])" _null_ _null_ _null_ )); 
  DESCR("evaluate XPath expression");

+ DATA(insert OID = 3037 (  xml_is_well_formed    PGNSP PGUID 12 1 0 0 f f f t f i 1 0 16 "25" _null_ _null_ _null_
_null_xml_is_well_formed _null_ _null_ _null_ )); 
+ DESCR("determine if a text fragment is well formed XML");
+
  /* uuid */
  DATA(insert OID = 2952 (  uuid_in           PGNSP PGUID 12 1 0 0 f f f t f i 1 0 2950 "2275" _null_ _null_ _null_
_null_uuid_in _null_ _null_ _null_ )); 
  DESCR("I/O");
*** a/src/include/utils/xml.h
--- b/src/include/utils/xml.h
***************
*** 46,51 **** extern Datum query_to_xmlschema(PG_FUNCTION_ARGS);
--- 46,52 ----
  extern Datum cursor_to_xmlschema(PG_FUNCTION_ARGS);
  extern Datum table_to_xml_and_xmlschema(PG_FUNCTION_ARGS);
  extern Datum query_to_xml_and_xmlschema(PG_FUNCTION_ARGS);
+ extern Datum xml_is_well_formed(PG_FUNCTION_ARGS);

  extern Datum schema_to_xml(PG_FUNCTION_ARGS);
  extern Datum schema_to_xmlschema(PG_FUNCTION_ARGS);
*** a/src/test/regress/expected/xml.out
--- b/src/test/regress/expected/xml.out
***************
*** 502,504 **** SELECT xpath('//b', '<a>one <b>two</b> three <b>etc</b></a>');
--- 502,565 ----
   {<b>two</b>,<b>etc</b>}
  (1 row)

+ -- Test xml_is_well_formed
+ SELECT xml_is_well_formed('<>');
+  xml_is_well_formed
+ --------------------
+  f
+ (1 row)
+
+ SELECT xml_is_well_formed('abc');
+  xml_is_well_formed
+ --------------------
+  t
+ (1 row)
+
+ SELECT xml_is_well_formed('<abc/>');
+  xml_is_well_formed
+ --------------------
+  t
+ (1 row)
+
+ SELECT xml_is_well_formed('<foo>bar</foo>');
+  xml_is_well_formed
+ --------------------
+  t
+ (1 row)
+
+ SELECT xml_is_well_formed('<foo>bar</foo');
+  xml_is_well_formed
+ --------------------
+  f
+ (1 row)
+
+ SELECT xml_is_well_formed('<foo><bar>baz</foo>');
+  xml_is_well_formed
+ --------------------
+  f
+ (1 row)
+
+ SELECT xml_is_well_formed('<local:data xmlns:local="http://127.0.0.1"><local:piece id="1">number
one</local:piece><local:pieceid="2" /></local:data>'); 
+  xml_is_well_formed
+ --------------------
+  t
+ (1 row)
+
+ SELECT xml_is_well_formed('<foo>bar</foo>') AND '<foo>bar</foo>' IS DOCUMENT;
+  ?column?
+ ----------
+  t
+ (1 row)
+
+ SELECT xml_is_well_formed('<foo>bar</foo>baz') AND '<foo>bar</foo>baz' IS NOT DOCUMENT;
+  ?column?
+ ----------
+  t
+ (1 row)
+
+ SELECT xml_is_well_formed('<foo>bar</foo><bar>foo</bar>') AND '<foo>bar</foo><bar>foo</bar>' IS NOT DOCUMENT;
+  ?column?
+ ----------
+  t
+ (1 row)
+
*** a/src/test/regress/sql/xml.sql
--- b/src/test/regress/sql/xml.sql
***************
*** 163,165 **** SELECT xpath('', '<!-- error -->');
--- 163,179 ----
  SELECT xpath('//text()', '<local:data xmlns:local="http://127.0.0.1"><local:piece id="1">number
one</local:piece><local:pieceid="2" /></local:data>'); 
  SELECT xpath('//loc:piece/@id', '<local:data xmlns:local="http://127.0.0.1"><local:piece id="1">number
one</local:piece><local:pieceid="2" /></local:data>', ARRAY[ARRAY['loc', 'http://127.0.0.1']]); 
  SELECT xpath('//b', '<a>one <b>two</b> three <b>etc</b></a>');
+
+ -- Test xml_is_well_formed
+
+ SELECT xml_is_well_formed('<>');
+ SELECT xml_is_well_formed('abc');
+ SELECT xml_is_well_formed('<abc/>');
+ SELECT xml_is_well_formed('<foo>bar</foo>');
+ SELECT xml_is_well_formed('<foo>bar</foo');
+ SELECT xml_is_well_formed('<foo><bar>baz</foo>');
+ SELECT xml_is_well_formed('<local:data xmlns:local="http://127.0.0.1"><local:piece id="1">number
one</local:piece><local:pieceid="2" /></local:data>'); 
+ SELECT xml_is_well_formed('<foo>bar</foo>') AND '<foo>bar</foo>' IS DOCUMENT;
+ SELECT xml_is_well_formed('<foo>bar</foo>baz') AND '<foo>bar</foo>baz' IS NOT DOCUMENT;
+ SELECT xml_is_well_formed('<foo>bar</foo><bar>foo</bar>') AND '<foo>bar</foo><bar>foo</bar>' IS NOT DOCUMENT;
+

pgsql-hackers by date:

Previous
From: Tim Landscheidt
Date:
Subject: Re: cvs to git migration - keywords
Next
From: Jan Urbański
Date:
Subject: Re: Python Interface Hacking