[PATCH] Re: Issue: Deprecation of the XML2 module 'xml_is_well_formed' function - Mailing list pgsql-hackers
From | Mike Fowler |
---|---|
Subject | [PATCH] Re: Issue: Deprecation of the XML2 module 'xml_is_well_formed' function |
Date | |
Msg-id | 4C349F44.9020505@mlfowler.com Whole thread Raw |
In response to | Re: Issue: Deprecation of the XML2 module 'xml_is_well_formed' function (Peter Eisentraut <peter_e@gmx.net>) |
Responses |
Re: [PATCH] Re: Issue: Deprecation of the XML2 module
'xml_is_well_formed' function
|
List | pgsql-hackers |
Peter Eisentraut wrote: > On lör, 2010-07-03 at 09:26 +0100, Mike Fowler wrote: > >> What I will do >> instead is implement the xml_is_well_formed function and get a patch >> out in the next day or two. >> > > That sounds very useful. > Here's the patch to add the 'xml_is_well_formed' function. Paraphrasing the SGML the syntax is: |xml_is_well_formed|(/text/) The function |xml_is_well_formed| evaluates whether the /text/ is well formed XML content, returning a boolean. I've done some tests (included in the patch) with tables containing a mixture of well formed documents and content and the function is happily returning the expected result. Combining with IS (NOT) DOCUMENT is working nicely for pulling out content or documents from a table of text. Unless I missed something in the original correspondence, I think this patch will solve the issue. Regards, -- Mike Fowler Registered Linux user: 379787 *** a/doc/src/sgml/func.sgml --- b/doc/src/sgml/func.sgml *************** *** 8554,8562 **** SELECT xmlagg(x) FROM (SELECT * FROM test ORDER BY y DESC) AS tab; ]]></screen> </para> </sect3> <sect3> ! <title>XML Predicates</title> <indexterm> <primary>IS DOCUMENT</primary> --- 8554,8566 ---- ]]></screen> </para> </sect3> + </sect2> + + <sect2> + <title>XML Predicates</title> <sect3> ! <title>IS DOCUMENT</title> <indexterm> <primary>IS DOCUMENT</primary> *************** *** 8574,8579 **** SELECT xmlagg(x) FROM (SELECT * FROM test ORDER BY y DESC) AS tab; --- 8578,8653 ---- between documents and content fragments. </para> </sect3> + + <sect3> + <title>xml_is_well_formed</title> + + <indexterm> + <primary>xml_is_well_formed</primary> + <secondary>well formed</secondary> + </indexterm> + + <synopsis> + <function>xml_is_well_formed</function>(<replaceable>text</replaceable>) + </synopsis> + + <para> + The function <function>xml_is_well_formed</function> evaluates whether + the <replaceable>text</replaceable> is well formed XML content, returning + a boolean. + </para> + <para> + Example: + <screen><![CDATA[ + SELECT xml_is_well_formed('<foo>bar</foo>'); + xml_is_well_formed + -------------------- + t + (1 row) + + SELECT xml_is_well_formed('<foo>bar</foo'); + xml_is_well_formed + -------------------- + f + (1 row) + ]]></screen> + </para> + <para> + This function can be combined with the IS DOCUMENT predicate to prevent + invalid XML content errors from occuring in queries. For example, given a + table that may have rows with invalid XML mixed in with rows of valid + XML, <function>xml_is_well_formed</function> can be used to filter out all + the invalid rows. + </para> + <para> + Example: + <screen><![CDATA[ + SELECT * FROM mixed; + data + ------------------------------ + <foo>bar</foo> + <foo>bar</foo + <foo>bar</foo><bar>foo</bar> + <foo>bar</foo><bar>foo</bar + (4 rows) + + SELECT COUNT(data) FROM mixed WHERE data::xml IS DOCUMENT; + ERROR: invalid XML content + DETAIL: Entity: line 1: parser error : expected '>' + <foo>bar</foo + ^ + Entity: line 1: parser error : chunk is not well balanced + <foo>bar</foo + ^ + + SELECT COUNT(data) FROM mixed WHERE xml_is_well_formed(data) AND data::xml IS DOCUMENT; + count + ------- + 1 + (1 row) + ]]></screen> + </para> + </sect3> </sect2> <sect2 id="functions-xml-processing"> *** a/src/backend/utils/adt/xml.c --- b/src/backend/utils/adt/xml.c *************** *** 3293,3298 **** xml_xmlnodetoxmltype(xmlNodePtr cur) --- 3293,3365 ---- } #endif + Datum + xml_is_well_formed(PG_FUNCTION_ARGS) + { + #ifdef USE_LIBXML + text *data = PG_GETARG_TEXT_P(0); + bool result; + int res_code; + int32 len; + const xmlChar *string; + xmlParserCtxtPtr ctxt; + xmlDocPtr doc = NULL; + + len = VARSIZE(data) - VARHDRSZ; + string = xml_text2xmlChar(data); + + /* Start up libxml and its parser (no-ops if already done) */ + pg_xml_init(); + xmlInitParser(); + + ctxt = xmlNewParserCtxt(); + if (ctxt == NULL) + xml_ereport(ERROR, ERRCODE_OUT_OF_MEMORY, + "could not allocate parser context"); + + PG_TRY(); + { + size_t count; + xmlChar *version = NULL; + int standalone = -1; + + res_code = parse_xml_decl(string, &count, &version, NULL, &standalone); + if (res_code != 0) + xml_ereport_by_code(ERROR, ERRCODE_INVALID_XML_CONTENT, + "invalid XML content: invalid XML declaration", + res_code); + + doc = xmlNewDoc(version); + doc->encoding = xmlStrdup((const xmlChar *) "UTF-8"); + doc->standalone = 1; + + res_code = xmlParseBalancedChunkMemory(doc, NULL, NULL, 0, string + count, NULL); + + result = !res_code; + } + PG_CATCH(); + { + if (doc) + xmlFreeDoc(doc); + if (ctxt) + xmlFreeParserCtxt(ctxt); + + PG_RE_THROW(); + } + PG_END_TRY(); + + if (doc) + xmlFreeDoc(doc); + if (ctxt) + xmlFreeParserCtxt(ctxt); + + return result; + #else + NO_XML_SUPPORT(); + return 0; + #endif + } + /* * Evaluate XPath expression and return array of XML values. *** a/src/include/catalog/pg_proc.h --- b/src/include/catalog/pg_proc.h *************** *** 4385,4390 **** DESCR("evaluate XPath expression, with namespaces support"); --- 4385,4393 ---- DATA(insert OID = 2932 ( xpath PGNSP PGUID 14 1 0 0 f f f t f i 2 0 143 "25 142" _null_ _null_ _null_ _null_ "selectpg_catalog.xpath($1, $2, ''{}''::pg_catalog.text[])" _null_ _null_ _null_ )); DESCR("evaluate XPath expression"); + DATA(insert OID = 3037 ( xml_is_well_formed PGNSP PGUID 12 1 0 0 f f f t f i 1 0 16 "25" _null_ _null_ _null_ _null_xml_is_well_formed _null_ _null_ _null_ )); + DESCR("determine if a text fragment is well formed XML"); + /* uuid */ DATA(insert OID = 2952 ( uuid_in PGNSP PGUID 12 1 0 0 f f f t f i 1 0 2950 "2275" _null_ _null_ _null_ _null_uuid_in _null_ _null_ _null_ )); DESCR("I/O"); *** a/src/include/utils/xml.h --- b/src/include/utils/xml.h *************** *** 46,51 **** extern Datum query_to_xmlschema(PG_FUNCTION_ARGS); --- 46,52 ---- extern Datum cursor_to_xmlschema(PG_FUNCTION_ARGS); extern Datum table_to_xml_and_xmlschema(PG_FUNCTION_ARGS); extern Datum query_to_xml_and_xmlschema(PG_FUNCTION_ARGS); + extern Datum xml_is_well_formed(PG_FUNCTION_ARGS); extern Datum schema_to_xml(PG_FUNCTION_ARGS); extern Datum schema_to_xmlschema(PG_FUNCTION_ARGS); *** a/src/test/regress/expected/xml.out --- b/src/test/regress/expected/xml.out *************** *** 502,504 **** SELECT xpath('//b', '<a>one <b>two</b> three <b>etc</b></a>'); --- 502,565 ---- {<b>two</b>,<b>etc</b>} (1 row) + -- Test xml_is_well_formed + SELECT xml_is_well_formed('<>'); + xml_is_well_formed + -------------------- + f + (1 row) + + SELECT xml_is_well_formed('abc'); + xml_is_well_formed + -------------------- + t + (1 row) + + SELECT xml_is_well_formed('<abc/>'); + xml_is_well_formed + -------------------- + t + (1 row) + + SELECT xml_is_well_formed('<foo>bar</foo>'); + xml_is_well_formed + -------------------- + t + (1 row) + + SELECT xml_is_well_formed('<foo>bar</foo'); + xml_is_well_formed + -------------------- + f + (1 row) + + SELECT xml_is_well_formed('<foo><bar>baz</foo>'); + xml_is_well_formed + -------------------- + f + (1 row) + + SELECT xml_is_well_formed('<local:data xmlns:local="http://127.0.0.1"><local:piece id="1">number one</local:piece><local:pieceid="2" /></local:data>'); + xml_is_well_formed + -------------------- + t + (1 row) + + SELECT xml_is_well_formed('<foo>bar</foo>') AND '<foo>bar</foo>' IS DOCUMENT; + ?column? + ---------- + t + (1 row) + + SELECT xml_is_well_formed('<foo>bar</foo>baz') AND '<foo>bar</foo>baz' IS NOT DOCUMENT; + ?column? + ---------- + t + (1 row) + + SELECT xml_is_well_formed('<foo>bar</foo><bar>foo</bar>') AND '<foo>bar</foo><bar>foo</bar>' IS NOT DOCUMENT; + ?column? + ---------- + t + (1 row) + *** a/src/test/regress/sql/xml.sql --- b/src/test/regress/sql/xml.sql *************** *** 163,165 **** SELECT xpath('', '<!-- error -->'); --- 163,179 ---- SELECT xpath('//text()', '<local:data xmlns:local="http://127.0.0.1"><local:piece id="1">number one</local:piece><local:pieceid="2" /></local:data>'); SELECT xpath('//loc:piece/@id', '<local:data xmlns:local="http://127.0.0.1"><local:piece id="1">number one</local:piece><local:pieceid="2" /></local:data>', ARRAY[ARRAY['loc', 'http://127.0.0.1']]); SELECT xpath('//b', '<a>one <b>two</b> three <b>etc</b></a>'); + + -- Test xml_is_well_formed + + SELECT xml_is_well_formed('<>'); + SELECT xml_is_well_formed('abc'); + SELECT xml_is_well_formed('<abc/>'); + SELECT xml_is_well_formed('<foo>bar</foo>'); + SELECT xml_is_well_formed('<foo>bar</foo'); + SELECT xml_is_well_formed('<foo><bar>baz</foo>'); + SELECT xml_is_well_formed('<local:data xmlns:local="http://127.0.0.1"><local:piece id="1">number one</local:piece><local:pieceid="2" /></local:data>'); + SELECT xml_is_well_formed('<foo>bar</foo>') AND '<foo>bar</foo>' IS DOCUMENT; + SELECT xml_is_well_formed('<foo>bar</foo>baz') AND '<foo>bar</foo>baz' IS NOT DOCUMENT; + SELECT xml_is_well_formed('<foo>bar</foo><bar>foo</bar>') AND '<foo>bar</foo><bar>foo</bar>' IS NOT DOCUMENT; +
pgsql-hackers by date: