by Peter T Mount <peter@retep.org.uk>
xml -
+ Storing XML in PostgreSQL (obsolete version)
+ by John Gray <jgray@azuli.co.uk>
+
+xml2 -
Storing XML in PostgreSQL
by John Gray <jgray@azuli.co.uk>
--- /dev/null
+PGXML TODO List
+===============
+
+Some of these items still require much more thought! Since the first
+release, the XPath support has improved (because I'm no longer using a
+homemade algorithm!).
+
+1. Performance considerations
+
+At present each document is parsed to produce the DOM tree on every query.
+
+Pros:
+ Easy
+ No persistent memory or storage allocation for parsed trees
+ (libxml docs suggest representation of a document might
+ be 4 times the size of the text)
+
+Cons:
+ Slow/ CPU intensive to parse.
+ Makes it difficult for PLs to apply libxml manipulations to create
+ new documents or amend existing ones.
+
+
+2. XQuery
+
+I'm not sure if the addition of XQuery would be best as a function or
+as a new front-end parser. This is one to think about, but with a
+decent implementation of XPath, one of the prerequisites is covered.
+
+3. DOM Interfaces
+
+Expose more aspects of the DOM to user functions/ PLs. This would
+allow a procedure in a PL to run some queries and then use exposed
+interfaces to libxml to create an XML document out of the query
+results. I accept the argument that this might be more properly
+performed on the client side.
+
+4. Returning sets of documents from XPath queries.
+
+Although the current implementation allows you to amalgamate the
+returned results into a single document, it's quite possible that
+you'd like to use the returned set of nodes as a source for FROM.
+
+Is there a good way to optimise/index the results of certain XPath
+operations to make them faster?:
+
+select docid, pgxml_xpath(document,'//site/location/text()','','') as location
+where pgxml_xpath(document,'//site/name/text()','','') = 'Church Farm';
+
+and with multiple element occurences in a document?
+
+select d.docid, pgxml_xpath(d.document,'//site/location/text()','','')
+from docstore d,
+pgxml_xpaths('docstore','document','//feature/type/text()','docid') ft
+where ft.key = d.docid and ft.value ='Limekiln';
+
+pgxml_xpaths params are relname, attrname, xpath, returnkey. It would
+return a set of two-element tuples (key,value) consisting of the value of
+returnkey, and the cdata value of the xpath. The XML document would be
+defined by relname and attrname.
+
+The pgxml_xpaths function could be the basis of a functional index,
+which could speed up the above query very substantially, working
+through the normal query planner mechanism.
+
+5. Return type support.
+
+Better support for returning e.g. numeric or boolean values. I need to
+get to grips with the returned data from libxml first.
+
+
+John Gray <jgray@azuli.co.uk> 16 August 2001
+
+
+
+
+
+
--- /dev/null
+/********************************************************
+ * Interface code to parse an XML document using expat
+ ********************************************************/
+
+#include "postgres.h"
+#include "fmgr.h"
+
+#include "expat.h"
+#include "pgxml.h"
+
+/* Memory management - we make expat use standard pg MM */
+
+XML_Memory_Handling_Suite mhs;
+
+/* passthrough functions (palloc is a macro) */
+
+static void *
+pgxml_palloc(size_t size)
+{
+ return palloc(size);
+}
+
+static void *
+pgxml_repalloc(void *ptr, size_t size)
+{
+ return repalloc(ptr, size);
+}
+
+static void
+pgxml_pfree(void *ptr)
+{
+ return pfree(ptr);
+}
+
+static void
+pgxml_mhs_init()
+{
+ mhs.malloc_fcn = pgxml_palloc;
+ mhs.realloc_fcn = pgxml_repalloc;
+ mhs.free_fcn = pgxml_pfree;
+}
+
+static void
+pgxml_handler_init()
+{
+ /*
+ * This code should set up the relevant handlers from user-supplied
+ * settings. Quite how these settings are made is another matter :)
+ */
+}
+
+/* Returns true if document is well-formed */
+
+PG_FUNCTION_INFO_V1(pgxml_parse);
+
+Datum
+pgxml_parse(PG_FUNCTION_ARGS)
+{
+ /* called as pgxml_parse(document) */
+ XML_Parser p;
+ text *t = PG_GETARG_TEXT_P(0); /* document buffer */
+ int32 docsize = VARSIZE(t) - VARHDRSZ;
+
+ pgxml_mhs_init();
+
+ pgxml_handler_init();
+
+ p = XML_ParserCreate_MM(NULL, &mhs, NULL);
+ if (!p)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
+ errmsg("could not create expat parser")));
+ PG_RETURN_NULL(); /* seems appropriate if we couldn't parse */
+ }
+
+ if (!XML_Parse(p, (char *) VARDATA(t), docsize, 1))
+ {
+ /*
+ * elog(WARNING, "Parse error at line %d:%s",
+ * XML_GetCurrentLineNumber(p),
+ * XML_ErrorString(XML_GetErrorCode(p)));
+ */
+ XML_ParserFree(p);
+ PG_RETURN_BOOL(false);
+ }
+
+ XML_ParserFree(p);
+ PG_RETURN_BOOL(true);
+}
+
+/* XPath handling functions */
+
+/* XPath support here is for a very skeletal kind of XPath!
+ It was easy to program though... */
+
+/* This first is the core function that builds a result set. The
+ actual functions called by the user manipulate that result set
+ in various ways.
+*/
+
+static XPath_Results *
+build_xpath_results(text *doc, text *pathstr)
+{
+ XPath_Results *xpr;
+ char *res;
+ pgxml_udata *udata;
+ XML_Parser p;
+ int32 docsize;
+
+ xpr = (XPath_Results *) palloc((sizeof(XPath_Results)));
+ memset((void *) xpr, 0, sizeof(XPath_Results));
+ xpr->rescount = 0;
+
+ docsize = VARSIZE(doc) - VARHDRSZ;
+
+ /* res isn't going to be the real return type, it is just a buffer */
+
+ res = (char *) palloc(docsize);
+ memset((void *) res, 0, docsize);
+
+ xpr->resbuf = res;
+
+ udata = (pgxml_udata *) palloc((sizeof(pgxml_udata)));
+ memset((void *) udata, 0, sizeof(pgxml_udata));
+
+ udata->currentpath[0] = '\0';
+ udata->textgrab = 0;
+
+ udata->path = (char *) palloc(VARSIZE(pathstr));
+ memcpy(udata->path, VARDATA(pathstr), VARSIZE(pathstr) - VARHDRSZ);
+
+ udata->path[VARSIZE(pathstr) - VARHDRSZ] = '\0';
+
+ udata->resptr = res;
+ udata->reslen = 0;
+
+ udata->xpres = xpr;
+
+ /* Now fire up the parser */
+ pgxml_mhs_init();
+
+ p = XML_ParserCreate_MM(NULL, &mhs, NULL);
+ if (!p)
+ {
+ ereport(ERROR,
+ (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
+ errmsg("could not create expat parser")));
+ pfree(xpr);
+ pfree(udata->path);
+ pfree(udata);
+ pfree(res);
+ return NULL;
+ }
+ XML_SetUserData(p, (void *) udata);
+
+ /* Set the handlers */
+
+ XML_SetElementHandler(p, pgxml_starthandler, pgxml_endhandler);
+ XML_SetCharacterDataHandler(p, pgxml_charhandler);
+
+ if (!XML_Parse(p, (char *) VARDATA(doc), docsize, 1))
+ {
+ /*
+ * elog(WARNING, "Parse error at line %d:%s",
+ * XML_GetCurrentLineNumber(p),
+ * XML_ErrorString(XML_GetErrorCode(p)));
+ */
+ XML_ParserFree(p);
+ pfree(xpr);
+ pfree(udata->path);
+ pfree(udata);
+
+ return NULL;
+ }
+
+ pfree(udata->path);
+ pfree(udata);
+ XML_ParserFree(p);
+ return xpr;
+}
+
+
+PG_FUNCTION_INFO_V1(pgxml_xpath);
+
+Datum
+pgxml_xpath(PG_FUNCTION_ARGS)
+{
+ /* called as pgxml_xpath(document,pathstr, index) for the moment */
+
+ XPath_Results *xpresults;
+ text *restext;
+
+ text *t = PG_GETARG_TEXT_P(0); /* document buffer */
+ text *t2 = PG_GETARG_TEXT_P(1);
+ int32 ind = PG_GETARG_INT32(2) - 1;
+
+ xpresults = build_xpath_results(t, t2);
+
+ /*
+ * This needs to be changed depending on the mechanism for returning
+ * our set of results.
+ */
+
+ if (xpresults == NULL) /* parse error (not WF or parser failure) */
+ PG_RETURN_NULL();
+
+ if (ind >= (xpresults->rescount))
+ PG_RETURN_NULL();
+
+ restext = (text *) palloc(xpresults->reslens[ind] + VARHDRSZ);
+ memcpy(VARDATA(restext), xpresults->results[ind], xpresults->reslens[ind]);
+
+ VARATT_SIZEP(restext) = xpresults->reslens[ind] + VARHDRSZ;
+
+ pfree(xpresults->resbuf);
+ pfree(xpresults);
+
+ PG_RETURN_TEXT_P(restext);
+}
+
+
+static void
+pgxml_pathcompare(void *userData)
+{
+ char *matchpos;
+
+ matchpos = strstr(UD->currentpath, UD->path);
+
+ if (matchpos == NULL)
+ { /* Should we have more logic here ? */
+ if (UD->textgrab)
+ {
+ UD->textgrab = 0;
+ pgxml_finalisegrabbedtext(userData);
+ }
+ return;
+ }
+
+ /*
+ * OK, we have a match of some sort. Now we need to check that our
+ * match is anchored to the *end* of the string AND that it is
+ * immediately preceded by a '/'
+ */
+
+ /*
+ * This test wouldn't work if strlen (UD->path) overran the length of
+ * the currentpath, but that's not possible because we got a match!
+ */
+
+ if ((matchpos + strlen(UD->path))[0] == '\0')
+ {
+ if ((UD->path)[0] == '/')
+ {
+ if (matchpos == UD->currentpath)
+ UD->textgrab = 1;
+ }
+ else
+ {
+ if ((matchpos - 1)[0] == '/')
+ UD->textgrab = 1;
+ }
+ }
+}
+
+static void
+pgxml_starthandler(void *userData, const XML_Char * name,
+ const XML_Char ** atts)
+{
+
+ char sepstr[] = "/";
+
+ if ((strlen(name) + strlen(UD->currentpath)) > MAXPATHLENGTH - 2)
+ elog(WARNING, "path too long");
+ else
+ {
+ strncat(UD->currentpath, sepstr, 1);
+ strcat(UD->currentpath, name);
+ }
+ if (UD->textgrab)
+ {
+ /*
+ * Depending on user preference, should we "reconstitute" the
+ * element into the result text?
+ */
+ }
+ else
+ pgxml_pathcompare(userData);
+}
+
+static void
+pgxml_endhandler(void *userData, const XML_Char * name)
+{
+ /*
+ * Start by removing the current element off the end of the
+ * currentpath
+ */
+
+ char *sepptr;
+
+ sepptr = strrchr(UD->currentpath, '/');
+ if (sepptr == NULL)
+ {
+ /* internal error */
+ elog(ERROR, "did not find '/'");
+ sepptr = UD->currentpath;
+ }
+ if (strcmp(name, sepptr + 1) != 0)
+ {
+ elog(WARNING, "wanted [%s], got [%s]", sepptr, name);
+ /* unmatched entry, so do nothing */
+ }
+ else
+ {
+ sepptr[0] = '\0'; /* Chop that element off the end */
+ }
+
+ if (UD->textgrab)
+ pgxml_pathcompare(userData);
+
+}
+
+static void
+pgxml_charhandler(void *userData, const XML_Char * s, int len)
+{
+ if (UD->textgrab)
+ {
+ if (len > 0)
+ {
+ memcpy(UD->resptr, s, len);
+ UD->resptr += len;
+ UD->reslen += len;
+ }
+ }
+}
+
+/* Should I be using PG list types here? */
+
+static void
+pgxml_finalisegrabbedtext(void *userData)
+{
+ /* In res/reslen, we have a single result. */
+ UD->xpres->results[UD->xpres->rescount] = UD->resptr - UD->reslen;
+ UD->xpres->reslens[UD->xpres->rescount] = UD->reslen;
+ UD->reslen = 0;
+ UD->xpres->rescount++;
+
+ /*
+ * This effectively concatenates all the results together but we do
+ * know where one ends and the next begins
+ */
+}
--- /dev/null
+/* Header for pg xml parser interface */
+
+static void *pgxml_palloc(size_t size);
+static void *pgxml_repalloc(void *ptr, size_t size);
+static void pgxml_pfree(void *ptr);
+static void pgxml_mhs_init();
+static void pgxml_handler_init();
+Datum pgxml_parse(PG_FUNCTION_ARGS);
+Datum pgxml_xpath(PG_FUNCTION_ARGS);
+static void pgxml_starthandler(void *userData, const XML_Char * name,
+ const XML_Char ** atts);
+static void pgxml_endhandler(void *userData, const XML_Char * name);
+static void pgxml_charhandler(void *userData, const XML_Char * s, int len);
+static void pgxml_pathcompare(void *userData);
+static void pgxml_finalisegrabbedtext(void *userData);
+
+#define MAXPATHLENGTH 512
+#define MAXRESULTS 100
+
+
+typedef struct
+{
+ int rescount;
+ char *results[MAXRESULTS];
+ int32 reslens[MAXRESULTS];
+ char *resbuf; /* pointer to the result buffer for pfree */
+} XPath_Results;
+
+
+
+typedef struct
+{
+ char currentpath[MAXPATHLENGTH];
+ char *path;
+ int textgrab;
+ char *resptr;
+ int32 reslen;
+ XPath_Results *xpres;
+} pgxml_udata;
+
+
+#define UD ((pgxml_udata *) userData)
--- /dev/null
+/* Parser interface for DOM-based parser (libxml) rather than
+ stream-based SAX-type parser */
+
+#include "postgres.h"
+#include "fmgr.h"
+
+/* libxml includes */
+
+#include <libxml/xpath.h>
+#include <libxml/tree.h>
+#include <libxml/xmlmemory.h>
+
+/* declarations */
+
+static void *pgxml_palloc(size_t size);
+static void *pgxml_repalloc(void *ptr, size_t size);
+static void pgxml_pfree(void *ptr);
+static char *pgxml_pstrdup(const char *string);
+
+static void pgxml_parser_init();
+
+static xmlChar *pgxmlNodeSetToText(xmlNodeSetPtr nodeset, xmlDocPtr doc,
+ xmlChar * toptagname, xmlChar * septagname,
+ int format);
+
+static xmlChar *pgxml_texttoxmlchar(text *textstring);
+
+
+Datum pgxml_parse(PG_FUNCTION_ARGS);
+Datum pgxml_xpath(PG_FUNCTION_ARGS);
+
+/* memory handling passthrough functions (e.g. palloc, pstrdup are
+ currently macros, and the others might become so...) */
+
+static void *
+pgxml_palloc(size_t size)
+{
+ return palloc(size);
+}
+
+static void *
+pgxml_repalloc(void *ptr, size_t size)
+{
+ return repalloc(ptr, size);
+}
+
+static void
+pgxml_pfree(void *ptr)
+{
+ return pfree(ptr);
+}
+
+static char *
+pgxml_pstrdup(const char *string)
+{
+ return pstrdup(string);
+}
+
+static void
+pgxml_parser_init()
+{
+ /*
+ * This code should also set parser settings from user-supplied info.
+ * Quite how these settings are made is another matter :)
+ */
+
+ xmlMemSetup(pgxml_pfree, pgxml_palloc, pgxml_repalloc, pgxml_pstrdup);
+ xmlInitParser();
+
+}
+
+
+/* Returns true if document is well-formed */
+
+PG_FUNCTION_INFO_V1(pgxml_parse);
+
+Datum
+pgxml_parse(PG_FUNCTION_ARGS)
+{
+ /* called as pgxml_parse(document) */
+ xmlDocPtr doctree;
+ text *t = PG_GETARG_TEXT_P(0); /* document buffer */
+ int32 docsize = VARSIZE(t) - VARHDRSZ;
+
+ pgxml_parser_init();
+
+ doctree = xmlParseMemory((char *) VARDATA(t), docsize);
+ if (doctree == NULL)
+ {
+ xmlCleanupParser();
+ PG_RETURN_BOOL(false); /* i.e. not well-formed */
+ }
+ xmlCleanupParser();
+ xmlFreeDoc(doctree);
+ PG_RETURN_BOOL(true);
+}
+
+static xmlChar
+*
+pgxmlNodeSetToText(xmlNodeSetPtr nodeset,
+ xmlDocPtr doc,
+ xmlChar * toptagname,
+ xmlChar * septagname,
+ int format)
+{
+ /* Function translates a nodeset into a text representation */
+
+ /*
+ * iterates over each node in the set and calls xmlNodeDump to write
+ * it to an xmlBuffer -from which an xmlChar * string is returned.
+ */
+ /* each representation is surrounded by <tagname> ... </tagname> */
+ /* if format==0, add a newline between nodes?? */
+
+ xmlBufferPtr buf;
+ xmlChar *result;
+ int i;
+
+ buf = xmlBufferCreate();
+
+ if ((toptagname != NULL) && (xmlStrlen(toptagname) > 0))
+ {
+ xmlBufferWriteChar(buf, "<");
+ xmlBufferWriteCHAR(buf, toptagname);
+ xmlBufferWriteChar(buf, ">");
+ }
+ if (nodeset != NULL)
+ {
+ for (i = 0; i < nodeset->nodeNr; i++)
+ {
+ if ((septagname != NULL) && (xmlStrlen(septagname) > 0))
+ {
+ xmlBufferWriteChar(buf, "<");
+ xmlBufferWriteCHAR(buf, septagname);
+ xmlBufferWriteChar(buf, ">");
+ }
+ xmlNodeDump(buf, doc, nodeset->nodeTab[i], 1, (format == 2));
+
+ if ((septagname != NULL) && (xmlStrlen(septagname) > 0))
+ {
+ xmlBufferWriteChar(buf, "</");
+ xmlBufferWriteCHAR(buf, septagname);
+ xmlBufferWriteChar(buf, ">");
+ }
+ if (format)
+ xmlBufferWriteChar(buf, "\n");
+ }
+ }
+
+ if ((toptagname != NULL) && (xmlStrlen(toptagname) > 0))
+ {
+ xmlBufferWriteChar(buf, "</");
+ xmlBufferWriteCHAR(buf, toptagname);
+ xmlBufferWriteChar(buf, ">");
+ }
+ result = xmlStrdup(buf->content);
+ xmlBufferFree(buf);
+ return result;
+}
+
+static xmlChar *
+pgxml_texttoxmlchar(text *textstring)
+{
+ xmlChar *res;
+ int32 txsize;
+
+ txsize = VARSIZE(textstring) - VARHDRSZ;
+ res = (xmlChar *) palloc(txsize + 1);
+ memcpy((char *) res, VARDATA(textstring), txsize);
+ res[txsize] = '\0';
+ return res;
+}
+
+
+PG_FUNCTION_INFO_V1(pgxml_xpath);
+
+Datum
+pgxml_xpath(PG_FUNCTION_ARGS)
+{
+ xmlDocPtr doctree;
+ xmlXPathContextPtr ctxt;
+ xmlXPathObjectPtr res;
+ xmlChar *xpath,
+ *xpresstr,
+ *toptag,
+ *septag;
+ xmlXPathCompExprPtr comppath;
+
+ int32 docsize,
+ ressize;
+ text *t,
+ *xpres;
+
+ t = PG_GETARG_TEXT_P(0); /* document buffer */
+ xpath = pgxml_texttoxmlchar(PG_GETARG_TEXT_P(1)); /* XPath expression */
+ toptag = pgxml_texttoxmlchar(PG_GETARG_TEXT_P(2));
+ septag = pgxml_texttoxmlchar(PG_GETARG_TEXT_P(3));
+
+ docsize = VARSIZE(t) - VARHDRSZ;
+
+ pgxml_parser_init();
+
+ doctree = xmlParseMemory((char *) VARDATA(t), docsize);
+ if (doctree == NULL)
+ { /* not well-formed */
+ xmlCleanupParser();
+ PG_RETURN_NULL();
+ }
+
+ ctxt = xmlXPathNewContext(doctree);
+ ctxt->node = xmlDocGetRootElement(doctree);
+
+ /* compile the path */
+ comppath = xmlXPathCompile(xpath);
+ if (comppath == NULL)
+ {
+ elog(WARNING, "XPath syntax error");
+ xmlFreeDoc(doctree);
+ pfree((void *) xpath);
+ xmlCleanupParser();
+ PG_RETURN_NULL();
+ }
+
+ /* Now evaluate the path expression. */
+ res = xmlXPathCompiledEval(comppath, ctxt);
+ xmlXPathFreeCompExpr(comppath);
+
+ if (res == NULL)
+ {
+ xmlFreeDoc(doctree);
+ pfree((void *) xpath);
+ xmlCleanupParser();
+ PG_RETURN_NULL(); /* seems appropriate */
+ }
+ /* now we dump this node, ?surrounding by tags? */
+ /* To do this, we look first at the type */
+ switch (res->type)
+ {
+ case XPATH_NODESET:
+ xpresstr = pgxmlNodeSetToText(res->nodesetval,
+ doctree,
+ toptag, septag, 0);
+ break;
+ case XPATH_STRING:
+ xpresstr = xmlStrdup(res->stringval);
+ break;
+ default:
+ elog(WARNING, "Unsupported XQuery result: %d", res->type);
+ xpresstr = xmlStrdup("<unsupported/>");
+ }
+
+
+ /* Now convert this result back to text */
+ ressize = strlen(xpresstr);
+ xpres = (text *) palloc(ressize + VARHDRSZ);
+ memcpy(VARDATA(xpres), xpresstr, ressize);
+ VARATT_SIZEP(xpres) = ressize + VARHDRSZ;
+
+ /* Free various storage */
+ xmlFreeDoc(doctree);
+ pfree((void *) xpath);
+ xmlFree(xpresstr);
+ xmlCleanupParser();
+ PG_RETURN_TEXT_P(xpres);
+}
--- /dev/null
+-- SQL for XML parser
+
+-- Adjust this setting to control where the objects get created.
+SET search_path TO public;
+
+CREATE OR REPLACE FUNCTION pgxml_parse(text) RETURNS boolean
+ AS 'MODULE_PATHNAME' LANGUAGE c STRICT;
+
+CREATE OR REPLACE FUNCTION pgxml_xpath(text, text, text, text) RETURNS text
+ AS 'MODULE_PATHNAME' LANGUAGE c STRICT;