From 31f4b59a464808ab0fec0ffb2eaa723321ea1af7 Mon Sep 17 00:00:00 2001 From: Bruce Momjian Date: Fri, 5 Mar 2004 03:57:58 +0000 Subject: [PATCH] Move new version of contrib/ xml into xml2, keep old version in /xml. --- contrib/README | 4 + contrib/xml/TODO | 78 +++++++ contrib/xml/pgxml.c | 352 +++++++++++++++++++++++++++++ contrib/xml/pgxml.h | 42 ++++ contrib/xml/pgxml_dom.c | 265 ++++++++++++++++++++++ contrib/xml/pgxml_dom.sql.in | 10 + contrib/{xml => xml2}/Makefile | 0 contrib/{xml => xml2}/README.pgxml | 0 contrib/{xml => xml2}/pgxml.sql.in | 0 contrib/{xml => xml2}/xpath.c | 0 contrib/{xml => xml2}/xslt_proc.c | 0 11 files changed, 751 insertions(+) create mode 100644 contrib/xml/TODO create mode 100644 contrib/xml/pgxml.c create mode 100644 contrib/xml/pgxml.h create mode 100644 contrib/xml/pgxml_dom.c create mode 100644 contrib/xml/pgxml_dom.sql.in rename contrib/{xml => xml2}/Makefile (100%) rename contrib/{xml => xml2}/README.pgxml (100%) rename contrib/{xml => xml2}/pgxml.sql.in (100%) rename contrib/{xml => xml2}/xpath.c (100%) rename contrib/{xml => xml2}/xslt_proc.c (100%) diff --git a/contrib/README b/contrib/README index a8a2c6c968..0071f43b85 100644 --- a/contrib/README +++ b/contrib/README @@ -217,5 +217,9 @@ vacuumlo - by Peter T Mount xml - + Storing XML in PostgreSQL (obsolete version) + by John Gray + +xml2 - Storing XML in PostgreSQL by John Gray diff --git a/contrib/xml/TODO b/contrib/xml/TODO new file mode 100644 index 0000000000..5ddd62a658 --- /dev/null +++ b/contrib/xml/TODO @@ -0,0 +1,78 @@ +PGXML TODO List +=============== + +Some of these items still require much more thought! Since the first +release, the XPath support has improved (because I'm no longer using a +homemade algorithm!). + +1. Performance considerations + +At present each document is parsed to produce the DOM tree on every query. 
+ +Pros: + Easy + No persistent memory or storage allocation for parsed trees + (libxml docs suggest representation of a document might + be 4 times the size of the text) + +Cons: + Slow/ CPU intensive to parse. + Makes it difficult for PLs to apply libxml manipulations to create + new documents or amend existing ones. + + +2. XQuery + +I'm not sure if the addition of XQuery would be best as a function or +as a new front-end parser. This is one to think about, but with a +decent implementation of XPath, one of the prerequisites is covered. + +3. DOM Interfaces + +Expose more aspects of the DOM to user functions/ PLs. This would +allow a procedure in a PL to run some queries and then use exposed +interfaces to libxml to create an XML document out of the query +results. I accept the argument that this might be more properly +performed on the client side. + +4. Returning sets of documents from XPath queries. + +Although the current implementation allows you to amalgamate the +returned results into a single document, it's quite possible that +you'd like to use the returned set of nodes as a source for FROM. + +Is there a good way to optimise/index the results of certain XPath +operations to make them faster?: + +select docid, pgxml_xpath(document,'//site/location/text()','','') as location +where pgxml_xpath(document,'//site/name/text()','','') = 'Church Farm'; + +and with multiple element occurrences in a document? + +select d.docid, pgxml_xpath(d.document,'//site/location/text()','','') +from docstore d, +pgxml_xpaths('docstore','document','//feature/type/text()','docid') ft +where ft.key = d.docid and ft.value ='Limekiln'; + +pgxml_xpaths params are relname, attrname, xpath, returnkey. It would +return a set of two-element tuples (key,value) consisting of the value of +returnkey, and the cdata value of the xpath. The XML document would be +defined by relname and attrname. 
+ +The pgxml_xpaths function could be the basis of a functional index, +which could speed up the above query very substantially, working +through the normal query planner mechanism. + +5. Return type support. + +Better support for returning e.g. numeric or boolean values. I need to +get to grips with the returned data from libxml first. + + +John Gray 16 August 2001 + + + + + + diff --git a/contrib/xml/pgxml.c b/contrib/xml/pgxml.c new file mode 100644 index 0000000000..4d8c3b96bc --- /dev/null +++ b/contrib/xml/pgxml.c @@ -0,0 +1,352 @@ +/******************************************************** + * Interface code to parse an XML document using expat + ********************************************************/ + +#include "postgres.h" +#include "fmgr.h" + +#include "expat.h" +#include "pgxml.h" + +/* Memory management - we make expat use standard pg MM */ + +XML_Memory_Handling_Suite mhs; + +/* passthrough functions (palloc is a macro) */ + +static void * +pgxml_palloc(size_t size) +{ + return palloc(size); +} + +static void * +pgxml_repalloc(void *ptr, size_t size) +{ + return repalloc(ptr, size); +} + +static void +pgxml_pfree(void *ptr) +{ + return pfree(ptr); +} + +static void +pgxml_mhs_init() +{ + mhs.malloc_fcn = pgxml_palloc; + mhs.realloc_fcn = pgxml_repalloc; + mhs.free_fcn = pgxml_pfree; +} + +static void +pgxml_handler_init() +{ + /* + * This code should set up the relevant handlers from user-supplied + * settings. 
Quite how these settings are made is another matter :) + */ +} + +/* Returns true if document is well-formed */ + +PG_FUNCTION_INFO_V1(pgxml_parse); + +Datum +pgxml_parse(PG_FUNCTION_ARGS) +{ + /* called as pgxml_parse(document) */ + XML_Parser p; + text *t = PG_GETARG_TEXT_P(0); /* document buffer */ + int32 docsize = VARSIZE(t) - VARHDRSZ; + + pgxml_mhs_init(); + + pgxml_handler_init(); + + p = XML_ParserCreate_MM(NULL, &mhs, NULL); + if (!p) + { + ereport(ERROR, + (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION), + errmsg("could not create expat parser"))); + PG_RETURN_NULL(); /* seems appropriate if we couldn't parse */ + } + + if (!XML_Parse(p, (char *) VARDATA(t), docsize, 1)) + { + /* + * elog(WARNING, "Parse error at line %d:%s", + * XML_GetCurrentLineNumber(p), + * XML_ErrorString(XML_GetErrorCode(p))); + */ + XML_ParserFree(p); + PG_RETURN_BOOL(false); + } + + XML_ParserFree(p); + PG_RETURN_BOOL(true); +} + +/* XPath handling functions */ + +/* XPath support here is for a very skeletal kind of XPath! + It was easy to program though... */ + +/* This first is the core function that builds a result set. The + actual functions called by the user manipulate that result set + in various ways. 
+*/ + +static XPath_Results * +build_xpath_results(text *doc, text *pathstr) +{ + XPath_Results *xpr; + char *res; + pgxml_udata *udata; + XML_Parser p; + int32 docsize; + + xpr = (XPath_Results *) palloc((sizeof(XPath_Results))); + memset((void *) xpr, 0, sizeof(XPath_Results)); + xpr->rescount = 0; + + docsize = VARSIZE(doc) - VARHDRSZ; + + /* res isn't going to be the real return type, it is just a buffer */ + + res = (char *) palloc(docsize); + memset((void *) res, 0, docsize); + + xpr->resbuf = res; + + udata = (pgxml_udata *) palloc((sizeof(pgxml_udata))); + memset((void *) udata, 0, sizeof(pgxml_udata)); + + udata->currentpath[0] = '\0'; + udata->textgrab = 0; + + udata->path = (char *) palloc(VARSIZE(pathstr)); + memcpy(udata->path, VARDATA(pathstr), VARSIZE(pathstr) - VARHDRSZ); + + udata->path[VARSIZE(pathstr) - VARHDRSZ] = '\0'; + + udata->resptr = res; + udata->reslen = 0; + + udata->xpres = xpr; + + /* Now fire up the parser */ + pgxml_mhs_init(); + + p = XML_ParserCreate_MM(NULL, &mhs, NULL); + if (!p) + { + ereport(ERROR, + (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION), + errmsg("could not create expat parser"))); + pfree(xpr); + pfree(udata->path); + pfree(udata); + pfree(res); + return NULL; + } + XML_SetUserData(p, (void *) udata); + + /* Set the handlers */ + + XML_SetElementHandler(p, pgxml_starthandler, pgxml_endhandler); + XML_SetCharacterDataHandler(p, pgxml_charhandler); + + if (!XML_Parse(p, (char *) VARDATA(doc), docsize, 1)) + { + /* + * elog(WARNING, "Parse error at line %d:%s", + * XML_GetCurrentLineNumber(p), + * XML_ErrorString(XML_GetErrorCode(p))); + */ + XML_ParserFree(p); + pfree(xpr); + pfree(udata->path); + pfree(udata); + pfree(res); /* the result buffer is not handed back on failure, so release it here as the parser-creation failure path does */ + return NULL; + } + + pfree(udata->path); + pfree(udata); + XML_ParserFree(p); + return xpr; +} + + +PG_FUNCTION_INFO_V1(pgxml_xpath); + +Datum +pgxml_xpath(PG_FUNCTION_ARGS) +{ + /* called as pgxml_xpath(document,pathstr, index) for the moment */ + + XPath_Results *xpresults; + text *restext; + + text *t = 
PG_GETARG_TEXT_P(0); /* document buffer */ + text *t2 = PG_GETARG_TEXT_P(1); + int32 ind = PG_GETARG_INT32(2) - 1; + + xpresults = build_xpath_results(t, t2); + + /* + * This needs to be changed depending on the mechanism for returning + * our set of results. + */ + + if (xpresults == NULL) /* parse error (not WF or parser failure) */ + PG_RETURN_NULL(); + + if (ind < 0 || ind >= (xpresults->rescount)) /* the SQL-level index is 1-based; 0 or a negative value must not index before the arrays */ + PG_RETURN_NULL(); + + restext = (text *) palloc(xpresults->reslens[ind] + VARHDRSZ); + memcpy(VARDATA(restext), xpresults->results[ind], xpresults->reslens[ind]); + + VARATT_SIZEP(restext) = xpresults->reslens[ind] + VARHDRSZ; + + pfree(xpresults->resbuf); + pfree(xpresults); + + PG_RETURN_TEXT_P(restext); +} + + +static void +pgxml_pathcompare(void *userData) +{ + char *matchpos; + + matchpos = strstr(UD->currentpath, UD->path); + + if (matchpos == NULL) + { /* Should we have more logic here ? */ + if (UD->textgrab) + { + UD->textgrab = 0; + pgxml_finalisegrabbedtext(userData); + } + return; + } + + /* + * OK, we have a match of some sort. Now we need to check that our + * match is anchored to the *end* of the string AND that it is + * immediately preceded by a '/' + */ + + /* + * This test wouldn't work if strlen (UD->path) overran the length of + * the currentpath, but that's not possible because we got a match! + */ + + if ((matchpos + strlen(UD->path))[0] == '\0') + { + if ((UD->path)[0] == '/') + { + if (matchpos == UD->currentpath) + UD->textgrab = 1; + } + else + { + if ((matchpos - 1)[0] == '/') + UD->textgrab = 1; + } + } +} + +static void +pgxml_starthandler(void *userData, const XML_Char * name, + const XML_Char ** atts) +{ + + char sepstr[] = "/"; + + if ((strlen(name) + strlen(UD->currentpath)) > MAXPATHLENGTH - 2) + elog(WARNING, "path too long"); + else + { + strncat(UD->currentpath, sepstr, 1); + strcat(UD->currentpath, name); + } + if (UD->textgrab) + { + /* + * Depending on user preference, should we "reconstitute" the + * element into the result text? 
+ */ + } + else + pgxml_pathcompare(userData); +} + +static void +pgxml_endhandler(void *userData, const XML_Char * name) +{ + /* + * Start by removing the current element off the end of the + * currentpath + */ + + char *sepptr; + + sepptr = strrchr(UD->currentpath, '/'); + if (sepptr == NULL) + { + /* internal error */ + elog(ERROR, "did not find '/'"); + sepptr = UD->currentpath; + } + if (strcmp(name, sepptr + 1) != 0) + { + elog(WARNING, "wanted [%s], got [%s]", sepptr, name); + /* unmatched entry, so do nothing */ + } + else + { + sepptr[0] = '\0'; /* Chop that element off the end */ + } + + if (UD->textgrab) + pgxml_pathcompare(userData); + +} + +static void +pgxml_charhandler(void *userData, const XML_Char * s, int len) +{ + if (UD->textgrab) + { + if (len > 0) + { + memcpy(UD->resptr, s, len); + UD->resptr += len; + UD->reslen += len; + } + } +} + +/* Should I be using PG list types here? */ + +static void +pgxml_finalisegrabbedtext(void *userData) +{ + /* In res/reslen, we have a single result. 
*/ + if (UD->xpres->rescount < MAXRESULTS) /* results[]/reslens[] hold MAXRESULTS entries; drop extra matches instead of writing past them */ + { + UD->xpres->results[UD->xpres->rescount] = UD->resptr - UD->reslen; + UD->xpres->reslens[UD->xpres->rescount] = UD->reslen; + UD->xpres->rescount++; + } + UD->reslen = 0; + + /* Results sit back to back in the buffer; reslens[] records where each one ends */ +} diff --git a/contrib/xml/pgxml.h b/contrib/xml/pgxml.h new file mode 100644 index 0000000000..2b80124b77 --- /dev/null +++ b/contrib/xml/pgxml.h @@ -0,0 +1,42 @@ +/* Header for pg xml parser interface */ + +static void *pgxml_palloc(size_t size); +static void *pgxml_repalloc(void *ptr, size_t size); +static void pgxml_pfree(void *ptr); +static void pgxml_mhs_init(); +static void pgxml_handler_init(); +Datum pgxml_parse(PG_FUNCTION_ARGS); +Datum pgxml_xpath(PG_FUNCTION_ARGS); +static void pgxml_starthandler(void *userData, const XML_Char * name, + const XML_Char ** atts); +static void pgxml_endhandler(void *userData, const XML_Char * name); +static void pgxml_charhandler(void *userData, const XML_Char * s, int len); +static void pgxml_pathcompare(void *userData); +static void pgxml_finalisegrabbedtext(void *userData); + +#define MAXPATHLENGTH 512 +#define MAXRESULTS 100 + + +typedef struct +{ + int rescount; + char *results[MAXRESULTS]; + int32 reslens[MAXRESULTS]; + char *resbuf; /* pointer to the result buffer for pfree */ +} XPath_Results; + + + +typedef struct +{ + char currentpath[MAXPATHLENGTH]; + char *path; + int textgrab; + char *resptr; + int32 reslen; + XPath_Results *xpres; +} pgxml_udata; + + +#define UD ((pgxml_udata *) userData) diff --git a/contrib/xml/pgxml_dom.c b/contrib/xml/pgxml_dom.c new file mode 100644 index 0000000000..2b11b1d646 --- /dev/null +++ b/contrib/xml/pgxml_dom.c @@ -0,0 +1,265 @@ +/* Parser interface for DOM-based parser (libxml) rather than + stream-based SAX-type parser */ + +#include "postgres.h" +#include "fmgr.h" + +/* libxml includes */ + +#include <libxml/xpath.h> +#include <libxml/tree.h> +#include <libxml/xmlmemory.h> + +/* declarations */ + +static void *pgxml_palloc(size_t size); 
+static void *pgxml_repalloc(void *ptr, size_t size); +static void pgxml_pfree(void *ptr); +static char *pgxml_pstrdup(const char *string); + +static void pgxml_parser_init(); + +static xmlChar *pgxmlNodeSetToText(xmlNodeSetPtr nodeset, xmlDocPtr doc, + xmlChar * toptagname, xmlChar * septagname, + int format); + +static xmlChar *pgxml_texttoxmlchar(text *textstring); + + +Datum pgxml_parse(PG_FUNCTION_ARGS); +Datum pgxml_xpath(PG_FUNCTION_ARGS); + +/* memory handling passthrough functions (e.g. palloc, pstrdup are + currently macros, and the others might become so...) */ + +static void * +pgxml_palloc(size_t size) +{ + return palloc(size); +} + +static void * +pgxml_repalloc(void *ptr, size_t size) +{ + return repalloc(ptr, size); +} + +static void +pgxml_pfree(void *ptr) +{ + return pfree(ptr); +} + +static char * +pgxml_pstrdup(const char *string) +{ + return pstrdup(string); +} + +static void +pgxml_parser_init() +{ + /* + * This code should also set parser settings from user-supplied info. + * Quite how these settings are made is another matter :) + */ + + xmlMemSetup(pgxml_pfree, pgxml_palloc, pgxml_repalloc, pgxml_pstrdup); + xmlInitParser(); + +} + + +/* Returns true if document is well-formed */ + +PG_FUNCTION_INFO_V1(pgxml_parse); + +Datum +pgxml_parse(PG_FUNCTION_ARGS) +{ + /* called as pgxml_parse(document) */ + xmlDocPtr doctree; + text *t = PG_GETARG_TEXT_P(0); /* document buffer */ + int32 docsize = VARSIZE(t) - VARHDRSZ; + + pgxml_parser_init(); + + doctree = xmlParseMemory((char *) VARDATA(t), docsize); + if (doctree == NULL) + { + xmlCleanupParser(); + PG_RETURN_BOOL(false); /* i.e. 
not well-formed */ + } + xmlFreeDoc(doctree); /* release the document before tearing down the parser, the same order pgxml_xpath uses */ + xmlCleanupParser(); + PG_RETURN_BOOL(true); +} + +static xmlChar +* +pgxmlNodeSetToText(xmlNodeSetPtr nodeset, + xmlDocPtr doc, + xmlChar * toptagname, + xmlChar * septagname, + int format) +{ + /* Function translates a nodeset into a text representation */ + + /* + * iterates over each node in the set and calls xmlNodeDump to write + * it to an xmlBuffer -from which an xmlChar * string is returned. + */ + /* each representation is surrounded by <septagname> ... </septagname> */ + /* if format==0, add a newline between nodes?? */ + + xmlBufferPtr buf; + xmlChar *result; + int i; + + buf = xmlBufferCreate(); + + if ((toptagname != NULL) && (xmlStrlen(toptagname) > 0)) + { + xmlBufferWriteChar(buf, "<"); + xmlBufferWriteCHAR(buf, toptagname); + xmlBufferWriteChar(buf, ">"); + } + if (nodeset != NULL) + { + for (i = 0; i < nodeset->nodeNr; i++) + { + if ((septagname != NULL) && (xmlStrlen(septagname) > 0)) + { + xmlBufferWriteChar(buf, "<"); + xmlBufferWriteCHAR(buf, septagname); + xmlBufferWriteChar(buf, ">"); + } + xmlNodeDump(buf, doc, nodeset->nodeTab[i], 1, (format == 2)); + + if ((septagname != NULL) && (xmlStrlen(septagname) > 0)) + { + xmlBufferWriteChar(buf, "</"); /* close the per-node wrapper opened above */ + xmlBufferWriteCHAR(buf, septagname); + xmlBufferWriteChar(buf, ">"); + } + if (format) + xmlBufferWriteChar(buf, "\n"); + } + } + + if ((toptagname != NULL) && (xmlStrlen(toptagname) > 0)) + { + xmlBufferWriteChar(buf, "</"); /* close the outer wrapper opened before the loop */ + xmlBufferWriteCHAR(buf, toptagname); + xmlBufferWriteChar(buf, ">"); + } + result = xmlStrdup(buf->content); + xmlBufferFree(buf); + return result; +} + +static xmlChar * +pgxml_texttoxmlchar(text *textstring) +{ + xmlChar *res; + int32 txsize; + + txsize = VARSIZE(textstring) - VARHDRSZ; + res = (xmlChar *) palloc(txsize + 1); + memcpy((char *) res, VARDATA(textstring), txsize); + res[txsize] = '\0'; + return res; +} + + +PG_FUNCTION_INFO_V1(pgxml_xpath); + +Datum +pgxml_xpath(PG_FUNCTION_ARGS) +{ + xmlDocPtr doctree; + xmlXPathContextPtr ctxt; + xmlXPathObjectPtr res; + xmlChar *xpath, + *xpresstr, + *toptag, + *septag; + xmlXPathCompExprPtr comppath; + + int32 docsize, + ressize; + text *t, + 
*xpres; + + t = PG_GETARG_TEXT_P(0); /* document buffer */ + xpath = pgxml_texttoxmlchar(PG_GETARG_TEXT_P(1)); /* XPath expression */ + toptag = pgxml_texttoxmlchar(PG_GETARG_TEXT_P(2)); + septag = pgxml_texttoxmlchar(PG_GETARG_TEXT_P(3)); + + docsize = VARSIZE(t) - VARHDRSZ; + + pgxml_parser_init(); + + doctree = xmlParseMemory((char *) VARDATA(t), docsize); + if (doctree == NULL) + { /* not well-formed */ + xmlCleanupParser(); + PG_RETURN_NULL(); + } + + ctxt = xmlXPathNewContext(doctree); + ctxt->node = xmlDocGetRootElement(doctree); + + /* compile the path */ + comppath = xmlXPathCompile(xpath); + if (comppath == NULL) + { + elog(WARNING, "XPath syntax error"); + xmlFreeDoc(doctree); + pfree((void *) xpath); + xmlCleanupParser(); + PG_RETURN_NULL(); + } + + /* Now evaluate the path expression. */ + res = xmlXPathCompiledEval(comppath, ctxt); + xmlXPathFreeCompExpr(comppath); + + if (res == NULL) + { + xmlFreeDoc(doctree); + pfree((void *) xpath); + xmlCleanupParser(); + PG_RETURN_NULL(); /* seems appropriate */ + } + /* now we dump this node, ?surrounding by tags? 
*/ + /* To do this, we look first at the type */ + switch (res->type) + { + case XPATH_NODESET: + xpresstr = pgxmlNodeSetToText(res->nodesetval, + doctree, + toptag, septag, 0); + break; + case XPATH_STRING: + xpresstr = xmlStrdup(res->stringval); + break; + default: + elog(WARNING, "Unsupported XQuery result: %d", res->type); + xpresstr = xmlStrdup(""); + } + xmlXPathFreeObject(res); /* xpresstr is an independent copy, so the XPath result can be released now */ + xmlXPathFreeContext(ctxt); /* the evaluation context is no longer needed either */ + /* Now convert this result back to text */ + ressize = strlen(xpresstr); + xpres = (text *) palloc(ressize + VARHDRSZ); + memcpy(VARDATA(xpres), xpresstr, ressize); + VARATT_SIZEP(xpres) = ressize + VARHDRSZ; + + /* Free various storage */ + xmlFreeDoc(doctree); + pfree((void *) xpath); + xmlFree(xpresstr); + xmlCleanupParser(); + PG_RETURN_TEXT_P(xpres); +} diff --git a/contrib/xml/pgxml_dom.sql.in b/contrib/xml/pgxml_dom.sql.in new file mode 100644 index 0000000000..514643b936 --- /dev/null +++ b/contrib/xml/pgxml_dom.sql.in @@ -0,0 +1,10 @@ +-- SQL for XML parser + +-- Adjust this setting to control where the objects get created. 
+SET search_path TO public; + +CREATE OR REPLACE FUNCTION pgxml_parse(text) RETURNS boolean + AS 'MODULE_PATHNAME' LANGUAGE c STRICT; + +CREATE OR REPLACE FUNCTION pgxml_xpath(text, text, text, text) RETURNS text + AS 'MODULE_PATHNAME' LANGUAGE c STRICT; diff --git a/contrib/xml/Makefile b/contrib/xml2/Makefile similarity index 100% rename from contrib/xml/Makefile rename to contrib/xml2/Makefile diff --git a/contrib/xml/README.pgxml b/contrib/xml2/README.pgxml similarity index 100% rename from contrib/xml/README.pgxml rename to contrib/xml2/README.pgxml diff --git a/contrib/xml/pgxml.sql.in b/contrib/xml2/pgxml.sql.in similarity index 100% rename from contrib/xml/pgxml.sql.in rename to contrib/xml2/pgxml.sql.in diff --git a/contrib/xml/xpath.c b/contrib/xml2/xpath.c similarity index 100% rename from contrib/xml/xpath.c rename to contrib/xml2/xpath.c diff --git a/contrib/xml/xslt_proc.c b/contrib/xml2/xslt_proc.c similarity index 100% rename from contrib/xml/xslt_proc.c rename to contrib/xml2/xslt_proc.c -- 2.40.0