top_builddir = ../..
include $(top_builddir)/src/Makefile.global
-override CFLAGS+= $(CFLAGS_SL)
+# `override` appends these flags even when CFLAGS is given on the make
+# command line. CFLAGS_SL comes from Makefile.global (presumably the
+# shared-library compile flags -- confirm there); -g adds debug symbols.
+override CFLAGS+= $(CFLAGS_SL) -g
#
# DLOBJS is the dynamically-loaded object files. The "funcs" queries
# include CREATE FUNCTIONs that load routines from these files.
#
-DLOBJS= pgxml$(DLSUFFIX)
+DLOBJS= pgxml_dom$(DLSUFFIX)
-QUERIES= pgxml.sql
+QUERIES= pgxml_dom.sql
all: $(DLOBJS) $(QUERIES)
-# Requires the expat library
-
+# Requires the libxml2 library (http://xmlsoft.org).
+# -lxml2 is listed AFTER the object file: the linker resolves libraries
+# in command-line order, so a library named before the object that uses
+# it may be dropped (static archives, --as-needed).
%.so: %.o
- $(CC) -shared -lexpat -o $@ $<
+ $(CC) -shared -o $@ $< -lxml2
%.sql: %.source
clean:
rm -f $(DLOBJS) $(QUERIES)
+
+
+
+
-This package contains a couple of simple routines for hooking the
-expat XML parser up to PostgreSQL. This is a work-in-progress and all
-very basic at the moment (see the file TODO for some outline of what
-remains to be done).
+This package contains some simple routines for manipulating XML
+documents stored in PostgreSQL. This is a work-in-progress and
+somewhat basic at the moment (see the file TODO for some outline of
+what remains to be done).
-At present, two functions are defined, one which checks
-well-formedness, and the other which performs very simple XPath-type
-queries.
+At present, two modules (based on different XML handling libraries)
+are provided.
Prerequisite:
+pgxml.c:
expat parser 1.95.0 or newer (http://expat.sourceforge.net)
-I used a shared library version -I'm sure you could use a static
-library if you wished though. I had no problems compiling from source.
+or
+
+pgxml_dom.c:
+libxml2 (http://xmlsoft.org)
+
+The libxml2 version provides more complete XPath functionality, and
+seems like a good way to go. I've left the old versions in there for
+comparison.
+
+Compiling and loading:
+----------------------
+
+The Makefile only builds the libxml2 version.
+
+To compile, just type make.
+
+Then you can use psql to load the two function definitions:
+\i pgxml_dom.sql
+
Function documentation and usage:
---------------------------------
well-formed or not. It returns NULL if the parser couldn't be
created for any reason.
+pgxml_xpath (XPath query function) - differs between the versions:
+
+pgxml.c (expat version) has:
+
pgxml_xpath(text doc, text xpath, int n) returns text
parses doc and returns the cdata of the nth occurence of
-the "XPath" listed. See below for details on the syntax.
+the "simple path" entry.
+However, the remainder of this document will cover the pgxml_dom.c version.
+
+pgxml_xpath(text doc, text xpath, text toptag, text septag) returns text
+ evaluates xpath on doc, and returns the result wrapped in
+<toptag>...</toptag> and each result node wrapped in
+<septag>...</septag>. toptag and septag may be empty strings, in which
+case the respective tag will be omitted.
Example:
one can type:
select docid,
-pgxml_xpath(document,'/site/name',1) as sitename,
-pgxml_xpath(document,'/site/location',1) as location
+pgxml_xpath(document,'//site/name/text()','','') as sitename,
+pgxml_xpath(document,'//site/location/text()','','') as location
from docstore;
and get as output:
- docid | sitename | location
--------+-----------------------------+------------
- 1 | Church Farm, Ashton Keynes | SU04209424
- 2 | Glebe Farm, Long Itchington | SP41506500
-(2 rows)
+ docid | sitename | location
+-------+--------------------------------------+------------
+ 1 | Church Farm, Ashton Keynes | SU04209424
+ 2 | Glebe Farm, Long Itchington | SP41506500
+ 3 | The Bungalow, Thames Lane, Cricklade | SU10229362
+(3 rows)
+
+or, to illustrate the use of the extra tags:
+select docid as id,
+pgxml_xpath(document,'//find/type/text()','set','findtype')
+from docstore;
-"XPath" syntax supported
-------------------------
+ id | pgxml_xpath
+----+-------------------------------------------------------------------------
+ 1 | <set></set>
+ 2 | <set><findtype>Urn</findtype></set>
+ 3 | <set><findtype>Pottery</findtype><findtype>Animal bone</findtype></set>
+(3 rows)
-At present it only supports paths of the form:
-'tag1/tag2' or '/tag1/tag2'
+Each result is a new, well-formed document. Note that document 1 has
+no matching instances, so the set returned contains no
+elements. Document 2 has 1 matching element and document 3 has 2.
-The first case will find any <tag2> within a <tag1>, the second will
-find any <tag2> within a <tag1> at the top level of the document.
+This is just scratching the surface because XPath allows all sorts of
+operations.
-The real XPath is much more complex (see TODO file).
+Note: I've only implemented the return of nodeset and string values so
+far. This covers (I think) many types of queries, however.
+John Gray <jgray@azuli.co.uk> 16 August 2001
-John Gray <jgray@azuli.co.uk> 26 July 2001
PGXML TODO List
===============
-Some of these items still require much more thought! The data model
-for XML documents and the parsing model of expat don't really fit so
-well with a standard SQL model.
+Some of these items still require much more thought! Since the first
+release, the XPath support has improved (because I'm no longer using a
+homemade algorithm!).
-1. Generalised XML parsing support
+1. Performance considerations
-Allow a user to specify handlers (in any PL) to be used by the parser.
-This must permit distinct sets of parser settings -user may want some
-documents in a database to parsed with one set of handlers, others
-with a different set.
+At present each document is parsed to produce the DOM tree on every query.
-i.e. the pgxml_parse function would take as parameters (document,
-parsername) where parsername was the identifier for a collection of
-handler etc. settings.
+Pros:
+ Easy
+ No persistent memory or storage allocation for parsed trees
+ (libxml docs suggest representation of a document might
+ be 4 times the size of the text)
-"Stub" handlers in the pgxml code would invoke the functions through
-the standard fmgr interface. The parser interface would define the
-prototype for these functions. How does the handler function know
-which document/context has resulted it in being called?
+Cons:
+ Slow/ CPU intensive to parse.
+ Makes it difficult for PLs to apply libxml manipulations to create
+ new documents or amend existing ones.
-Mechanism for defining collection of parser settings (in a table? -but
-maybe copied for efficiency into a structure when first required by a
-query?)
-2. Support for other parsers
+2. XQuery
-Expat may not be the best choice as a parser because a new parser
-instance is needed for each document i.e. all the handlers must be set
-again for each document. Another parser may have a more efficient way
-of parsing a set of documents identically.
+I'm not sure if the addition of XQuery would be best as a function or
+as a new front-end parser. This is one to think about, but with a
+decent implementation of XPath, one of the prerequisites is covered.
-3. XPath support
+3. DOM Interfaces
-Proper XPath support. I really need to sit down and plough
-through the specification...
+Expose more aspects of the DOM to user functions/ PLs. This would
+allow a procedure in a PL to run some queries and then use exposed
+interfaces to libxml to create an XML document out of the query
+results. I accept the argument that this might be more properly
+performed on the client side.
-The very simple text comparison system currently used is too
-basic. Need to convert the path to an ordered list of nodes. Each node
-is an element qualifier, and may have a list of attribute
-qualifications attached. This probably requires lexx/yacc combination.
-(James Clark has written a yacc grammar for XPath). Not all the
-features of XPath are necessarily relevant.
+4. Returning sets of documents from XPath queries.
-An option to return subdocuments (i.e. subelements AND cdata, not just
-cdata). This should maybe be the default.
-
-4. Multiple occurences of elements.
-
-This section is all very sketchy, and has various weaknesses.
+Although the current implementation allows you to amalgamate the
+returned results into a single document, it's quite possible that
+you'd like to use the returned set of nodes as a source for FROM.
Is there a good way to optimise/index the results of certain XPath
operations to make them faster?:
-select docid, pgxml_xpath(document,'/site/location',1) as location
-where pgxml_xpath(document,'/site/name',1) = 'Church Farm';
+select docid, pgxml_xpath(document,'//site/location/text()','','') as location
+from docstore
+where pgxml_xpath(document,'//site/name/text()','','') = 'Church Farm';
and with multiple element occurences in a document?
-select d.docid, pgxml_xpath(d.document,'/site/location',1)
+select d.docid, pgxml_xpath(d.document,'//site/location/text()','','')
from docstore d,
-pgxml_xpaths('docstore','document','feature/type','docid') ft
+pgxml_xpaths('docstore','document','//feature/type/text()','docid') ft
where ft.key = d.docid and ft.value ='Limekiln';
pgxml_xpaths params are relname, attrname, xpath, returnkey. It would
The pgxml_xpaths function could be the basis of a functional index,
which could speed up the above query very substantially, working
-through the normal query planner mechanism. Syntax above is fragile
-through using names rather than OID.
+through the normal query planner mechanism.
+
+5. Return type support.
+
+Better support for returning e.g. numeric or boolean values. I need to
+get to grips with the returned data from libxml first.
+
-John Gray <jgray@azuli.co.uk>
+John Gray <jgray@azuli.co.uk> 16 August 2001
+-- Both functions are loaded from the libxml2-based module (pgxml_dom),
+-- the only object the Makefile builds. _OBJWD_ and _DLSUFFIX_ are
+-- placeholders, presumably substituted when the .sql file is generated
+-- from this .source file (the generating recipe is not visible here).
CREATE FUNCTION pgxml_parse(text) RETURNS bool
-AS '_OBJWD_/pgxml_DLSUFFIX_' LANGUAGE 'c' WITH (isStrict);
+AS '_OBJWD_/pgxml_dom_DLSUFFIX_' LANGUAGE 'c' WITH (isStrict);
-CREATE FUNCTION pgxml_xpath(text,text,int) RETURNS text
+-- Arguments: doc, xpath, toptag, septag.
+CREATE FUNCTION pgxml_xpath(text,text,text,text) RETURNS text
-AS '_OBJWD_/pgxml_DLSUFFIX_' LANGUAGE 'c' WITH (isStrict);
+AS '_OBJWD_/pgxml_dom_DLSUFFIX_' LANGUAGE 'c' WITH (isStrict);
\ No newline at end of file