1. I've now produced an updated version (and called it 0.2) of my XML

author Bruce Momjian <bruce@momjian.us>

Tue, 21 Aug 2001 00:39:20 +0000 (00:39 +0000)

committer Bruce Momjian <bruce@momjian.us>

Tue, 21 Aug 2001 00:39:20 +0000 (00:39 +0000)
author Bruce Momjian <bruce@momjian.us>
Tue, 21 Aug 2001 00:39:20 +0000 (00:39 +0000)
committer Bruce Momjian <bruce@momjian.us>
Tue, 21 Aug 2001 00:39:20 +0000 (00:39 +0000)
diff --git a/contrib/xml/Makefile b/contrib/xml/Makefile

index 39e012dd1fb84d1059105340c7bec2621b35d608..a75ac095f607909d5dd66bba4aecd4f94a21c91e 100644 (file)
--- a/contrib/xml/Makefile
+++ b/contrib/xml/Makefile
@@ -8,24 +8,22 @@ subdir = contrib/xml
  top_builddir = ../..
  include $(top_builddir)/src/Makefile.global
  
-override CFLAGS+= $(CFLAGS_SL)
+override CFLAGS+= $(CFLAGS_SL) -g
  
  
  #
  # DLOBJS is the dynamically-loaded object files.  The "funcs" queries
  # include CREATE FUNCTIONs that load routines from these files.
  #
-DLOBJS= pgxml$(DLSUFFIX)
+DLOBJS= pgxml_dom$(DLSUFFIX)
  
  
-QUERIES= pgxml.sql
+QUERIES= pgxml_dom.sql
  
  all: $(DLOBJS) $(QUERIES)
  
-# Requires the expat library
-
  %.so: %.o
-       $(CC) -shared -lexpat -o $@ $<
+       $(CC) -shared -lxml2 -o $@ $<
  
  
  %.sql: %.source
@@ -41,3 +39,7 @@ all: $(DLOBJS) $(QUERIES)
  
  clean:
         rm -f $(DLOBJS) $(QUERIES)
+
+
+
+
diff --git a/contrib/xml/README b/contrib/xml/README

index 068615eaa800e3a2e3ca7970ff315747eb5de575..6c714f74e120f467f9d07842084da2b02707afac 100644 (file)
--- a/contrib/xml/README
+++ b/contrib/xml/README
@@ -1,18 +1,35 @@
-This package contains a couple of simple routines for hooking the
-expat XML parser up to PostgreSQL. This is a work-in-progress and all
-very basic at the moment (see the file TODO for some outline of what
-remains to be done).
+This package contains some simple routines for manipulating XML
+documents stored in PostgreSQL. This is a work-in-progress and
+somewhat basic at the moment (see the file TODO for some outline of
+what remains to be done).
  
-At present, two functions are defined, one which checks
-well-formedness, and the other which performs very simple XPath-type
-queries.
+At present, two modules (based on different XML handling libraries)
+are provided.
  
  Prerequisite:
  
+pgxml.c:
  expat parser 1.95.0 or newer (http://expat.sourceforge.net)
  
-I used a shared library version -I'm sure you could use a static
-library if you wished though. I had no problems compiling from source.
+or
+
+pgxml_dom.c:
+libxml2 (http://xmlsoft.org)
+
+The libxml2 version provides more complete XPath functionality, and
+seems like a good way to go. I've left the old versions in there for
+comparison.
+
+Compiling and loading:
+----------------------
+
+The Makefile only builds the libxml2 version.
+
+To compile, just type make.
+
+Then you can use psql to load the two function definitions: 
+\i pgxml_dom.sql
+
  
  Function documentation and usage:
  ---------------------------------
@@ -22,10 +39,21 @@ pgxml_parse(text) returns bool
  well-formed or not. It returns NULL if the parser couldn't be
  created for any reason.
  
+pgxml_xpath (XPath functions) - differs between the versions:
+
+pgxml.c (expat version) has:
+
  pgxml_xpath(text doc, text xpath, int n) returns text
   parses doc and returns the cdata of the nth occurrence of
-the "XPath" listed. See below for details on the syntax.
+the "simple path" entry. 
  
+However, the remainder of this document will cover the pgxml_dom.c version.
+
+pgxml_xpath(text doc, text xpath, text toptag, text septag) returns text
+  evaluates xpath on doc, and returns the result wrapped in
+<toptag>...</toptag> and each result node wrapped in
+<septag></septag>. toptag and septag may be empty strings, in which
+case the respective tag will be omitted.
  
  Example:
  
@@ -49,30 +77,42 @@ descriptions, in case anyone is wondering):
  one can type:
  
  select docid, 
-pgxml_xpath(document,'/site/name',1) as sitename,
-pgxml_xpath(document,'/site/location',1) as location
+pgxml_xpath(document,'//site/name/text()','','') as sitename,
+pgxml_xpath(document,'//site/location/text()','','') as location
   from docstore;
   
  and get as output:
  
- docid |          sitename           |  location  
--------+-----------------------------+------------
-     1 | Church Farm, Ashton Keynes  | SU04209424
-     2 | Glebe Farm, Long Itchington | SP41506500
-(2 rows)
+ docid |               sitename               |  location  
+-------+--------------------------------------+------------
+     1 | Church Farm, Ashton Keynes           | SU04209424
+     2 | Glebe Farm, Long Itchington          | SP41506500
+     3 | The Bungalow, Thames Lane, Cricklade | SU10229362
+(3 rows)
+
+or, to illustrate the use of the extra tags:
  
+select docid as id,
+pgxml_xpath(document,'//find/type/text()','set','findtype') 
+from docstore;
  
-"XPath" syntax supported
-------------------------
+ id |                               pgxml_xpath                               
+----+-------------------------------------------------------------------------
+  1 | <set></set>
+  2 | <set><findtype>Urn</findtype></set>
+  3 | <set><findtype>Pottery</findtype><findtype>Animal bone</findtype></set>
+(3 rows)
  
-At present it only supports paths of the form:
-'tag1/tag2' or '/tag1/tag2'
+Which produces a new, well-formed document. Note that document 1 had
+no matching instances, so the set returned contains no
+elements. Document 2 has 1 matching element and document 3 has 2.
  
-The first case will find any <tag2> within a <tag1>, the second will
-find any <tag2> within a <tag1> at the top level of the document.
+This is just scratching the surface because XPath allows all sorts of
+operations.
  
-The real XPath is much more complex (see TODO file).
+Note: I've only implemented the return of nodeset and string values so
+far. This covers (I think) many types of queries, however.
  
+John Gray <jgray@azuli.co.uk>  16 August 2001
  
-John Gray <jgray@azuli.co.uk>  26 July 2001
  
diff --git a/contrib/xml/TODO b/contrib/xml/TODO

index 5bec69b4a75e10170e03db14eeeb05ce8dc1df1d..5ddd62a658a7516294f965fa6c526b6808bf9f71 100644 (file)
--- a/contrib/xml/TODO
+++ b/contrib/xml/TODO
@@ -1,67 +1,57 @@
  PGXML TODO List
  ===============
  
-Some of these items still require much more thought! The data model
-for XML documents and the parsing model of expat don't really fit so
-well with a standard SQL model.
+Some of these items still require much more thought! Since the first
+release, the XPath support has improved (because I'm no longer using a
+homemade algorithm!).
  
-1. Generalised XML parsing support
+1. Performance considerations
  
-Allow a user to specify handlers (in any PL) to be used by the parser.
-This must permit distinct sets of parser settings -user may want some
-documents in a database to parsed with one set of handlers, others
-with a different set.
+At present each document is parsed to produce the DOM tree on every query.
  
-i.e. the pgxml_parse function would take as parameters (document,
-parsername) where parsername was the identifier for a collection of
-handler etc. settings.
+Pros: 
+       Easy
+       No persistent memory or storage allocation for parsed trees
+               (libxml docs suggest representation of a document might
+                be 4 times the size of the text)
  
-"Stub" handlers in the pgxml code would invoke the functions through
-the standard fmgr interface. The parser interface would define the
-prototype for these functions. How does the handler function know
-which document/context has resulted it in being called?
+Cons:
+       Slow/ CPU intensive to parse.
+       Makes it difficult for PLs to apply libxml manipulations to create
+               new documents or amend existing ones.
  
-Mechanism for defining collection of parser settings (in a table? -but
-maybe copied for efficiency into a structure when first required by a
-query?)
  
-2. Support for other parsers
+2. XQuery 
  
-Expat may not be the best choice as a parser because a new parser
-instance is needed for each document i.e. all the handlers must be set
-again for each document. Another parser may have a more efficient way
-of parsing a set of documents identically.
+I'm not sure if the addition of XQuery would be best as a function or
+as a new front-end parser. This is one to think about, but with a
+decent implementation of XPath, one of the prerequisites is covered.
  
-3. XPath support
+3. DOM Interfaces
  
-Proper XPath support. I really need to sit down and plough
-through the specification...
+Expose more aspects of the DOM to user functions/ PLs. This would
+allow a procedure in a PL to run some queries and then use exposed
+interfaces to libxml to create an XML document out of the query
+results. I accept the argument that this might be more properly
+performed on the client side.
  
-The very simple text comparison system currently used is too
-basic. Need to convert the path to an ordered list of nodes. Each node
-is an element qualifier, and may have a list of attribute
-qualifications attached. This probably requires lexx/yacc combination.
-(James Clark has written a yacc grammar for XPath). Not all the
-features of XPath are necessarily relevant.
+4. Returning sets of documents from XPath queries.
  
-An option to return subdocuments (i.e. subelements AND cdata, not just
-cdata). This should maybe be the default.
-
-4. Multiple occurences of elements.
-
-This section is all very sketchy, and has various weaknesses.
+Although the current implementation allows you to amalgamate the
+returned results into a single document, it's quite possible that
+you'd like to use the returned set of nodes as a source for FROM.
   
  Is there a good way to optimise/index the results of certain XPath
  operations to make them faster?:
  
-select docid, pgxml_xpath(document,'/site/location',1) as location 
-where pgxml_xpath(document,'/site/name',1) = 'Church Farm';
+select docid, pgxml_xpath(document,'//site/location/text()','','') as location 
+where pgxml_xpath(document,'//site/name/text()','','') = 'Church Farm';
  
  and with multiple element occurrences in a document?
  
-select d.docid, pgxml_xpath(d.document,'/site/location',1) 
+select d.docid, pgxml_xpath(d.document,'//site/location/text()','','') 
  from docstore d, 
-pgxml_xpaths('docstore','document','feature/type','docid') ft 
+pgxml_xpaths('docstore','document','//feature/type/text()','docid') ft 
  where ft.key = d.docid and ft.value ='Limekiln';
  
  pgxml_xpaths params are relname, attrname, xpath, returnkey. It would
@@ -71,10 +61,15 @@ defined by relname and attrname.
  
  The pgxml_xpaths function could be the basis of a functional index,
  which could speed up the above query very substantially, working
-through the normal query planner mechanism. Syntax above is fragile
-through using names rather than OID.
+through the normal query planner mechanism.
+
+5. Return type support.
+
+Better support for returning e.g. numeric or boolean values. I need to
+get to grips with the returned data from libxml first.
+
   
-John Gray <jgray@azuli.co.uk>
+John Gray <jgray@azuli.co.uk> 16 August 2001
  
  
  
diff --git a/contrib/xml/pgxml.source b/contrib/xml/pgxml.source

index 6f425077c191f4cd0de288fd21fa0a2a2289d61f..8a04fa2c9b2ff64f4ea1b60ea0583c4f3e545ef8 100644 (file)
--- a/contrib/xml/pgxml.source
+++ b/contrib/xml/pgxml.source
@@ -3,5 +3,5 @@
  CREATE FUNCTION pgxml_parse(text) RETURNS bool
         AS '_OBJWD_/pgxml_DLSUFFIX_' LANGUAGE 'c' WITH (isStrict);
  
-CREATE FUNCTION pgxml_xpath(text,text,int) RETURNS text
+CREATE FUNCTION pgxml_xpath(text,text,text,text) RETURNS text
         AS '_OBJWD_/pgxml_DLSUFFIX_' LANGUAGE 'c' WITH (isStrict);
 \ No newline at end of file
author	Bruce Momjian <bruce@momjian.us>
	Tue, 21 Aug 2001 00:39:20 +0000 (00:39 +0000)
committer	Bruce Momjian <bruce@momjian.us>
	Tue, 21 Aug 2001 00:39:20 +0000 (00:39 +0000)
contrib/xml/Makefile		patch \| blob \| history
contrib/xml/README		patch \| blob \| history
contrib/xml/TODO		patch \| blob \| history
contrib/xml/pgxml.source		patch \| blob \| history