]> granicus.if.org Git - postgresql/commitdiff
The new LZ compression and an lztext data type based on it.
authorJan Wieck <JanWieck@Yahoo.com>
Wed, 17 Nov 1999 21:21:51 +0000 (21:21 +0000)
committerJan Wieck <JanWieck@Yahoo.com>
Wed, 17 Nov 1999 21:21:51 +0000 (21:21 +0000)
Jan

src/backend/utils/adt/Makefile
src/backend/utils/adt/lztext.c [new file with mode: 0644]
src/backend/utils/adt/pg_lzcompress.c [new file with mode: 0644]
src/include/catalog/pg_proc.h
src/include/catalog/pg_type.h
src/include/utils/builtins.h
src/include/utils/lztext.h [new file with mode: 0644]
src/include/utils/pg_lzcompress.h [new file with mode: 0644]

index 74ef6285d2c791f524265dd37de7b394c04f1c03..b2999d0d0df9e8cf9027c03b00d2b2bb84893102 100644 (file)
@@ -4,7 +4,7 @@
 #    Makefile for utils/adt
 #
 # IDENTIFICATION
-#    $Header: /cvsroot/pgsql/src/backend/utils/adt/Makefile,v 1.26 1999/09/30 14:54:22 wieck Exp $
+#    $Header: /cvsroot/pgsql/src/backend/utils/adt/Makefile,v 1.27 1999/11/17 21:21:50 wieck Exp $
 #
 #-------------------------------------------------------------------------
 
@@ -35,7 +35,7 @@ OBJS = acl.o arrayfuncs.o arrayutils.o bool.o cash.o char.o chunk.o \
        regexp.o regproc.o ruleutils.o selfuncs.o sets.o \
        tid.o timestamp.o varchar.o varlena.o version.o \
        network.o mac.o inet_net_ntop.o inet_net_pton.o \
-       ri_triggers.o
+       ri_triggers.o pg_lzcompress.o lztext.o
 
 all: SUBSYS.o
 
diff --git a/src/backend/utils/adt/lztext.c b/src/backend/utils/adt/lztext.c
new file mode 100644 (file)
index 0000000..49c37de
--- /dev/null
@@ -0,0 +1,266 @@
+/* ----------
+ * lztext.c -
+ *
+ * $Header: /cvsroot/pgsql/src/backend/utils/adt/Attic/lztext.c,v 1.1 1999/11/17 21:21:50 wieck Exp $
+ *
+ *     Text type with internal LZ compressed representation. Uses the
+ *     standard PostgreSQL compression method.
+ * ----------
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <errno.h>
+
+#include "postgres.h"
+#include "utils/builtins.h"
+#include "utils/palloc.h"
+#include "utils/pg_lzcompress.h"
+
+
+/* ----------
+ * lztextin -
+ *
+ *             Input function for datatype lztext. The NUL terminated
+ *             input string is compressed with the default strategy
+ *             (strategy argument NULL) and returned in palloc()'d memory.
+ *             A NULL input yields a NULL result.
+ * ----------
+ */
+lztext *
+lztextin(char *str)
+{
+       lztext             *compressed;
+       lztext             *trimmed;
+       int32                   rawsize;
+       int                             bufsize;
+       int                             slack;
+
+       /* NULL in, NULL out */
+       if (str == NULL)
+               return NULL;
+
+       /* Input length and the worst case output size for it */
+       rawsize = strlen(str);
+       bufsize = PGLZ_MAX_OUTPUT(rawsize);
+
+       /* Compress into a buffer that is big enough in any case */
+       compressed = (lztext *) palloc(bufsize);
+       pglz_compress(str, rawsize, compressed, NULL);
+
+       /*
+        * Keep the oversized buffer if the unused tail is small (less
+        * than 256 bytes or less than a quarter of the buffer). That
+        * saves a memcpy().
+        */
+       slack = bufsize - compressed->varsize;
+       if (slack < 256 || slack < bufsize / 4)
+               return compressed;
+
+       /* Otherwise move the value into an exactly sized allocation */
+       trimmed = (lztext *) palloc(compressed->varsize);
+       memcpy(trimmed, compressed, compressed->varsize);
+       pfree(compressed);
+
+       return trimmed;
+}
+
+
+/* ----------
+ * lztextout -
+ *
+ *             Output function for data type lztext. Returns the
+ *             decompressed value as a NUL terminated string in palloc()'d
+ *             memory. A NULL value is rendered as the string "-".
+ * ----------
+ */
+char *
+lztextout(lztext *lz)
+{
+       char                    *str;
+
+       /* NULL is printed as a single dash */
+       if (lz == NULL)
+       {
+               str = (char *) palloc(2);
+               memcpy(str, "-", 2);
+               return str;
+       }
+
+       /*
+        * The uncompressed size is stored in the lztext header, so the
+        * output buffer can be sized up front: raw data plus one byte
+        * for the terminator.
+        */
+       str = (char *) palloc(PGLZ_RAW_SIZE(lz) + 1);
+
+       /* Decompress, then terminate the string ourselves */
+       pglz_decompress(lz, str);
+       str[lz->rawsize] = '\0';
+
+       return str;
+}
+
+
+/* ----------
+ * lztextlen -
+ *
+ *     Logical length of an lztext value, i.e. the uncompressed size
+ *     remembered in its header. Without multibyte support that is
+ *     the number of characters. NULL counts as length 0.
+ * ----------
+ */
+int32
+lztextlen(lztext *lz)
+{
+       if (lz != NULL)
+               return lz->rawsize;
+
+       return 0;
+}
+
+
+/* ----------
+ * lztextoctetlen -
+ *
+ *     Physical length of an lztext value: its varsize minus the
+ *     size of the varsize field itself, so the rawsize field and
+ *     the stored data are counted. NULL counts as length 0.
+ * ----------
+ */
+int32
+lztextoctetlen(lztext *lz)
+{
+       if (lz != NULL)
+               return lz->varsize - sizeof(int32);
+
+       return 0;
+}
+
+
+/* ----------
+ * text_lztext -
+ *
+ *     Convert text to lztext. The payload of the text value is
+ *     compressed with the default strategy (strategy argument NULL)
+ *     and returned in palloc()'d memory. NULL yields NULL.
+ * ----------
+ */
+lztext *
+text_lztext(text *txt)
+{
+       lztext             *compressed;
+       lztext             *trimmed;
+       char               *payload;
+       int32                   rawsize;
+       int                             bufsize;
+       int                             slack;
+
+       /* NULL in, NULL out */
+       if (txt == NULL)
+               return NULL;
+
+       /* Payload location, its length and the worst case output size */
+       rawsize = VARSIZE(txt) - VARHDRSZ;
+       payload = VARDATA(txt);
+       bufsize = PGLZ_MAX_OUTPUT(rawsize);
+
+       /* Compress into a buffer that is big enough in any case */
+       compressed = (lztext *) palloc(bufsize);
+       pglz_compress(payload, rawsize, compressed, NULL);
+
+       /*
+        * Keep the oversized buffer if the unused tail is small (less
+        * than 256 bytes or less than a quarter of the buffer). That
+        * saves a memcpy().
+        */
+       slack = bufsize - compressed->varsize;
+       if (slack < 256 || slack < bufsize / 4)
+               return compressed;
+
+       /* Otherwise move the value into an exactly sized allocation */
+       trimmed = (lztext *) palloc(compressed->varsize);
+       memcpy(trimmed, compressed, compressed->varsize);
+       pfree(compressed);
+
+       return trimmed;
+}
+
+
+/* ----------
+ * lztext_text -
+ *
+ *     Convert lztext to text. The result is palloc()'d with room
+ *     for one extra zero byte behind the data; that byte is not
+ *     counted in the VARSIZE of the result. NULL yields NULL.
+ * ----------
+ */
+text *
+lztext_text(lztext *lz)
+{
+       text       *txt;
+       char       *data;
+
+       /* NULL in, NULL out */
+       if (lz == NULL)
+               return NULL;
+
+       /* Header plus raw data plus one spare byte for the trailing zero */
+       txt = (text *) palloc(lz->rawsize + VARHDRSZ + 1);
+       VARSIZE(txt) = lz->rawsize + VARHDRSZ;
+
+       /* Decompress straight into the data area and zero terminate it */
+       data = VARDATA(txt);
+       pglz_decompress(lz, data);
+       data[lz->rawsize] = 0;
+
+       return txt;
+}
+
+
diff --git a/src/backend/utils/adt/pg_lzcompress.c b/src/backend/utils/adt/pg_lzcompress.c
new file mode 100644 (file)
index 0000000..88721d3
--- /dev/null
@@ -0,0 +1,669 @@
+/* ----------
+ * pg_lzcompress.c -
+ *
+ * $Header: /cvsroot/pgsql/src/backend/utils/adt/pg_lzcompress.c,v 1.1 1999/11/17 21:21:50 wieck Exp $
+ *
+ *             This is an implementation of LZ compression for PostgreSQL.
+ *             It uses a simple history table and generates 2-3 byte tags
+ *             capable of backward copy information for 3-273 bytes with
+ *             an offset of max. 4095.
+ *
+ *             Entry routines:
+ *
+ *                     int
+ *                     pglz_compress(char *source, int slen, PGLZ_Header *dest,
+ *                                                                             PGLZ_Strategy *strategy);
+ *
+ *                             source is the input data to be compressed.
+ *
+ *                             slen is the length of the input data.
+ *
+ *                             dest is the output area for the compressed result.
+ *                                     It must be big enough to hold the worst case of
+ *                                     compression failure and can be computed by the
+ *                                     macro PGLZ_MAX_OUTPUT(slen). Don't be surprised,
+ *                                     it is larger than the input data size.
+ *
+ *                             strategy is a pointer to some information controlling
+ *                                     the compression algorithm. If NULL, the compiled
+ *                                     in default strategy is used.
+ *
+ *                             The return value is the total size in bytes of the result
+ *                                     stored in dest (the PGLZ_Header included).
+ *
+ *                     int
+ *                     pglz_decompress(PGLZ_Header *source, char *dest)
+ *
+ *                             source is the compressed input.
+ *
+ *                             dest is the area where the uncompressed data will be
+ *                                     written to. It is the callers responsibility to
+ *                                     provide enough space. The required amount can be
+ *                                     obtained with the macro PGLZ_RAW_SIZE(source).
+ *
+ *                                     The data is written to dest exactly as it was handed
+ *                                     to pglz_compress(). No terminating zero byte is added.
+ *
+ *                             The return value is the number of bytes written to dest,
+ *                                     which is the same value PGLZ_RAW_SIZE() returns.
+ *
+ *             The compression algorithm and internal data format:
+ *
+ *                     PGLZ_Header is defined as
+ *
+ *                             typedef struct PGLZ_Header {
+ *                                     int32           varsize;
+ *                                     int32           rawsize;
+ *                             }
+ *
+ *                     The header is followed by the compressed data itself.
+ *
+ *                     The algorithm is easiest explained by describing the process
+ *                     of decompression.
+ *
+ *                     If varsize == rawsize + sizeof(PGLZ_Header), then the data
+ *                     is stored uncompressed as plain bytes. Thus, the decompressor
+ *                     simply copies rawsize bytes from the location after the
+ *                     header to the destination.
+ *
+ *                     Otherwise the first byte after the header tells what to do
+ *                     the next 8 times. We call this the control byte.
+ *
+ *                     An unset bit in the control byte means, that one uncompressed
+ *                     byte follows, which is copied from input to output.
+ *
+ *                     A set bit in the control byte means, that a tag of 2-3 bytes
+ *                     follows. A tag contains information to copy some bytes, that
+ *                     are already in the output buffer, to the current location in
+ *                     the output. Let's call the three tag bytes T1, T2 and T3. The
+ *                     position of the data to copy is coded as an offset from the
+ *                     actual output position.
+ *
+ *                     The offset is in the upper nibble of T1 and in T2.
+ *                     The length is in the lower nibble of T1.
+ *
+ *                     So the 16 bits of a 2 byte tag are coded as
+ *
+ *              7---T1--0  7---T2--0
+ *                             OOOO LLLL  OOOO OOOO
+ *
+ *                     This limits the offset to 1-4095 (12 bits) and the length
+ *                     to 3-18 (4 bits) because 3 is always added to it. Emitting
+ *                     a 2 byte tag for a match of length 2 would only save one
+ *                     control bit, but would cost one byte of the length range
+ *                     a tag can express.
+ *
+ *                     In the actual implementation, the 2 byte tag's length is
+ *                     limited to 3-17, because the value 0xF in the length nibble
+ *                     has special meaning. It means, that the next following
+ *                     byte (T3) has to be added to the length value of 18. That
+ *                     makes total limits of 1-4095 for offset and 3-273 for length.
+ *
+ *                     Once a tag has been decoded, we simply copy <length> bytes
+ *                     of the output that occurred <offset> bytes back to the
+ *                     current output location. Thus, a
+ *                     sequence of 200 spaces (think about bpchar fields) could be
+ *                     coded in 4 bytes: one literal space and a three byte tag to
+ *                     copy 199 bytes with a -1 offset. Wow - that's a compression
+ *                     rate of 98%! Well, the implementation needs to save the
+ *                     original data size too, so we need another 4 bytes for it
+ *                     and end up with a total compression rate of 96%, which is
+ *                     still worth a Wow.
+ *
+ *             Acknowledgements:
+ *
+ *                     Many thanks to Adisak Pochanayon, whose article about SLZ
+ *                     inspired me to write the PostgreSQL compression this way.
+ *
+ *                     Jan Wieck
+ * ----------
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <errno.h>
+
+#include "postgres.h"
+#include "utils/palloc.h"
+#include "utils/pg_lzcompress.h"
+
+
+/* ----------
+ * Local definitions
+ *
+ *     PGLZ_HISTORY_SIZE               Number of slots in the history lookup table
+ *                                                     (dimension of hist_start in pglz_compress).
+ *     PGLZ_HISTORY_MASK               Bit mask applied by pglz_hist_idx() to turn a
+ *                                                     hash value into a slot number.
+ *     PGLZ_HISTORY_PREALLOC   Number of history entries kept on the stack
+ *                                                     frame of pglz_compress(); larger inputs get
+ *                                                     a palloc()'d entry array instead.
+ *     PGLZ_MAX_MATCH                  Longest match length that is ever searched
+ *                                                     for or accepted as "good match" size.
+ * ----------
+ */
+#define PGLZ_HISTORY_SIZE              8192
+#define PGLZ_HISTORY_MASK              0x1fff
+#define PGLZ_HISTORY_PREALLOC  8192
+#define PGLZ_MAX_MATCH                 273
+
+
+/* ----------
+ * PGLZ_HistEntry -
+ *
+ *             Linked list for the backward history lookup. Every history
+ *             table slot heads a chain of these entries; pglz_hist_add()
+ *             pushes new entries at the front of the chain.
+ * ----------
+ */
+typedef struct PGLZ_HistEntry {
+       struct PGLZ_HistEntry      *next;       /* entry added to this slot before us */
+       char                                       *pos;        /* input position this entry stands for */
+} PGLZ_HistEntry;
+
+
+/* ----------
+ * The provided standard strategies
+ *
+ *             The initializers are positional; the field order is declared
+ *             with PGLZ_Strategy in utils/pg_lzcompress.h.
+ *             NOTE(review): the per-value comments below assume the order
+ *             min_input_size, force_input_size, min_comp_rate,
+ *             match_size_good, match_size_drop - confirm against the header.
+ * ----------
+ */
+static PGLZ_Strategy strategy_default_data = {
+       256,    /* Data chunks smaller than 256 bytes are not compressed                */
+       6144,   /* Data chunks of 6K or more force compression, unless the              */
+                       /* compressed result is larger than the uncompressed data               */
+       20,             /* Compression rates below 20% mean fallback to uncompressed    */
+                       /* storage, unless compression is forced by the previous value  */
+       128,    /* Stop history lookup if a match of 128 bytes is found                 */
+       10              /* Lower good match size by 10% at every lookup loop iteration. */
+};
+PGLZ_Strategy  *PGLZ_strategy_default = &strategy_default_data;
+
+
+static PGLZ_Strategy strategy_allways_data = {
+       0,              /* Chunks of any size are compressed                                                    */
+       0,              /* (no extra size threshold)                                                                    */
+       0,              /* We want to save at least one single byte                                             */
+       128,    /* Stop history lookup if a match of 128 bytes is found                 */
+       6               /* Look harder for a good match.                                                                */
+};
+PGLZ_Strategy  *PGLZ_strategy_allways = &strategy_allways_data;
+
+
+static PGLZ_Strategy strategy_never_data = {
+       0,              /*                                                                                                                              */
+       0,              /*                                                                                                                              */
+       0,              /*                                                                                                                              */
+       0,              /* Zero indicates "store uncompressed always"                                   */
+       0               /*                                                                                                                              */
+};
+PGLZ_Strategy  *PGLZ_strategy_never = &strategy_never_data;
+
+
+
+/* ----------
+ * pglz_hist_idx -
+ *
+ *             Computes the history table slot for the lookup by the next 4
+ *             characters in the input.
+ *
+ *             _s is the current input position, _e the end of the input.
+ *             If fewer than 4 bytes are left between _s and _e the result
+ *             is slot 0. Otherwise the four bytes are shifted left by 9, 6,
+ *             3 and 0 bits, XORed together and masked with
+ *             PGLZ_HISTORY_MASK.
+ *
+ *             The #else branch is a disabled variant that hashes only the
+ *             next 2 bytes.
+ *
+ *             Both arguments are evaluated more than once, so they must be
+ *             free of side effects.
+ *
+ *             NOTE(review): callers pass plain char pointers; where char is
+ *             signed, bytes >= 0x80 are promoted to negative ints before
+ *             being shifted. The mask still yields a value in
+ *             0..PGLZ_HISTORY_MASK on two's complement machines, but the
+ *             shift itself is not portable - worth confirming/cleaning up.
+ * ----------
+ */
+#if 1
+#define pglz_hist_idx(_s,_e) (                                                                                         \
+                       (((_e) - (_s)) < 4) ? 0 :                                                                               \
+                       ((((_s)[0] << 9) ^ ((_s)[1] << 6) ^                                                     \
+                       ((_s)[2] << 3) ^ (_s)[3]) & (PGLZ_HISTORY_MASK))                                \
+               )
+#else
+#define pglz_hist_idx(_s,_e) (                                                                                         \
+                       (((_e) - (_s)) < 2) ? 0 :                                                                               \
+                       ((((_s)[0] << 8) ^ (_s)[1]) & (PGLZ_HISTORY_MASK))                              \
+               )
+#endif
+
+
+/* ----------
+ * pglz_hist_add -
+ *
+ *             Adds a new entry to the history table.
+ *
+ *             _hs is the slot array, _hn points to the next unused history
+ *             entry, _s is the input position to remember and _e the end
+ *             of the input. The entry *_hn is filled with _s, linked in
+ *             front of the chain of the slot computed by pglz_hist_idx(),
+ *             and _hn is then advanced to the following entry. _hn must
+ *             therefore be a modifiable lvalue.
+ *
+ *             _s is expanded several times (also inside pglz_hist_idx), so
+ *             it must not carry side effects such as ++.
+ * ----------
+ */
+#define pglz_hist_add(_hs,_hn,_s,_e) {                                                                         \
+                       int __hindex = pglz_hist_idx((_s),(_e));                                                \
+                       (_hn)->next = (_hs)[__hindex];                                                                  \
+                       (_hn)->pos  = (_s);                                                                                             \
+                       (_hs)[__hindex] = (_hn)++;                                                                              \
+               }
+
+
+/* ----------
+ * pglz_out_ctrl -
+ *
+ *             Outputs the last and allocates a new control byte if needed.
+ *
+ *             __ctrl is the bit that will flag the next item. When its low
+ *             8 bits are zero (initially, or after the bit has been shifted
+ *             out of the byte), the collected control byte __ctrlb is
+ *             stored through __ctrlp, the next output byte at __buf is
+ *             reserved as the location of the new control byte, __ctrlb is
+ *             cleared and __ctrl restarts at bit 0.
+ *
+ *             All four arguments are assigned to and must be plain
+ *             variables.
+ * ----------
+ */
+#define pglz_out_ctrl(__ctrlp,__ctrlb,__ctrl,__buf) {                                          \
+       if ((__ctrl & 0xff) == 0)                                                                                               \
+       {                                                                                                                                               \
+               *__ctrlp = __ctrlb;                                                                                                     \
+               __ctrlp = __buf++;                                                                                                      \
+               __ctrlb = 0;                                                                                                            \
+               __ctrl = 1;                                                                                                                     \
+       }                                                                                                                                               \
+}
+
+
+/* ----------
+ * pglz_out_literal -
+ *
+ *             Outputs a literal byte to the destination buffer including the
+ *             appropriate control bit.
+ *
+ *             A new control byte is started first if necessary. The byte
+ *             is then appended at _buf and the control bit position _ctrl
+ *             advances by one; the corresponding bit in _ctrlb is left
+ *             unset, which marks the item as a literal.
+ * ----------
+ */
+#define pglz_out_literal(_ctrlp,_ctrlb,_ctrl,_buf,_byte) {                                     \
+       pglz_out_ctrl(_ctrlp,_ctrlb,_ctrl,_buf);                                                                \
+       *_buf++ = (unsigned char)(_byte);                                                                               \
+       _ctrl <<= 1;                                                                                                                    \
+}
+
+
+/* ----------
+ * pglz_out_tag -
+ *
+ *             Outputs a backward reference tag of 2-3 bytes (depending on
+ *             the length) to the destination buffer including the
+ *             appropriate control bit.
+ *
+ *             A new control byte is started first if necessary, then the
+ *             current control bit is set in _ctrlb to mark the item as a
+ *             tag. Bits 8-11 of _off go into the upper nibble of the first
+ *             byte, the low 8 bits of _off into the second byte.
+ *
+ *             For _len up to 17 the lower nibble of the first byte holds
+ *             _len - 3 and the tag is 2 bytes long. For larger lengths the
+ *             lower nibble is 0x0f and a third byte holding _len - 18
+ *             follows.
+ *
+ *             _len and _off are used unparenthesized in places, so pass
+ *             simple variables only.
+ * ----------
+ */
+#define pglz_out_tag(_ctrlp,_ctrlb,_ctrl,_buf,_len,_off) {                                     \
+       pglz_out_ctrl(_ctrlp,_ctrlb,_ctrl,_buf);                                                                \
+       _ctrlb |= _ctrl;                                                                                                                \
+       _ctrl <<= 1;                                                                                                                    \
+       if (_len > 17)                                                                                                                  \
+       {                                                                                                                                               \
+               _buf[0] = (unsigned char)((((_off) & 0xf00) >> 4) | 0x0f);                      \
+               _buf[1] = (unsigned char)((_off & 0xff));                                                       \
+               _buf[2] = (unsigned char)((_len) - 18);                                                         \
+               _buf += 3;                                                                                                                      \
+       } else {                                                                                                                                \
+               _buf[0] = (unsigned char)((((_off) & 0xf00) >> 4) | (_len - 3));        \
+               _buf[1] = (unsigned char)((_off) & 0xff);                                                       \
+               _buf += 2;                                                                                                                      \
+       }                                                                                                                                               \
+}
+
+
+/* ----------
+ * pglz_find_match -
+ *
+ *             Searches the history chain that belongs to the current input
+ *             position for an earlier occurrence of the bytes found at
+ *             input.
+ *
+ *             hstart is the history slot array, input the current position
+ *             and end the end of the input data. good_match is the match
+ *             length at which the search stops; it is lowered by good_drop
+ *             percent for every chain entry visited.
+ *
+ *             Returns 1 and stores length and offset of the best match
+ *             seen in *lenp and *offp if that match is longer than 2 bytes.
+ *             Otherwise returns 0 and leaves *lenp and *offp untouched.
+ * ----------
+ */
+static inline int
+pglz_find_match (PGLZ_HistEntry **hstart, char *input, char *end, 
+                                               int *lenp, int *offp, int good_match, int good_drop)
+{
+       PGLZ_HistEntry     *cand;
+       int32                           best_len = 0;
+       int32                           best_off = 0;
+
+       /* ----------
+        * Visit the chain entries until the best match so far is good
+        * enough or the chain ends.
+        * ----------
+        */
+       for (cand = hstart[pglz_hist_idx(input, end)];
+                cand && best_len < good_match;
+                cand = cand->next)
+       {
+               char               *cur = input;
+               char               *old = cand->pos;
+               int32                   cand_off = cur - old;
+               int32                   cand_len = 0;
+
+               /* ----------
+                * The more entries we have visited, the less we demand.
+                * ----------
+                */
+               good_match -= (good_match * good_drop) /100;
+
+               /* ----------
+                * Give up once the distance no longer fits into a tag.
+                * ----------
+                */
+               if (cand_off >= 0x0fff)
+                       break;
+
+               /* ----------
+                * Only a match longer than the best so far is of interest.
+                * With a best match of 16 or more bytes it pays to let
+                * memcmp() verify that this candidate is at least as long;
+                * if it is not, move on to the next entry. Beyond that
+                * point we compare byte by byte to find the exact position
+                * of the first difference.
+                * ----------
+                */
+               if (best_len >= 16)
+               {
+                       if (memcmp(cur, old, best_len) != 0)
+                               continue;
+
+                       cand_len = best_len;
+                       cur += best_len;
+                       old += best_len;
+               }
+
+               for (; cur < end && *cur == *old && cand_len < PGLZ_MAX_MATCH; cand_len++)
+               {
+                       cur++;
+                       old++;
+               }
+
+               /* ----------
+                * Remember the candidate if it beats the best so far.
+                * ----------
+                */
+               if (cand_len > best_len)
+               {
+                       best_len = cand_len;
+                       best_off = cand_off;
+               }
+       }
+
+       /* ----------
+        * A match of 2 bytes or less would not save anything.
+        * ----------
+        */
+       if (best_len <= 2)
+               return 0;
+
+       *lenp = best_len;
+       *offp = best_off;
+       return 1;
+}
+
+
+/* ----------
+ * pglz_compress -
+ *
+ *             Compresses slen bytes at source into dest according to
+ *             strategy (NULL selects PGLZ_strategy_default).
+ *
+ *             dest->rawsize is always set to slen. If the strategy forbids
+ *             compression, the input is smaller than the strategy's
+ *             min_input_size, or the compressed result would reach the
+ *             maximum size allowed by the strategy, the input is stored
+ *             as plain bytes behind the header.
+ *
+ *             Returns the value stored in dest->varsize, which is the size
+ *             of the stored data plus sizeof(PGLZ_Header).
+ * ----------
+ */
+int
+pglz_compress (char *source, int slen, PGLZ_Header *dest, PGLZ_Strategy *strategy)
+{
+       PGLZ_HistEntry     *hist_start[PGLZ_HISTORY_SIZE];
+       PGLZ_HistEntry     *hist_alloc;
+       PGLZ_HistEntry     hist_prealloc[PGLZ_HISTORY_PREALLOC];
+       PGLZ_HistEntry     *hist_next;
+
+       unsigned char      *bp = ((unsigned char *)dest) + sizeof(PGLZ_Header);
+       unsigned char      *bstart = bp;
+       char                       *dp = source;
+       char                       *dend = source + slen;
+       unsigned char           ctrl_dummy = 0;
+       unsigned char      *ctrlp = &ctrl_dummy;
+       unsigned char           ctrlb = 0;
+       unsigned char           ctrl = 0;
+       int32                           match_len;
+       int32                           match_off;
+       int32                           good_match;
+       int32                           good_drop;
+       int32                           do_compress = 1;
+       int32                           result_size = -1;
+       int32                           result_max;
+       int32                           need_rate;
+
+       /* ----------
+        * Our fallback strategy is the default.
+        * ----------
+        */
+       if (strategy == NULL)
+               strategy = PGLZ_strategy_default;
+
+       /* ----------
+        * Save the original source size in the header.
+        * ----------
+        */
+       dest->rawsize = slen;
+
+       /* ----------
+        * If the strategy forbids compression (at all or if source chunk too
+        * small), copy input to output without compression.
+        * ----------
+        */
+       if (strategy->match_size_good == 0)
+       {
+               memcpy(bstart, source, slen);
+               return (dest->varsize = slen + sizeof(PGLZ_Header));
+       } else {
+               if (slen < strategy->min_input_size)
+               {
+                       memcpy(bstart, source, slen);
+                       return (dest->varsize = slen + sizeof(PGLZ_Header));
+               }
+       }
+
+       /* ----------
+        * Limit the match size to the maximum implementation allowed value
+        * and to no less than 17. The drop rate is a percentage and is
+        * clamped to 0-100.
+        * ----------
+        */
+       if ((good_match = strategy->match_size_good) > PGLZ_MAX_MATCH)
+               good_match = PGLZ_MAX_MATCH;
+       if (good_match < 17)
+               good_match = 17;
+
+       if ((good_drop = strategy->match_size_drop) < 0)
+               good_drop = 0;
+       if (good_drop > 100)
+               good_drop = 100;
+
+       /* ----------
+        * Initialize the history tables. For inputs smaller than
+        * PGLZ_HISTORY_PREALLOC, we already have a big enough history
+        * table on the stack frame.
+        * ----------
+        */
+       memset((void *)hist_start, 0, sizeof(hist_start));
+       if (slen + 1 <= PGLZ_HISTORY_PREALLOC)
+               hist_alloc = hist_prealloc;
+       else
+               hist_alloc = (PGLZ_HistEntry *)
+                                                       palloc(sizeof(PGLZ_HistEntry) * (slen + 1));
+       hist_next = hist_alloc;
+
+       /* ----------
+        * Compute the maximum result size allowed by the strategy.
+        * If the input size reaches force_input_size, the max result size
+        * is the input size itself.
+        * Otherwise, it is the input size minus the minimum wanted
+        * compression rate (a percentage, clamped to 0-99).
+        * ----------
+        */
+       if (slen >= strategy->force_input_size)
+       {
+               result_max = slen;
+       } else {
+               need_rate = strategy->min_comp_rate;
+               if (need_rate < 0)
+                       need_rate = 0;
+               else if (need_rate > 99)
+                       need_rate = 99;
+               result_max = slen - ((slen * need_rate) / 100);
+       }
+
+       /* ----------
+        * Compress the source directly into the output buffer.
+        * ----------
+        */
+       while (dp < dend)
+       {
+               /* ----------
+                * If we already reached the maximum result size, set the no
+                * compression flag and stop this. The check is made once per
+                * emitted tag or literal.
+                * ----------
+                */
+               if (bp - bstart >= result_max)
+               {
+                       do_compress = 0;
+                       break;
+               }
+
+               /* ----------
+                * Try to find a match in the history
+                * ----------
+                */
+               if (pglz_find_match(hist_start, dp, dend, &match_len, 
+                                                                               &match_off, good_match, good_drop))
+               {
+                       /* ----------
+                        * Create the tag and add history entries for
+                        * all matched characters.
+                        * ----------
+                        */
+                       pglz_out_tag(ctrlp, ctrlb, ctrl, bp, match_len, match_off);
+                       while(match_len--)
+                       {
+                               pglz_hist_add(hist_start, hist_next, dp, dend);
+                               dp++;   /* Do not do this ++ in the line above!         */
+                                               /* The macro would do it four times - Jan.      */
+                       }
+               } else {
+                       /* ----------
+                        * No match found. Copy one literal byte.
+                        * ----------
+                        */
+                       pglz_out_literal(ctrlp, ctrlb, ctrl, bp, *dp);
+                       pglz_hist_add(hist_start, hist_next, dp, dend);
+                       dp++;   /* Do not do this ++ in the line above!         */
+                                       /* The macro would do it four times - Jan.      */
+               }
+       }
+
+       /* ----------
+        * Get rid of the history (if allocated)
+        * ----------
+        */
+       if (hist_alloc != hist_prealloc)
+               pfree((void *)hist_alloc);
+
+       /* ----------
+        * If we are still in compressing mode, write out the last
+        * control byte and determine if the compression gained the
+        * rate requested by the strategy.
+        * ----------
+        */
+       if (do_compress)
+       {
+               *ctrlp = ctrlb;
+
+               result_size = bp - bstart;
+               if (result_size >= result_max) {
+                       do_compress = 0;
+               }
+       }
+
+       /* ----------
+        * Done - if we successfully compressed and matched the
+        * strategy's constraints, return the compressed result.
+        * Otherwise copy the original source over it and return
+        * the original length (plus the header size in both cases).
+        * ----------
+        */
+       if (do_compress)
+       {
+               return (dest->varsize = result_size + sizeof(PGLZ_Header));
+       } else {
+               memcpy(((char *)dest) + sizeof(PGLZ_Header), source, slen);
+               return (dest->varsize = slen + sizeof(PGLZ_Header));
+       }
+}
+
+
+/* ----------
+ * pglz_decompress -
+ *
+ *             Decompresses source into dest and returns the number of
+ *             bytes written to dest.
+ *
+ *             dest must have room for source->rawsize bytes. If varsize
+ *             equals rawsize plus the header size, the data was stored
+ *             uncompressed and is copied as is.
+ *
+ *             Otherwise the input is a sequence of control bytes, each
+ *             followed by up to 8 items (tags or literal bytes). Decoding
+ *             stops early, keeping the output produced so far, if the
+ *             input is damaged: a tag is cut off by the end of the input,
+ *             a tag refers to data in front of the output buffer, or the
+ *             output would grow beyond rawsize.
+ * ----------
+ */
+int
+pglz_decompress (PGLZ_Header *source, char *dest)
+{
+       unsigned char      *dp;
+       unsigned char      *dend;
+       unsigned char      *bp;
+       unsigned char      *bstart;
+       unsigned char      *bend;
+       unsigned char           ctrl;
+       int32                           ctrlc;
+       int32                           len;
+       int32                           off;
+       int                                     damaged = 0;
+
+       dp              = ((unsigned char *)source) + sizeof(PGLZ_Header);
+       dend    = ((unsigned char *)source) + source->varsize;
+       bstart  = (unsigned char *)dest;
+       bend    = bstart + source->rawsize;
+       bp              = bstart;
+
+       /* ----------
+        * Data that was stored uncompressed is just copied.
+        * ----------
+        */
+       if (source->varsize == source->rawsize + sizeof(PGLZ_Header))
+       {
+               memcpy(dest, dp, source->rawsize);
+               return source->rawsize;
+       }
+
+       while (dp < dend && !damaged)
+       {
+               /* ----------
+                * Read one control byte and process the next 8 items.
+                * ----------
+                */
+               ctrl = *dp++;
+               for (ctrlc = 0; ctrlc < 8 && dp < dend; ctrlc++)
+               {
+                       if (ctrl & 1)
+                       {
+                               /* ----------
+                                * A set control bit means TAG. Its first byte
+                                * contains the match length minus 3 in the lower
+                                * 4 bits and the upper 4 bits of the offset. The
+                                * next following byte contains the lower 8 bits
+                                * of the offset. If the length is coded as 18,
+                                * another extension tag byte tells how much longer
+                                * the match really was (0-255).
+                                *
+                                * Both tag bytes must lie inside the input.
+                                * ----------
+                                */
+                               if (dend - dp < 2)
+                               {
+                                       damaged = 1;
+                                       break;
+                               }
+                               len = (dp[0] & 0x0f) + 3;
+                               off = ((dp[0] & 0xf0) << 4) | dp[1];
+                               dp += 2;
+                               if (len == 18)
+                               {
+                                       if (dp >= dend)
+                                       {
+                                               damaged = 1;
+                                               break;
+                                       }
+                                       len += *dp++;
+                               }
+
+                               /* ----------
+                                * The match must start inside the output written
+                                * so far and must not run over the end of the
+                                * output buffer.
+                                * ----------
+                                */
+                               if (off == 0 || off > bp - bstart || len > bend - bp)
+                               {
+                                       damaged = 1;
+                                       break;
+                               }
+
+                               /* ----------
+                                * Now we copy the bytes specified by the tag from
+                                * OUTPUT to OUTPUT. It is dangerous and platform
+                                * dependent to use memcpy() here, because the copied
+                                * areas could overlap extremely!
+                                * ----------
+                                */
+                               while (len--)
+                               {
+                                       *bp = bp[-off];
+                                       bp++;
+                               }
+                       } else {
+                               /* ----------
+                                * An unset control bit means LITERAL BYTE. So we
+                                * just copy one from INPUT to OUTPUT, provided
+                                * there is still room for it.
+                                * ----------
+                                */
+                               if (bp >= bend)
+                               {
+                                       damaged = 1;
+                                       break;
+                               }
+                               *bp++ = *dp++;
+                       }
+
+                       /* ----------
+                        * Advance the control bit
+                        * ----------
+                        */
+                       ctrl >>= 1;
+               }
+       }
+
+       /* ----------
+        * That's it.
+        * ----------
+        */
+       return (char *)bp - dest;
+}
+
+
index 89ff91c8e840917a7108762c1f5279f71af89f2d..bb2a5b6dd7f36b66f3f5c4b0d64171aac7bfc87b 100644 (file)
@@ -6,7 +6,7 @@
  *
  * Copyright (c) 1994, Regents of the University of California
  *
- * $Id: pg_proc.h,v 1.105 1999/10/11 06:28:28 inoue Exp $
+ * $Id: pg_proc.h,v 1.106 1999/11/17 21:21:50 wieck Exp $
  *
  * NOTES
  *       The script catalog/genbki.sh reads this file and generates .bki
@@ -2338,6 +2338,28 @@ DESCR("larger of two numbers");
 DATA(insert OID = 1769 ( numeric_cmp                   PGUID 11 f t t 2 f 23 "1700 1700" 100 0 0 100  numeric_cmp - ));
 DESCR("compare two numbers");
 
+/* OIDs 1625 - 1639: LZTEXT data type */
+DATA(insert OID = 1626 ( lztextin                              PGUID 11 f t t 1 f 1625 "0" 100 0 0 100  lztextin - ));
+DESCR("(internal)");
+DATA(insert OID = 1627 ( lztextout                             PGUID 11 f t t 1 f 23 "0" 100 0 0 100  lztextout - ));
+DESCR("(internal)");
+DATA(insert OID = 1628 ( lztext_text                   PGUID 11 f t t 1 f 25 "1625" 100 0 0 100  lztext_text -));
+DESCR("convert lztext to text");
+DATA(insert OID = 1629 ( text                                  PGUID 11 f t t 1 f 25 "1625" 100 0 0 100  lztext_text -));
+DESCR("convert lztext to text");
+DATA(insert OID = 1630 ( text_lztext                   PGUID 11 f t t 1 f 1625 "25" 100 0 0 100  text_lztext -));
+DESCR("convert text to lztext");
+DATA(insert OID = 1631 ( lztext                                        PGUID 11 f t t 1 f 1625 "25" 100 0 0 100  text_lztext -));
+DESCR("convert text to lztext");
+DATA(insert OID = 1632 ( lztextlen                             PGUID 11 f t t 1 f 23 "1625" 100 0 1 0  lztextlen - ));
+DESCR("length");
+DATA(insert OID = 1633 ( length                                        PGUID 11 f t t 1 f 23 "1625" 100 0 1 0  lztextlen - ));
+DESCR("length");
+DATA(insert OID = 1634 ( lztextoctetlen                        PGUID 11 f t t 1 f 23 "1625" 100 0 1 0  lztextoctetlen - ));
+DESCR("octet length");
+DATA(insert OID = 1635 ( octet_length                  PGUID 11 f t t 1 f 23 "1625" 100 0 1 0  lztextoctetlen - ));
+DESCR("octet length");
+
 
 /*
  * prototypes for functions pg_proc.c
index aceadef757ef3dc2d784cbbb5aee36d07799b0bd..f154d016cef9113201f4d9a5642dab1211c29662 100644 (file)
@@ -7,7 +7,7 @@
  *
  * Copyright (c) 1994, Regents of the University of California
  *
- * $Id: pg_type.h,v 1.70 1999/10/18 14:14:04 momjian Exp $
+ * $Id: pg_type.h,v 1.71 1999/11/17 21:21:51 wieck Exp $
  *
  * NOTES
  *       the genbki.sh script reads this file and generates .bki
@@ -382,6 +382,11 @@ DATA(insert OID = 1296 ( timestamp  PGUID  4  19 t b t \054 0      0 timestamp_in time
 DESCR("date time timezone, limited-range ISO-formated date and time");
 #define TIMESTAMPOID   1296
 
+/* OIDS 1625 - 1639 */
+DATA(insert OID = 1625 ( lztext           PGUID -1  -1 f b t \054 0  0 lztextin lztextout lztextin lztextout i _null_ ));
+DESCR("variable-length string, stored compressed");
+#define LZTEXTOID              1625
+
 /* OIDS 1700 - 1799 */
 DATA(insert OID = 1700 ( numeric          PGUID -1  -1 f b t \054 0  0 numeric_in numeric_out numeric_in numeric_out i _null_ ));
 DESCR("numeric(precision, decimal), arbitrary precision number");
index fe6fd1171875fc531e383c6fa5a004b4ec2bcc61..1bf3273ca1316e70b060db14ac1ed4c25c770178 100644 (file)
@@ -6,7 +6,7 @@
  *
  * Copyright (c) 1994, Regents of the University of California
  *
- * $Id: builtins.h,v 1.89 1999/10/11 06:28:28 inoue Exp $
+ * $Id: builtins.h,v 1.90 1999/11/17 21:21:51 wieck Exp $
  *
  * NOTES
  *       This should normally only be included by fmgr.h.
@@ -30,6 +30,7 @@
 #include "utils/int8.h"
 #include "utils/nabstime.h"
 #include "utils/numeric.h"
+#include "utils/lztext.h"
 #include "access/heapam.h"             /* for HeapTuple */
 
 /*
@@ -627,4 +628,12 @@ HeapTuple  RI_FKey_setnull_upd(FmgrInfo *proinfo);
 HeapTuple      RI_FKey_setdefault_del(FmgrInfo *proinfo);
 HeapTuple      RI_FKey_setdefault_upd(FmgrInfo *proinfo);
 
+/* lztext.c */
+lztext    *lztextin(char *str);
+char      *lztextout(lztext *lz);
+text      *lztext_text(lztext *lz);
+lztext    *text_lztext(text *txt);
+int32          lztextlen(lztext *lz);
+int32          lztextoctetlen(lztext *lz);
+
 #endif  /* BUILTINS_H */
diff --git a/src/include/utils/lztext.h b/src/include/utils/lztext.h
new file mode 100644 (file)
index 0000000..c832806
--- /dev/null
@@ -0,0 +1,22 @@
+/* ----------
+ * lztext.h
+ *
+ * $Header: /cvsroot/pgsql/src/include/utils/Attic/lztext.h,v 1.1 1999/11/17 21:21:51 wieck Exp $
+ *
+ *     Definitions for the lztext compressed data type
+ * ----------
+ */
+
+#ifndef _LZTEXT_H_
+#define _LZTEXT_H_
+
+#include "utils/pg_lzcompress.h"
+
+
+/* ----------
+ * The internal storage format of an LZ compressed text field
+ * ----------
+ */
+typedef PGLZ_Header            lztext;
+
+#endif /* _LZTEXT_H_ */
diff --git a/src/include/utils/pg_lzcompress.h b/src/include/utils/pg_lzcompress.h
new file mode 100644 (file)
index 0000000..dba52fa
--- /dev/null
@@ -0,0 +1,125 @@
+/* ----------
+ * pg_lzcompress.h -
+ *
+ * $Header: /cvsroot/pgsql/src/include/utils/pg_lzcompress.h,v 1.1 1999/11/17 21:21:51 wieck Exp $
+ *
+ *     Definitions for the builtin LZ compressor
+ * ----------
+ */
+
+#ifndef _PG_LZCOMPRESS_H_
+#define _PG_LZCOMPRESS_H_
+
+
+/* ----------
+ * PGLZ_Header -
+ *
+ *      The information at the top of the compressed data.
+ *             The varsize must be kept the same data type as the value
+ *             in front of all variable size data types in PostgreSQL.
+ *
+ *             varsize         Total size in bytes of the stored data,
+ *                                     including this header.
+ *
+ *             rawsize         Size in bytes of the data in uncompressed
+ *                                     form. pglz_decompress() copies this many
+ *                                     bytes if the data was stored uncompressed.
+ * ----------
+ */
+typedef struct PGLZ_Header {
+    int32                       varsize;
+    int32                       rawsize;
+} PGLZ_Header;
+
+
+/* ----------
+ * PGLZ_MAX_OUTPUT -
+ *
+ *             Macro to compute the maximum buffer required for the
+ *             compression output. It is larger than the input, because
+ *             in the worst case, we cannot write out one single tag but
+ *             need one control byte per 8 literal data bytes (rounded
+ *             up, so that inputs shorter than 8 bytes get their control
+ *             byte too). 4 more bytes are reserved, because the compressor
+ *             tests the output size only before it emits an item and one
+ *             item is at most a control byte plus a 3 byte tag.
+ *
+ * PGLZ_RAW_SIZE -
+ *
+ *             The size of the uncompressed data as recorded in the header.
+ *
+ * PGLZ_IS_COMPRESSED -
+ *
+ *             True if varsize differs from rawsize plus the header size,
+ *             which is how pglz_decompress() recognizes compressed data.
+ *
+ *             The arguments are fully parenthesized, so any pointer
+ *             expression can be passed.
+ * ----------
+ */
+#define PGLZ_MAX_OUTPUT(_dlen)                 ((_dlen) + (((_dlen) + 7) >> 3)         \
+                                                                                                        + 4 + sizeof(PGLZ_Header))
+#define PGLZ_RAW_SIZE(_lzdata)                 ((_lzdata)->rawsize)
+#define PGLZ_IS_COMPRESSED(_lzdata)            ((_lzdata)->varsize !=                          \
+                                                                                (_lzdata)->rawsize + sizeof(PGLZ_Header))
+
+/* ----------
+ * PGLZ_Strategy -
+ *
+ *             Some values that control the compression algorithm.
+ *
+ *             min_input_size          Minimum input data size to start compression.
+ *
+ *             force_input_size        Input data size at which compressed storage is
+ *                                                     forced even if the compression rate drops below
+ *                                                     min_comp_rate (but not below 0).
+ *
+ *             min_comp_rate           Minimum compression rate (0-99%), the output
+ *                                                     must be smaller than the input. If that isn't
+ *                                                     the case, the compressor will throw away its
+ *                                                     output and copy the original, uncompressed data
+ *                                                     to the output buffer.
+ *
+ *             match_size_good         The initial GOOD match size when starting history
+ *                                                     lookup. When looking up the history to find a
+ *                                                     match that could be expressed as a tag, the
+ *                                                     algorithm does not always walk back entirely.
+ *                                                     A good match fast is usually better than the
+ *                                                     best possible one very late. For each iteration
+ *                                                     in the lookup, this value is lowered so the
+ *                                                     longer the lookup takes, the smaller matches
+ *                                                     are considered good.
+ *
+ *             match_size_drop         The percentage, match_size_good is lowered
+ *                                                     at each history check. Allowed values are
+ *                                                     0 (no change until end) to 100 (only check
+ *                                                     latest history entry at all).
+ * ----------
+ */
+typedef struct PGLZ_Strategy {
+       int32           min_input_size;
+       int32           force_input_size;
+       int32           min_comp_rate;
+       int32           match_size_good;
+       int32           match_size_drop;
+} PGLZ_Strategy;
+
+
+/* ----------
+ * The standard strategies
+ *
+ *             PGLZ_strategy_default           Starts compression only if input is
+ *                                                                     at least 256 bytes large. Stores output
+ *                                                                     uncompressed if compression does not
+ *                                                                     gain at least 20% size reduction but
+ *                                                                     input does not exceed 6K. Stops history
+ *                                                                     lookup if at least a 128 byte long
+ *                                                                     match has been found.
+ *
+ *                                                                     This is the default strategy if none
+ *                                                                     is given to pglz_compress().
+ *
+ *             PGLZ_strategy_allways           Starts compression on any infinitely
+ *                                                                     small input and does fallback to
+ *                                                                     uncompressed storage only if output
+ *                                                                     would be larger than input.
+ *
+ *             PGLZ_strategy_never                     Force pglz_compress to act as a custom
+ *                                                                     interface for memcpy(). Only useful
+ *                                                                     for generic interfacing.
+ * ----------
+ */
+extern PGLZ_Strategy   *PGLZ_strategy_default;
+extern PGLZ_Strategy   *PGLZ_strategy_allways;
+extern PGLZ_Strategy   *PGLZ_strategy_never;
+
+
+/* ----------
+ * Global function declarations
+ * ----------
+ */
+int    pglz_compress (char *source, int32 slen, PGLZ_Header *dest,
+                                                                        PGLZ_Strategy *strategy);
+int pglz_decompress (PGLZ_Header *source, char *dest);
+
+
+#endif /* _PG_LZCOMPRESS_H_ */
+