From: Jan Wieck Date: Wed, 17 Nov 1999 21:21:51 +0000 (+0000) Subject: The new LZ compression and an lztext data type based on it. X-Git-Tag: REL7_0~1167 X-Git-Url: https://granicus.if.org/sourcecode?a=commitdiff_plain;h=79c3b71c1be3a79ec2d1f4d64bdef13f0e0a086a;p=postgresql The new LZ compression and an lztext data type based on it. Jan --- diff --git a/src/backend/utils/adt/Makefile b/src/backend/utils/adt/Makefile index 74ef6285d2..b2999d0d0d 100644 --- a/src/backend/utils/adt/Makefile +++ b/src/backend/utils/adt/Makefile @@ -4,7 +4,7 @@ # Makefile for utils/adt # # IDENTIFICATION -# $Header: /cvsroot/pgsql/src/backend/utils/adt/Makefile,v 1.26 1999/09/30 14:54:22 wieck Exp $ +# $Header: /cvsroot/pgsql/src/backend/utils/adt/Makefile,v 1.27 1999/11/17 21:21:50 wieck Exp $ # #------------------------------------------------------------------------- @@ -35,7 +35,7 @@ OBJS = acl.o arrayfuncs.o arrayutils.o bool.o cash.o char.o chunk.o \ regexp.o regproc.o ruleutils.o selfuncs.o sets.o \ tid.o timestamp.o varchar.o varlena.o version.o \ network.o mac.o inet_net_ntop.o inet_net_pton.o \ - ri_triggers.o + ri_triggers.o pg_lzcompress.o lztext.o all: SUBSYS.o diff --git a/src/backend/utils/adt/lztext.c b/src/backend/utils/adt/lztext.c new file mode 100644 index 0000000000..49c37def91 --- /dev/null +++ b/src/backend/utils/adt/lztext.c @@ -0,0 +1,266 @@ +/* ---------- + * lztext.c - + * + * $Header: /cvsroot/pgsql/src/backend/utils/adt/Attic/lztext.c,v 1.1 1999/11/17 21:21:50 wieck Exp $ + * + * Text type with internal LZ compressed representation. Uses the + * standard PostgreSQL compression method. 
+ * ---------- + */ + +#include +#include +#include +#include +#include + +#include "postgres.h" +#include "utils/builtins.h" +#include "utils/palloc.h" +#include "utils/pg_lzcompress.h" + + +/* ---------- + * lztextin - + * + * Input function for data type lztext - compresses a C string + * ---------- + */ +lztext * +lztextin(char *str) +{ + lztext *result; + int32 rawsize; + lztext *tmp; + int tmp_size; + + /* ---------- + * Handle NULL + * ---------- + */ + if (str == NULL) + return NULL; + + /* ---------- + * Determine the input size and the worst case output size + * ---------- + */ + rawsize = strlen(str); + tmp_size = PGLZ_MAX_OUTPUT(rawsize); + + /* ---------- + * Allocate a temporary result and compress into it + * ---------- + */ + tmp = (lztext *) palloc(tmp_size); + pglz_compress(str, rawsize, tmp, NULL); + + /* ---------- + * If fewer than 256 bytes or less than a quarter of the temp + * value stay unused, return it as is and save a memcpy(). + * ---------- + */ + if (tmp_size - tmp->varsize < 256 || + tmp_size - tmp->varsize < tmp_size / 4) + { + result = tmp; + } else { + result = (lztext *) palloc(tmp->varsize); + memcpy(result, tmp, tmp->varsize); + pfree(tmp); + } + + return result; +} + + +/* ---------- + * lztextout - + * + * Output function for data type lztext - decompresses into a C string + * ---------- + */ +char * +lztextout(lztext *lz) +{ + char *result; + + /* ---------- + * Handle NULL - it is shown as a single dash + * ---------- + */ + if (lz == NULL) + { + result = (char *) palloc(2); + result[0] = '-'; + result[1] = '\0'; + return result; + } + + /* ---------- + * Allocate the result string - the required size is remembered + * in the lztext header so we don't need a temporary buffer or + * repeated realloc() calls. 
+ * ---------- + */ + result = (char *) palloc(PGLZ_RAW_SIZE(lz) + 1); + + /* ---------- + * Decompress and add terminating ZERO + * ---------- + */ + pglz_decompress(lz, result); + result[lz->rawsize] = '\0'; + + /* ---------- + * Return the result + * ---------- + */ + return result; +} + + +/* ---------- + * lztextlen - + * + * Logical length of lztext field (it is the uncompressed size + * of the original data). + * ---------- + */ +int32 +lztextlen(lztext *lz) +{ + /* ---------- + * Handle NULL - reported as length 0 + * ---------- + */ + if (lz == NULL) + return 0; + + /* ---------- + * Without multibyte support, it is the remembered rawsize + * ---------- + */ + return lz->rawsize; +} + + +/* ---------- + * lztextoctetlen - + * + * Physical length of lztext field (it is the compressed size + * plus the rawsize field). + * ---------- + */ +int32 +lztextoctetlen(lztext *lz) +{ + /* ---------- + * Handle NULL - reported as length 0 + * ---------- + */ + if (lz == NULL) + return 0; + + /* ---------- + * Return the varsize minus the VARSIZE field itself. + * ---------- + */ + return lz->varsize - sizeof(int32); +} + + +/* ---------- + * text_lztext - + * + * Convert text to lztext + * ---------- + */ +lztext * +text_lztext(text *txt) +{ + lztext *result; + int32 rawsize; + lztext *tmp; + int tmp_size; + char *str; + + /* ---------- + * Handle NULL + * ---------- + */ + if (txt == NULL) + return NULL; + + /* ---------- + * Determine the input size and the worst case output size + * ---------- + */ + rawsize = VARSIZE(txt) - VARHDRSZ; + str = VARDATA(txt); + tmp_size = PGLZ_MAX_OUTPUT(rawsize); + + /* ---------- + * Allocate a temporary result and compress into it + * ---------- + */ + tmp = (lztext *) palloc(tmp_size); + pglz_compress(str, rawsize, tmp, NULL); + + /* ---------- + * If fewer than 256 bytes or less than a quarter of the temp + * value stay unused, return it as is and save a memcpy(). 
+ * ---------- + */ + if (tmp_size - tmp->varsize < 256 || + tmp_size - tmp->varsize < tmp_size / 4) + { + result = tmp; + } else { + result = (lztext *) palloc(tmp->varsize); + memcpy(result, tmp, tmp->varsize); + pfree(tmp); + } + + return result; + + +} + + +/* ---------- + * lztext_text - + * + * Convert lztext to text + * ---------- + */ +text * +lztext_text(lztext *lz) +{ + text *result; + + /* ---------- + * Handle NULL + * ---------- + */ + if (lz == NULL) + return NULL; + + /* ---------- + * Allocate the text result with one spare byte and set its size + * ---------- + */ + result = (text *) palloc(lz->rawsize + VARHDRSZ + 1); + VARSIZE(result) = lz->rawsize + VARHDRSZ; + + /* ---------- + * Decompress directly into the text data area and zero the spare byte. + * ---------- + */ + pglz_decompress(lz, VARDATA(result)); + VARDATA(result)[lz->rawsize] = 0; + + return result; +} + + diff --git a/src/backend/utils/adt/pg_lzcompress.c b/src/backend/utils/adt/pg_lzcompress.c new file mode 100644 index 0000000000..88721d30a2 --- /dev/null +++ b/src/backend/utils/adt/pg_lzcompress.c @@ -0,0 +1,669 @@ +/* ---------- + * pg_lzcompress.c - + * + * $Header: /cvsroot/pgsql/src/backend/utils/adt/pg_lzcompress.c,v 1.1 1999/11/17 21:21:50 wieck Exp $ + * + * This is an implementation of LZ compression for PostgreSQL. + * It uses a simple history table and generates 2-3 byte tags + * capable of backward copy information for 3-273 bytes with + * an offset of max. 4095. + * + * Entry routines: + * + * int + * pglz_compress(char *source, int slen, PGLZ_Header *dest, + * PGLZ_Strategy *strategy); + * + * source is the input data to be compressed. + * + * slen is the length of the input data. + * + * dest is the output area for the compressed result. + * It must be big enough to hold the worst case of + * compression failure and can be computed by the + * macro PGLZ_MAX_OUTPUT(slen). Don't be surprised, + * it is larger than the input data size. 
+ * + * strategy is a pointer to some information controlling + * the compression algorithm. If NULL, the compiled-in + * default strategy is used. + * + * The return value is the number of bytes written to dest. + * + * int + * pglz_decompress(PGLZ_Header *source, char *dest) + * + * source is the compressed input. + * + * dest is the area where the uncompressed data will be + * written to. It is the caller's responsibility to + * provide enough space. The required amount can be + * obtained with the macro PGLZ_RAW_SIZE(source). + * + * The data is written to dest exactly as it was handed + * to pglz_compress(). No terminating zero byte is added. + * + * The return value is the number of bytes written to dest, + * which is the same value PGLZ_RAW_SIZE() returns. + * + * The compression algorithm and internal data format: + * + * PGLZ_Header is defined as + * + * typedef struct PGLZ_Header { + * int32 varsize; + * int32 rawsize; + * } PGLZ_Header; + * + * The header is followed by the compressed data itself. + * + * The algorithm is most easily explained by describing the + * process of decompression. + * + * If varsize == rawsize + sizeof(PGLZ_Header), then the data + * is stored uncompressed as plain bytes. Thus, the decompressor + * simply copies rawsize bytes from the location after the + * header to the destination. + * + * Otherwise the first byte after the header tells what to do + * the next 8 times. We call this the control byte. + * + * An unset bit in the control byte means that one uncompressed + * byte follows, which is copied from input to output. + * + * A set bit in the control byte means that a tag of 2-3 bytes + * follows. A tag contains information to copy some bytes that + * are already in the output buffer to the current location in + * the output. Let's call the three tag bytes T1, T2 and T3. The + * position of the data to copy is coded as an offset from the + * current output position. 
+ * + * The offset is in the upper nibble of T1 and in T2. 
+ * The length is in the lower nibble of T1. + * + * So the 16 bits of a 2 byte tag are coded as + * + * 7---T1--0 7---T2--0 + * OOOO LLLL OOOO OOOO + * + * This limits the offset to 1-4095 (12 bits) and the length + * to 3-18 (4 bits) because 3 is always added to it. To emit + * a tag of 2 bytes with a length of 2 only saves one control + * bit. But we lose one byte in the possible length of a tag. + * + * In the actual implementation, the 2 byte tag's length is + * limited to 3-17, because the value 0xF in the length nibble + * has special meaning. It means that the following + * byte (T3) has to be added to the length value of 18. That + * makes total limits of 1-4095 for offset and 3-273 for length. + * + * Now that we have successfully decoded a tag, we simply copy + * the output that occurred OFFSET bytes back to the current + * output location, for the specified LENGTH. Thus, a + * sequence of 200 spaces (think about bpchar fields) could be + * coded in 4 bytes. One literal space and a three byte tag to + * copy 199 bytes with a -1 offset. Wow - that's a compression + * rate of 98%! Well, the implementation needs to save the + * original data size too, so we need another 4 bytes for it + * and end up with a total compression rate of 96%, which is still + * worth a Wow. + * + * Acknowledgements: + * + * Many thanks to Adisak Pochanayon, whose article about SLZ + * inspired me to write the PostgreSQL compression this way. 
+ * + * Jan Wieck + * ---------- + */ +#include +#include +#include +#include +#include +#include + +#include "postgres.h" +#include "utils/palloc.h" +#include "utils/pg_lzcompress.h" + + +/* ---------- + * Local definitions + * ---------- + */ +#define PGLZ_HISTORY_SIZE 8192 +#define PGLZ_HISTORY_MASK 0x1fff +#define PGLZ_HISTORY_PREALLOC 8192 +#define PGLZ_MAX_MATCH 273 + + +/* ---------- + * PGLZ_HistEntry - + * + * Linked list entry for the backward history lookup + * ---------- + */ +typedef struct PGLZ_HistEntry { + struct PGLZ_HistEntry *next; + char *pos; +} PGLZ_HistEntry; + + +/* ---------- + * The provided standard strategies + * ---------- + */ +static PGLZ_Strategy strategy_default_data = { + 256, /* Data chunks smaller than 256 bytes are not compressed */ + 6144, /* Data chunks of 6K or more force compression */ + /* unless the compressed result is not smaller than the input */ + 20, /* Compression rates below 20% mean fallback to uncompressed */ + /* storage unless compression is forced by the previous parameter */ + 128, /* Stop history lookup if a match of 128 bytes is found */ + 10 /* Lower good match size by 10% at every lookup loop iteration. */ +}; +PGLZ_Strategy *PGLZ_strategy_default = &strategy_default_data; + + +static PGLZ_Strategy strategy_allways_data = { + 0, /* Chunks of any size are compressed */ + 0, /* Every input size forces compression */ + 0, /* We want to save at least one single byte */ + 128, /* Stop history lookup if a match of 128 bytes is found */ + 6 /* Look harder for a good match. 
*/ +}; +PGLZ_Strategy *PGLZ_strategy_allways = &strategy_allways_data; + + +static PGLZ_Strategy strategy_never_data = { + 0, /* not looked at */ + 0, /* not looked at */ + 0, /* not looked at */ + 0, /* Zero indicates "store uncompressed always" */ + 0 /* not looked at */ +}; +PGLZ_Strategy *PGLZ_strategy_never = &strategy_never_data; + + + +/* ---------- + * pglz_hist_idx - + * + * Computes the history table slot for the lookup by the next 4 + * characters in the input. 
+ * ---------- + */ +#if 1 +#define pglz_hist_idx(_s,_e) ( \ + (((_e) - (_s)) < 4) ? 0 : \ + ((((_s)[0] << 9) ^ ((_s)[1] << 6) ^ \ + ((_s)[2] << 3) ^ (_s)[3]) & (PGLZ_HISTORY_MASK)) \ + ) +#else +#define pglz_hist_idx(_s,_e) ( \ + (((_e) - (_s)) < 2) ? 0 : \ + ((((_s)[0] << 8) ^ (_s)[1]) & (PGLZ_HISTORY_MASK)) \ + ) +#endif + + +/* ---------- + * pglz_hist_add - + * + * Adds a new entry at the head of its history table slot. + * ---------- + */ +#define pglz_hist_add(_hs,_hn,_s,_e) { \ + int __hindex = pglz_hist_idx((_s),(_e)); \ + (_hn)->next = (_hs)[__hindex]; \ + (_hn)->pos = (_s); \ + (_hs)[__hindex] = (_hn)++; \ + } + + +/* ---------- + * pglz_out_ctrl - + * + * Outputs the last control byte and allocates a new one if needed. + * ---------- + */ +#define pglz_out_ctrl(__ctrlp,__ctrlb,__ctrl,__buf) { \ + if ((__ctrl & 0xff) == 0) \ + { \ + *__ctrlp = __ctrlb; \ + __ctrlp = __buf++; \ + __ctrlb = 0; \ + __ctrl = 1; \ + } \ +} + + +/* ---------- + * pglz_out_literal - + * + * Outputs a literal byte to the destination buffer including the + * appropriate control bit. + * ---------- + */ +#define pglz_out_literal(_ctrlp,_ctrlb,_ctrl,_buf,_byte) { \ + pglz_out_ctrl(_ctrlp,_ctrlb,_ctrl,_buf); \ + *_buf++ = (unsigned char)(_byte); \ + _ctrl <<= 1; \ +} + + +/* ---------- + * pglz_out_tag - + * + * Outputs a backward reference tag of 2-3 bytes (depending on + * the match length) to the destination buffer including the + * appropriate control bit. 
+ * ---------- + */ +#define pglz_out_tag(_ctrlp,_ctrlb,_ctrl,_buf,_len,_off) { \ + pglz_out_ctrl(_ctrlp,_ctrlb,_ctrl,_buf); \ + _ctrlb |= _ctrl; \ + _ctrl <<= 1; \ + if (_len > 17) \ + { \ + _buf[0] = (unsigned char)((((_off) & 0xf00) >> 4) | 0x0f); \ + _buf[1] = (unsigned char)((_off & 0xff)); \ + _buf[2] = (unsigned char)((_len) - 18); \ + _buf += 3; \ + } else { \ + _buf[0] = (unsigned char)((((_off) & 0xf00) >> 4) | (_len - 3)); \ + _buf[1] = (unsigned char)((_off) & 0xff); \ + _buf += 2; \ + } \ +} + + +/* ---------- + * pglz_find_match - + * + * Look up the history table to see if the current input matches + * another sequence of characters, starting somewhere earlier + * in the input buffer. + * ---------- + */ +static inline int +pglz_find_match (PGLZ_HistEntry **hstart, char *input, char *end, + int *lenp, int *offp, int good_match, int good_drop) +{ + PGLZ_HistEntry *hent; + int32 len = 0; + int32 off = 0; + int32 thislen; + int32 thisoff; + char *ip; + char *hp; + + /* ---------- + * Traverse the linked history list until a good enough + * match is found. + * ---------- + */ + hent = hstart[pglz_hist_idx(input, end)]; + while (hent && len < good_match) + { + /* ---------- + * Be happy with lesser good matches the more entries we visited. + * ---------- + */ + good_match -= (good_match * good_drop) /100; + + /* ---------- + * Stop if the offset does not fit into our tag anymore. + * ---------- + */ + thisoff = (ip = input) - (hp = hent->pos); + if (thisoff >= 0x0fff) + break; + + /* ---------- + * Determine length of match. A better match must be larger than + * the best so far. And if we already have a match of 16 or more + * bytes, it's worth the call overhead to use memcmp() to check + * if this match is equal for the same size. After that we must + * fall back to character by character comparison to know the + * exact position where the diff occurred. 
+ * ---------- + */ + if (len >= 16) + { + if (memcmp(ip, hp, len) != 0) + { + hent = hent->next; + continue; + } + thislen = len; + ip += len; + hp += len; + } else { + thislen = 0; + } + while (ip < end && *ip == *hp && thislen < PGLZ_MAX_MATCH) + { + thislen++; + ip++; + hp++; + } + + /* ---------- + * Remember this match as the best (if it is) + * ---------- + */ + if (thislen > len) + { + len = thislen; + off = thisoff; + } + + /* ---------- + * Advance to the next history entry + * ---------- + */ + hent = hent->next; + } + + /* ---------- + * Return match information only if it results at least in one + * byte reduction. + * ---------- + */ + if (len > 2) + { + *lenp = len; + *offp = off; + return 1; + } + + return 0; +} + + +/* ---------- + * pglz_compress - compress slen bytes of source into dest + * ---------- + */ +int +pglz_compress (char *source, int slen, PGLZ_Header *dest, PGLZ_Strategy *strategy) +{ + PGLZ_HistEntry *hist_start[PGLZ_HISTORY_SIZE]; + PGLZ_HistEntry *hist_alloc; + PGLZ_HistEntry hist_prealloc[PGLZ_HISTORY_PREALLOC]; + PGLZ_HistEntry *hist_next; + + unsigned char *bp = ((unsigned char *)dest) + sizeof(PGLZ_Header); + unsigned char *bstart = bp; + char *dp = source; + char *dend = source + slen; + unsigned char ctrl_dummy = 0; + unsigned char *ctrlp = &ctrl_dummy; + unsigned char ctrlb = 0; + unsigned char ctrl = 0; + int32 match_len; + int32 match_off; + int32 good_match; + int32 good_drop; + int32 do_compress = 1; + int32 result_size = -1; + int32 result_max; + int32 need_rate; + + /* ---------- + * Our fallback strategy is the default. + * ---------- + */ + if (strategy == NULL) + strategy = PGLZ_strategy_default; + + /* ---------- + * Save the original source size in the header. + * ---------- + */ + dest->rawsize = slen; + + /* ---------- + * If the strategy forbids compression (at all or if source chunk too + * small), copy input to output without compression. 
+ * ---------- + */ + if (strategy->match_size_good == 0) + { + memcpy(bstart, source, slen); + return (dest->varsize = slen + sizeof(PGLZ_Header)); + } else { + if (slen < strategy->min_input_size) + { + memcpy(bstart, source, slen); + return (dest->varsize = slen + sizeof(PGLZ_Header)); + } + } + + /* ---------- + * Limit the good match size to 17..PGLZ_MAX_MATCH and the drop to 0..100 + * ---------- + */ + if ((good_match = strategy->match_size_good) > PGLZ_MAX_MATCH) + good_match = PGLZ_MAX_MATCH; + if (good_match < 17) + good_match = 17; + + if ((good_drop = strategy->match_size_drop) < 0) + good_drop = 0; + if (good_drop > 100) + good_drop = 100; + + /* ---------- + * Initialize the history tables. For inputs smaller than + * PGLZ_HISTORY_PREALLOC, we already have a big enough history + * table on the stack frame. + * ---------- + */ + memset((void *)hist_start, 0, sizeof(hist_start)); + if (slen + 1 <= PGLZ_HISTORY_PREALLOC) + hist_alloc = hist_prealloc; + else + hist_alloc = (PGLZ_HistEntry *) + palloc(sizeof(PGLZ_HistEntry) * (slen + 1)); + hist_next = hist_alloc; + + /* ---------- + * Compute the maximum result size allowed by the strategy. + * If the input size reaches force_input_size, the max result size + * is the input size itself. + * Otherwise, it is the input size minus the minimum wanted + * compression rate. + * ---------- + */ + if (slen >= strategy->force_input_size) + { + result_max = slen; + } else { + need_rate = strategy->min_comp_rate; + if (need_rate < 0) + need_rate = 0; + else if (need_rate > 99) + need_rate = 99; + result_max = slen - ((slen * need_rate) / 100); + } + + /* ---------- + * Compress the source directly into the output buffer. + * ---------- + */ + while (dp < dend) + { + /* ---------- + * If we already reached the maximum result size, clear the compress + * flag and stop. This is checked once per literal or tag. 
+ * ---------- + */ + if (bp - bstart >= result_max) + { + do_compress = 0; + break; + } + + /* ---------- + * Try to find a match in the history + * ---------- + */ + if (pglz_find_match(hist_start, dp, dend, &match_len, + &match_off, good_match, good_drop)) + { + /* ---------- + * Create the tag and add history entries for + * all matched characters. + * ---------- + */ + pglz_out_tag(ctrlp, ctrlb, ctrl, bp, match_len, match_off); + while(match_len--) + { + pglz_hist_add(hist_start, hist_next, dp, dend); + dp++; /* Do not do this ++ in the line above! */ + /* The macro would do it four times - Jan. */ + } + } else { + /* ---------- + * No match found. Copy one literal byte. + * ---------- + */ + pglz_out_literal(ctrlp, ctrlb, ctrl, bp, *dp); + pglz_hist_add(hist_start, hist_next, dp, dend); + dp++; /* Do not do this ++ in the line above! */ + /* The macro would do it four times - Jan. */ + } + } + + /* ---------- + * Get rid of the history (if allocated) + * ---------- + */ + if (hist_alloc != hist_prealloc) + pfree((void *)hist_alloc); + + /* ---------- + * If we are still in compressing mode, write out the last + * control byte and determine if the compression gained the + * rate requested by the strategy. + * ---------- + */ + if (do_compress) + { + *ctrlp = ctrlb; + + result_size = bp - bstart; + if (result_size >= result_max) { + do_compress = 0; + } + } + + /* ---------- + * Done - if we successfully compressed and matched the + * strategy's constraints, return the compressed result. + * Otherwise copy the original source over it and return + * the original length. 
+ * ---------- + */ + if (do_compress) + { + return (dest->varsize = result_size + sizeof(PGLZ_Header)); + } else { + memcpy(((char *)dest) + sizeof(PGLZ_Header), source, slen); + return (dest->varsize = slen + sizeof(PGLZ_Header)); + } +} + + +/* ---------- + * pglz_decompress - decompress source into dest + * ---------- + */ +int +pglz_decompress (PGLZ_Header *source, char *dest) +{ + unsigned char *dp; + unsigned char *dend; + unsigned char *bp; + unsigned char ctrl; + int32 ctrlc; + int32 len; + int32 off; + + dp = ((unsigned char *)source) + sizeof(PGLZ_Header); + dend = ((unsigned char *)source) + source->varsize; + bp = (unsigned char *)dest; + + if (source->varsize == source->rawsize + sizeof(PGLZ_Header)) + { + memcpy(dest, dp, source->rawsize); + return source->rawsize; + } + + while (dp < dend) + { + /* ---------- + * Read one control byte and process the next 8 items. + * ---------- + */ + ctrl = *dp++; + for (ctrlc = 0; ctrlc < 8 && dp < dend; ctrlc++) + { + if (ctrl & 1) + { + /* ---------- + * A set control bit means TAG. The first byte contains the + * match length minus 3 and the upper 4 bits of the offset. + * The following byte contains the lower 8 bits of the offset. If + * the length is coded as 18, another extension tag byte + * tells how much longer the match really was (0-255). + * ---------- + */ + len = (dp[0] & 0x0f) + 3; + off = ((dp[0] & 0xf0) << 4) | dp[1]; + dp += 2; + if (len == 18) + { + len += *dp++; + } + + /* ---------- + * Now we copy the bytes specified by the tag from + * OUTPUT to OUTPUT. It is dangerous and platform + * dependent to use memcpy() here, because the copied + * areas could overlap extremely! + * ---------- + */ + while (len--) + { + *bp = bp[-off]; + bp++; + } + } else { + /* ---------- + * An unset control bit means LITERAL BYTE. So we + * just copy one from INPUT to OUTPUT. + * ---------- + */ + *bp++ = *dp++; + } + + /* ---------- + * Advance the control bit + * ---------- + */ + ctrl >>= 1; + } + } + + /* ---------- + * That's it. 
+ * ---------- + */ + return (char *)bp - dest; +} + + diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 89ff91c8e8..bb2a5b6dd7 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -6,7 +6,7 @@ * * Copyright (c) 1994, Regents of the University of California * - * $Id: pg_proc.h,v 1.105 1999/10/11 06:28:28 inoue Exp $ + * $Id: pg_proc.h,v 1.106 1999/11/17 21:21:50 wieck Exp $ * * NOTES * The script catalog/genbki.sh reads this file and generates .bki @@ -2338,6 +2338,28 @@ DESCR("larger of two numbers"); DATA(insert OID = 1769 ( numeric_cmp PGUID 11 f t t 2 f 23 "1700 1700" 100 0 0 100 numeric_cmp - )); DESCR("compare two numbers"); +/* OID's 1625 - 1639 LZTEXT data type */ +DATA(insert OID = 1626 ( lztextin PGUID 11 f t t 1 f 1625 "0" 100 0 0 100 lztextin - )); +DESCR("(internal)"); +DATA(insert OID = 1627 ( lztextout PGUID 11 f t t 1 f 23 "0" 100 0 0 100 lztextout - )); +DESCR("(internal)"); +DATA(insert OID = 1628 ( lztext_text PGUID 11 f t t 1 f 25 "1625" 100 0 0 100 lztext_text -)); +DESCR("convert lztext to text"); +DATA(insert OID = 1629 ( text PGUID 11 f t t 1 f 25 "1625" 100 0 0 100 lztext_text -)); +DESCR("convert lztext to text"); +DATA(insert OID = 1630 ( text_lztext PGUID 11 f t t 1 f 1625 "25" 100 0 0 100 text_lztext -)); +DESCR("convert text to lztext"); +DATA(insert OID = 1631 ( lztext PGUID 11 f t t 1 f 1625 "25" 100 0 0 100 text_lztext -)); +DESCR("convert text to lztext"); +DATA(insert OID = 1632 ( lztextlen PGUID 11 f t t 1 f 23 "1625" 100 0 1 0 lztextlen - )); +DESCR("length"); +DATA(insert OID = 1633 ( length PGUID 11 f t t 1 f 23 "1625" 100 0 1 0 lztextlen - )); +DESCR("length"); +DATA(insert OID = 1634 ( lztextoctetlen PGUID 11 f t t 1 f 23 "1625" 100 0 1 0 lztextoctetlen - )); +DESCR("octet length"); +DATA(insert OID = 1635 ( octet_length PGUID 11 f t t 1 f 23 "1625" 100 0 1 0 lztextoctetlen - )); +DESCR("octet length"); + /* * prototypes for functions pg_proc.c diff --git 
a/src/include/catalog/pg_type.h b/src/include/catalog/pg_type.h index aceadef757..f154d016ce 100644 --- a/src/include/catalog/pg_type.h +++ b/src/include/catalog/pg_type.h @@ -7,7 +7,7 @@ * * Copyright (c) 1994, Regents of the University of California * - * $Id: pg_type.h,v 1.70 1999/10/18 14:14:04 momjian Exp $ + * $Id: pg_type.h,v 1.71 1999/11/17 21:21:51 wieck Exp $ * * NOTES * the genbki.sh script reads this file and generates .bki @@ -382,6 +382,11 @@ DATA(insert OID = 1296 ( timestamp PGUID 4 19 t b t \054 0 0 timestamp_in time DESCR("date time timezone, limited-range ISO-formated date and time"); #define TIMESTAMPOID 1296 +/* OIDS 1625 - 1639 */ +DATA(insert OID = 1625 ( lztext PGUID -1 -1 f b t \054 0 0 lztextin lztextout lztextin lztextout i _null_ )); +DESCR("variable-length string, stored compressed"); +#define LZTEXTOID 1625 + /* OIDS 1700 - 1799 */ DATA(insert OID = 1700 ( numeric PGUID -1 -1 f b t \054 0 0 numeric_in numeric_out numeric_in numeric_out i _null_ )); DESCR("numeric(precision, decimal), arbitrary precision number"); diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index fe6fd11718..1bf3273ca1 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -6,7 +6,7 @@ * * Copyright (c) 1994, Regents of the University of California * - * $Id: builtins.h,v 1.89 1999/10/11 06:28:28 inoue Exp $ + * $Id: builtins.h,v 1.90 1999/11/17 21:21:51 wieck Exp $ * * NOTES * This should normally only be included by fmgr.h. 
@@ -30,6 +30,7 @@ #include "utils/int8.h" #include "utils/nabstime.h" #include "utils/numeric.h" +#include "utils/lztext.h" #include "access/heapam.h" /* for HeapTuple */ /* @@ -627,4 +628,12 @@ HeapTuple RI_FKey_setnull_upd(FmgrInfo *proinfo); HeapTuple RI_FKey_setdefault_del(FmgrInfo *proinfo); HeapTuple RI_FKey_setdefault_upd(FmgrInfo *proinfo); +/* lztext.c */ +lztext *lztextin(char *str); +char *lztextout(lztext *lz); +text *lztext_text(lztext *lz); +lztext *text_lztext(text *txt); +int32 lztextlen(lztext *lz); +int32 lztextoctetlen(lztext *lz); + #endif /* BUILTINS_H */ diff --git a/src/include/utils/lztext.h b/src/include/utils/lztext.h new file mode 100644 index 0000000000..c83280661c --- /dev/null +++ b/src/include/utils/lztext.h @@ -0,0 +1,22 @@ +/* ---------- + * lztext.h + * + * $Header: /cvsroot/pgsql/src/include/utils/Attic/lztext.h,v 1.1 1999/11/17 21:21:51 wieck Exp $ + * + * Definitions for the lztext compressed data type + * ---------- + */ + +#ifndef _LZTEXT_H_ +#define _LZTEXT_H_ + +#include "utils/pg_lzcompress.h" + + +/* ---------- + * The internal storage format of an LZ compressed text field + * ---------- + */ +typedef PGLZ_Header lztext; + +#endif /* _LZTEXT_H_ */ diff --git a/src/include/utils/pg_lzcompress.h b/src/include/utils/pg_lzcompress.h new file mode 100644 index 0000000000..dba52fa588 --- /dev/null +++ b/src/include/utils/pg_lzcompress.h @@ -0,0 +1,125 @@ +/* ---------- + * pg_lzcompress.h - + * + * $Header: /cvsroot/pgsql/src/include/utils/pg_lzcompress.h,v 1.1 1999/11/17 21:21:51 wieck Exp $ + * + * Definitions for the builtin LZ compressor + * ---------- + */ + +#ifndef _PG_LZCOMPRESS_H_ +#define _PG_LZCOMPRESS_H_ + + +/* ---------- + * PGLZ_Header - + * + * The information at the top of the compressed data. + * The varsize must be kept the same data type as the value + * in front of all variable size data types in PostgreSQL. 
+ * ---------- + */ +typedef struct PGLZ_Header { + int32 varsize; + int32 rawsize; +} PGLZ_Header; + + +/* ---------- + * PGLZ_MAX_OUTPUT - + * + * Macro to compute the maximum buffer required for the + * compression output. It is larger than the input, because + * in the worst case, we cannot write out one single tag but + * need one control byte per 8 literal data bytes plus the + * header in front of the data. + * ---------- + */ +#define PGLZ_MAX_OUTPUT(_dlen) ((_dlen) + (((_dlen) | 0x07) >> 3) \ + + sizeof(PGLZ_Header)) +#define PGLZ_RAW_SIZE(_lzdata) (_lzdata->rawsize) +#define PGLZ_IS_COMPRESSED(_lzdata) (_lzdata->varsize != \ + _lzdata->rawsize + sizeof(PGLZ_Header)) + +/* ---------- + * PGLZ_Strategy - + * + * Some values that control the compression algorithm. + * + * min_input_size Minimum input data size to start compression. + * + * force_input_size Input data size at which compressed storage is + * forced even if the compression rate drops below + * min_comp_rate (but not below 0). + * + * min_comp_rate Minimum compression rate (0-99%), the output + * must be smaller than the input. If that isn't + * the case, the compressor will throw away its + * output and copy the original, uncompressed data + * to the output buffer. + * + * match_size_good The initial GOOD match size when starting history + * lookup. When looking up the history to find a + * match that could be expressed as a tag, the + * algorithm does not always walk back entirely. + * A good match fast is usually better than the + * best possible one very late. For each iteration + * in the lookup, this value is lowered so the + * longer the lookup takes, the smaller matches + * are considered good. + * + * match_size_drop The percentage by which match_size_good is lowered + * at each history check. Allowed values are + * 0 (no change until end) to 100 (only check + * latest history entry at all). 
+ * ---------- + */ +typedef struct PGLZ_Strategy { + int32 min_input_size; + int32 force_input_size; + int32 min_comp_rate; + int32 match_size_good; + int32 match_size_drop; +} PGLZ_Strategy; + + +/* ---------- + * The standard strategies + * + * PGLZ_strategy_default Starts compression only if input is + * at least 256 bytes large. Stores output + * uncompressed if compression does not + * gain at least 20% size reduction and + * input is smaller than 6K. Stops history + * lookup if at least a 128 byte long + * match has been found. + * + * This is the default strategy if none + * is given to pglz_compress(). + * + * PGLZ_strategy_allways Starts compression on any infinitely + * small input and falls back to + * uncompressed storage only if output + * would not be smaller than input. + * + * PGLZ_strategy_never Force pglz_compress to act as a custom + * interface for memcpy(). Only useful + * for generic interfacing. + * ---------- + */ +extern PGLZ_Strategy *PGLZ_strategy_default; +extern PGLZ_Strategy *PGLZ_strategy_allways; +extern PGLZ_Strategy *PGLZ_strategy_never; + + +/* ---------- + * Global function declarations + * ---------- + */ +int pglz_compress (char *source, int32 slen, PGLZ_Header *dest, + PGLZ_Strategy *strategy); +int pglz_decompress (PGLZ_Header *source, char *dest); + + +#endif /* _PG_LZCOMPRESS_H_ */ +