*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/access/heap/tuptoaster.c,v 1.83 2008/02/29 17:47:41 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/heap/tuptoaster.c,v 1.84 2008/03/07 23:20:21 tgl Exp $
*
*
* INTERFACE ROUTINES
/* ----------
* Compress and/or save external until data fits into target length
*
- * 1: Inline compress attributes with attstorage 'x'
+ * 1: Inline compress attributes with attstorage 'x', and store very
+ * large attributes with attstorage 'x' or 'e' external immediately
* 2: Store attributes with attstorage 'x' or 'e' external
* 3: Inline compress attributes with attstorage 'm'
* 4: Store attributes with attstorage 'm' external
maxDataLen = TOAST_TUPLE_TARGET - hoff;
/*
- * Look for attributes with attstorage 'x' to compress
+ * Look for attributes with attstorage 'x' to compress. Also find large
+ * attributes with attstorage 'x' or 'e', and store them external.
*/
while (heap_compute_data_size(tupleDesc,
toast_values, toast_isnull) > maxDataLen)
Datum new_value;
/*
- * Search for the biggest yet uncompressed internal attribute
+ * Search for the biggest yet unprocessed internal attribute
*/
for (i = 0; i < numAttrs; i++)
{
continue; /* can't happen, toast_action would be 'p' */
if (VARATT_IS_COMPRESSED(toast_values[i]))
continue;
- if (att[i]->attstorage != 'x')
+ if (att[i]->attstorage != 'x' && att[i]->attstorage != 'e')
continue;
if (toast_sizes[i] > biggest_size)
{
break;
/*
- * Attempt to compress it inline
+ * Attempt to compress it inline, if it has attstorage 'x'
*/
i = biggest_attno;
- old_value = toast_values[i];
- new_value = toast_compress_datum(old_value);
+ if (att[i]->attstorage == 'x')
+ {
+ old_value = toast_values[i];
+ new_value = toast_compress_datum(old_value);
- if (DatumGetPointer(new_value) != NULL)
+ if (DatumGetPointer(new_value) != NULL)
+ {
+ /* successful compression */
+ if (toast_free[i])
+ pfree(DatumGetPointer(old_value));
+ toast_values[i] = new_value;
+ toast_free[i] = true;
+ toast_sizes[i] = VARSIZE(toast_values[i]);
+ need_change = true;
+ need_free = true;
+ }
+ else
+ {
+ /* incompressible, ignore on subsequent compression passes */
+ toast_action[i] = 'x';
+ }
+ }
+ else
{
- /* successful compression */
+ /* has attstorage 'e', ignore on subsequent compression passes */
+ toast_action[i] = 'x';
+ }
+
+ /*
+ * If this value is by itself more than maxDataLen (after compression
+ * if any), push it out to the toast table immediately, if possible.
+ * This avoids uselessly compressing other fields in the common case
+ * where we have one long field and several short ones.
+ *
+ * XXX maybe the threshold should be less than maxDataLen?
+ */
+ if (toast_sizes[i] > maxDataLen &&
+ rel->rd_rel->reltoastrelid != InvalidOid)
+ {
+ old_value = toast_values[i];
+ toast_action[i] = 'p';
+ toast_values[i] = toast_save_datum(rel, toast_values[i],
+ use_wal, use_fsm);
if (toast_free[i])
pfree(DatumGetPointer(old_value));
- toast_values[i] = new_value;
toast_free[i] = true;
- toast_sizes[i] = VARSIZE(toast_values[i]);
need_change = true;
need_free = true;
}
- else
- {
- /*
- * incompressible data, ignore on subsequent compression passes
- */
- toast_action[i] = 'x';
- }
}
/*
}
else
{
- /*
- * incompressible data, ignore on subsequent compression passes
- */
+ /* incompressible, ignore on subsequent compression passes */
toast_action[i] = 'x';
}
}
Assert(!VARATT_IS_COMPRESSED(value));
/*
- * No point in wasting a palloc cycle if value is too short for
- * compression
+ * No point in wasting a palloc cycle if value size is out of the
+ * allowed range for compression
*/
- if (valsize < PGLZ_strategy_default->min_input_size)
+ if (valsize < PGLZ_strategy_default->min_input_size ||
+ valsize > PGLZ_strategy_default->max_input_size)
return PointerGetDatum(NULL);
tmp = (struct varlena *) palloc(PGLZ_MAX_OUTPUT(valsize));
+
+ /*
+ * We recheck the actual size even if pglz_compress() reports success,
+ * because it might be satisfied with having saved as little as one byte
+ * in the compressed data --- which could turn into a net loss once you
+ * consider header and alignment padding. Worst case, the compressed
+ * format might require three padding bytes (plus header, which is included
+ * in VARSIZE(tmp)), whereas the uncompressed format would take only one
+ * header byte and no padding if the value is short enough. So we insist
+ * on a savings of more than 2 bytes to ensure we have a gain.
+ */
if (pglz_compress(VARDATA_ANY(value), valsize,
(PGLZ_Header *) tmp, PGLZ_strategy_default) &&
- VARSIZE(tmp) < VARSIZE_ANY(value))
+ VARSIZE(tmp) < valsize - 2)
{
/* successful compression */
return PointerGetDatum(tmp);
* This is an implementation of LZ compression for PostgreSQL.
* It uses a simple history table and generates 2-3 byte tags
* capable of backward copy information for 3-273 bytes with
- * an offset of max. 4095.
+ * a max offset of 4095.
*
* Entry routines:
*
*
* Copyright (c) 1999-2008, PostgreSQL Global Development Group
*
- * $PostgreSQL: pgsql/src/backend/utils/adt/pg_lzcompress.c,v 1.29 2008/01/01 19:45:52 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/utils/adt/pg_lzcompress.c,v 1.30 2008/03/07 23:20:21 tgl Exp $
* ----------
*/
#include "postgres.h"
-#include <unistd.h>
-#include <fcntl.h>
+#include <limits.h>
#include "utils/pg_lzcompress.h"
* ----------
*/
static const PGLZ_Strategy strategy_default_data = {
- 256, /* Data chunks less than 256 bytes are not
- * compressed */
- 6144, /* Data chunks >= 6K force compression, unless
- * compressed output is larger than input */
- 20, /* Below 6K, compression rates below 20% mean
- * fallback to uncompressed */
- 128, /* Stop history lookup if a match of 128 bytes
- * is found */
- 10 /* Lower good match size by 10% at every
- * lookup loop iteration */
+ 32, /* Data chunks less than 32 bytes are not compressed */
+ 1024 * 1024, /* Data chunks over 1MB are not compressed either */
+ 25, /* Require 25% compression rate, or not worth it */
+ 1024, /* Give up if no compression in the first 1KB */
+ 128, /* Stop history lookup if a match of 128 bytes is found */
+ 10 /* Lower good match size by 10% at every loop iteration */
};
const PGLZ_Strategy *const PGLZ_strategy_default = &strategy_default_data;
static const PGLZ_Strategy strategy_always_data = {
- 0, /* Chunks of any size are compressed */
- 0,
- 0, /* It's enough to save one single byte */
- 128, /* Stop history lookup if a match of 128 bytes
- * is found */
- 6 /* Look harder for a good match */
+ 0, /* Chunks of any size are compressed */
+ INT_MAX,
+ 0, /* It's enough to save one single byte */
+ INT_MAX, /* Never give up early */
+ 128, /* Stop history lookup if a match of 128 bytes is found */
+ 6 /* Look harder for a good match */
};
const PGLZ_Strategy *const PGLZ_strategy_always = &strategy_always_data;
unsigned char *ctrlp = &ctrl_dummy;
unsigned char ctrlb = 0;
unsigned char ctrl = 0;
+ bool found_match = false;
int32 match_len;
int32 match_off;
int32 good_match;
strategy = PGLZ_strategy_default;
/*
- * If the strategy forbids compression (at all or if source chunk too
- * small), fail.
+ * If the strategy forbids compression (at all or if source chunk size
+ * out of range), fail.
*/
if (strategy->match_size_good <= 0 ||
- slen < strategy->min_input_size)
+ slen < strategy->min_input_size ||
+ slen > strategy->max_input_size)
return false;
/*
dest->rawsize = slen;
/*
- * Limit the match size to the maximum implementation allowed value
+ * Limit the match parameters to the supported range.
*/
- if ((good_match = strategy->match_size_good) > PGLZ_MAX_MATCH)
+ good_match = strategy->match_size_good;
+ if (good_match > PGLZ_MAX_MATCH)
good_match = PGLZ_MAX_MATCH;
- if (good_match < 17)
+ else if (good_match < 17)
good_match = 17;
- if ((good_drop = strategy->match_size_drop) < 0)
+ good_drop = strategy->match_size_drop;
+ if (good_drop < 0)
good_drop = 0;
- if (good_drop > 100)
+ else if (good_drop > 100)
good_drop = 100;
- /*
- * Initialize the history lists to empty. We do not need to zero the
- * hist_entries[] array; its entries are initialized as they are used.
- */
- memset((void *) hist_start, 0, sizeof(hist_start));
+ need_rate = strategy->min_comp_rate;
+ if (need_rate < 0)
+ need_rate = 0;
+ else if (need_rate > 99)
+ need_rate = 99;
/*
- * Compute the maximum result size allowed by the strategy. If the input
- * size exceeds force_input_size, the max result size is the input size
- * itself. Otherwise, it is the input size minus the minimum wanted
- * compression rate.
+ * Compute the maximum result size allowed by the strategy, namely
+ * the input size minus the minimum wanted compression rate. This had
+ * better be <= slen, else we might overrun the provided output buffer.
*/
- if (slen >= strategy->force_input_size)
- result_max = slen;
- else
+ if (slen > (INT_MAX/100))
{
- need_rate = strategy->min_comp_rate;
- if (need_rate < 0)
- need_rate = 0;
- else if (need_rate > 99)
- need_rate = 99;
- result_max = slen - ((slen * need_rate) / 100);
+ /* Approximate to avoid overflow */
+ result_max = (slen / 100) * (100 - need_rate);
}
+ else
+ result_max = (slen * (100 - need_rate)) / 100;
+
+ /*
+ * Initialize the history lists to empty. We do not need to zero the
+ * hist_entries[] array; its entries are initialized as they are used.
+ */
+ memset(hist_start, 0, sizeof(hist_start));
/*
* Compress the source directly into the output buffer.
if (bp - bstart >= result_max)
return false;
+ /*
+ * If we've emitted more than first_success_by bytes without finding
+ * anything compressible at all, fail. This lets us fall out
+ * reasonably quickly when looking at incompressible input (such as
+ * pre-compressed data).
+ */
+ if (!found_match && bp - bstart >= strategy->first_success_by)
+ return false;
+
/*
* Try to find a match in the history
*/
pglz_hist_add(hist_start, hist_entries,
hist_next, hist_recycle,
dp, dend);
- dp++; /* Do not do this ++ in the line above! */
+ dp++; /* Do not do this ++ in the line above! */
/* The macro would do it four times - Jan. */
}
+ found_match = true;
}
else
{
pglz_hist_add(hist_start, hist_entries,
hist_next, hist_recycle,
dp, dend);
- dp++; /* Do not do this ++ in the line above! */
+ dp++; /* Do not do this ++ in the line above! */
/* The macro would do it four times - Jan. */
}
}
*
* Definitions for the builtin LZ compressor
*
- * $PostgreSQL: pgsql/src/include/utils/pg_lzcompress.h,v 1.16 2007/11/15 21:14:45 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/utils/pg_lzcompress.h,v 1.17 2008/03/07 23:20:21 tgl Exp $
* ----------
*/
/* ----------
* PGLZ_Header -
*
- * The information at the top of the compressed data.
+ * The information at the start of the compressed data.
* ----------
*/
typedef struct PGLZ_Header
*
* Some values that control the compression algorithm.
*
- * min_input_size Minimum input data size to start compression.
+ * min_input_size Minimum input data size to consider compression.
*
- * force_input_size Minimum input data size to force compression
- * even if the compression rate drops below
- * min_comp_rate. But in any case the output
- * must be smaller than the input. If that isn't
- * the case, the compressor will throw away its
- * output and copy the original, uncompressed data
- * to the output buffer.
+ * max_input_size Maximum input data size to consider compression.
*
- * min_comp_rate Minimum compression rate (0-99%) to require for
- * inputs smaller than force_input_size. If not
- * achieved, the output will be uncompressed.
+ * min_comp_rate Minimum compression rate (0-99%) to require.
+ * Regardless of min_comp_rate, the output must be
+ * smaller than the input, else we don't store
+ * compressed.
+ *
+ * first_success_by Abandon compression if we find no compressible
+ * data within the first this-many bytes.
*
* match_size_good The initial GOOD match size when starting history
* lookup. When looking up the history to find a
typedef struct PGLZ_Strategy
{
int32 min_input_size;
- int32 force_input_size;
+ int32 max_input_size;
int32 min_comp_rate;
+ int32 first_success_by;
int32 match_size_good;
int32 match_size_drop;
} PGLZ_Strategy;
/* ----------
* The standard strategies
*
- * PGLZ_strategy_default Starts compression only if input is
- * at least 256 bytes large. Stores output
- * uncompressed if compression does not
- * gain at least 20% size reducture but
- * input does not exceed 6K. Stops history
- * lookup if at least a 128 byte long
- * match has been found.
- *
- * This is the default strategy if none
- * is given to pglz_compress().
+ * PGLZ_strategy_default Recommended default strategy for TOAST.
*
- * PGLZ_strategy_always Starts compression on any infinitely
- * small input and does fallback to
- * uncompressed storage only if output
- * would be larger than input.
+ * PGLZ_strategy_always Try to compress inputs of any length.
+ * Fallback to uncompressed storage only if
+ * output would be larger than input.
* ----------
*/
extern const PGLZ_Strategy *const PGLZ_strategy_default;