From f6b7d4fbbb72cedd53adb3bf10428dab559bb909 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Mon, 29 Sep 2014 12:29:24 -0400 Subject: [PATCH] Change JSONB's on-disk format for improved performance. The original design used an array of offsets into the variable-length portion of a JSONB container. However, such an array is basically uncompressible by simple compression techniques such as TOAST's LZ compressor. That's bad enough, but because the offset array is at the front, it tended to trigger the give-up-after-1KB heuristic in the TOAST code, so that the entire JSONB object was stored uncompressed; which was the root cause of bug #11109 from Larry White. To fix without losing the ability to extract a random array element in O(1) time, change this scheme so that most of the JEntry array elements hold lengths rather than offsets. With data that's compressible at all, there tend to be fewer distinct element lengths, so that there is scope for compression of the JEntry array. Every N'th entry is still an offset. To determine the length or offset of any specific element, we might have to examine up to N preceding JEntrys, but that's still O(1) so far as the total container size is concerned. Testing shows that this cost is negligible compared to other costs of accessing a JSONB field, and that the method does largely fix the incompressible-data problem. While at it, rearrange the order of elements in a JSONB object so that it's "all the keys, then all the values" not alternating keys and values. This doesn't really make much difference right at the moment, but it will allow providing a fast path for extracting individual object fields from large JSONB values stored EXTERNAL (ie, uncompressed), analogously to the existing optimization for substring extraction from large EXTERNAL text values. Bump catversion to denote the incompatibility in on-disk format. We will need to fix pg_upgrade to disallow upgrading jsonb data stored with 9.4 betas 1 and 2. Heikki Linnakangas and Tom Lane --- src/backend/utils/adt/jsonb.c | 4 +- src/backend/utils/adt/jsonb_util.c | 383 ++++++++++++++++++++--------- src/include/catalog/catversion.h | 2 +- src/include/utils/jsonb.h | 125 ++++++---- 4 files changed, 357 insertions(+), 157 deletions(-) diff --git a/src/backend/utils/adt/jsonb.c b/src/backend/utils/adt/jsonb.c index 2fd87fc9e1..9beebb3cb3 100644 --- a/src/backend/utils/adt/jsonb.c +++ b/src/backend/utils/adt/jsonb.c @@ -196,12 +196,12 @@ jsonb_from_cstring(char *json, int len) static size_t checkStringLen(size_t len) { - if (len > JENTRY_POSMASK) + if (len > JENTRY_OFFLENMASK) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("string too long to represent as jsonb string"), errdetail("Due to an implementation restriction, jsonb strings cannot exceed %d bytes.", - JENTRY_POSMASK))); + JENTRY_OFFLENMASK))); return len; } diff --git a/src/backend/utils/adt/jsonb_util.c b/src/backend/utils/adt/jsonb_util.c index 04f35bfffc..f157df3532 100644 --- a/src/backend/utils/adt/jsonb_util.c +++ b/src/backend/utils/adt/jsonb_util.c @@ -26,15 +26,16 @@ * in MaxAllocSize, and the number of elements (or pairs) must fit in the bits * reserved for that in the JsonbContainer.header field. * - * (the total size of an array's elements is also limited by JENTRY_POSMASK, - * but we're not concerned about that here) + * (The total size of an array's or object's elements is also limited by + * JENTRY_OFFLENMASK, but we're not concerned about that here.) */ #define JSONB_MAX_ELEMS (Min(MaxAllocSize / sizeof(JsonbValue), JB_CMASK)) #define JSONB_MAX_PAIRS (Min(MaxAllocSize / sizeof(JsonbPair), JB_CMASK)) -static void fillJsonbValue(JEntry *array, int index, char *base_addr, +static void fillJsonbValue(JsonbContainer *container, int index, + char *base_addr, uint32 offset, JsonbValue *result); -static bool equalsJsonbScalarValue(JsonbValue *a, JsonbValue *b); +static bool equalsJsonbScalarValue(JsonbValue *a, JsonbValue *b); static int compareJsonbScalarValue(JsonbValue *a, JsonbValue *b); static Jsonb *convertToJsonb(JsonbValue *val); static void convertJsonbValue(StringInfo buffer, JEntry *header, JsonbValue *val, int level); @@ -42,7 +43,7 @@ static void convertJsonbArray(StringInfo buffer, JEntry *header, JsonbValue *val static void convertJsonbObject(StringInfo buffer, JEntry *header, JsonbValue *val, int level); static void convertJsonbScalar(StringInfo buffer, JEntry *header, JsonbValue *scalarVal); -static int reserveFromBuffer(StringInfo buffer, int len); +static int reserveFromBuffer(StringInfo buffer, int len); static void appendToBuffer(StringInfo buffer, const char *data, int len); static void copyToBuffer(StringInfo buffer, int offset, const char *data, int len); static short padBufferToInt(StringInfo buffer); @@ -107,6 +108,58 @@ JsonbValueToJsonb(JsonbValue *val) return out; } +/* + * Get the offset of the variable-length portion of a Jsonb node within + * the variable-length-data part of its container. The node is identified + * by index within the container's JEntry array. + */ +uint32 +getJsonbOffset(const JsonbContainer *jc, int index) +{ + uint32 offset = 0; + int i; + + /* + * Start offset of this entry is equal to the end offset of the previous + * entry. Walk backwards to the most recent entry stored as an end + * offset, returning that offset plus any lengths in between. + */ + for (i = index - 1; i >= 0; i--) + { + offset += JBE_OFFLENFLD(jc->children[i]); + if (JBE_HAS_OFF(jc->children[i])) + break; + } + + return offset; +} + +/* + * Get the length of the variable-length portion of a Jsonb node. + * The node is identified by index within the container's JEntry array. + */ +uint32 +getJsonbLength(const JsonbContainer *jc, int index) +{ + uint32 off; + uint32 len; + + /* + * If the length is stored directly in the JEntry, just return it. + * Otherwise, get the begin offset of the entry, and subtract that from + * the stored end+1 offset. + */ + if (JBE_HAS_OFF(jc->children[index])) + { + off = getJsonbOffset(jc, index); + len = JBE_OFFLENFLD(jc->children[index]) - off; + } + else + len = JBE_OFFLENFLD(jc->children[index]); + + return len; +} + /* * BT comparator worker function. Returns an integer less than, equal to, or * greater than zero, indicating whether a is less than, equal to, or greater @@ -201,7 +254,7 @@ compareJsonbContainers(JsonbContainer *a, JsonbContainer *b) * * If the two values were of the same container type, then there'd * have been a chance to observe the variation in the number of - * elements/pairs (when processing WJB_BEGIN_OBJECT, say). They're + * elements/pairs (when processing WJB_BEGIN_OBJECT, say). They're * either two heterogeneously-typed containers, or a container and * some scalar type. * @@ -272,24 +325,33 @@ findJsonbValueFromContainer(JsonbContainer *container, uint32 flags, { JEntry *children = container->children; int count = (container->header & JB_CMASK); - JsonbValue *result = palloc(sizeof(JsonbValue)); + JsonbValue *result; Assert((flags & ~(JB_FARRAY | JB_FOBJECT)) == 0); + /* Quick out without a palloc cycle if object/array is empty */ + if (count <= 0) + return NULL; + + result = palloc(sizeof(JsonbValue)); + if (flags & JB_FARRAY & container->header) { char *base_addr = (char *) (children + count); + uint32 offset = 0; int i; for (i = 0; i < count; i++) { - fillJsonbValue(children, i, base_addr, result); + fillJsonbValue(container, i, base_addr, offset, result); if (key->type == result->type) { if (equalsJsonbScalarValue(key, result)) return result; } + + JBE_ADVANCE_OFFSET(offset, children[i]); } } else if (flags & JB_FOBJECT & container->header) @@ -297,36 +359,35 @@ findJsonbValueFromContainer(JsonbContainer *container, uint32 flags, /* Since this is an object, account for *Pairs* of Jentrys */ char *base_addr = (char *) (children + count * 2); uint32 stopLow = 0, - stopMiddle; + stopHigh = count; - /* Object key past by caller must be a string */ + /* Object key passed by caller must be a string */ Assert(key->type == jbvString); /* Binary search on object/pair keys *only* */ - while (stopLow < count) + while (stopLow < stopHigh) { - int index; + uint32 stopMiddle; int difference; JsonbValue candidate; - /* - * Note how we compensate for the fact that we're iterating - * through pairs (not entries) throughout. - */ - stopMiddle = stopLow + (count - stopLow) / 2; - - index = stopMiddle * 2; + stopMiddle = stopLow + (stopHigh - stopLow) / 2; candidate.type = jbvString; - candidate.val.string.val = base_addr + JBE_OFF(children, index); - candidate.val.string.len = JBE_LEN(children, index); + candidate.val.string.val = + base_addr + getJsonbOffset(container, stopMiddle); + candidate.val.string.len = getJsonbLength(container, stopMiddle); difference = lengthCompareJsonbStringValue(&candidate, key); if (difference == 0) { - /* Found our key, return value */ - fillJsonbValue(children, index + 1, base_addr, result); + /* Found our key, return corresponding value */ + int index = stopMiddle + count; + + fillJsonbValue(container, index, base_addr, + getJsonbOffset(container, index), + result); return result; } @@ -335,7 +396,7 @@ findJsonbValueFromContainer(JsonbContainer *container, uint32 flags, if (difference < 0) stopLow = stopMiddle + 1; else - count = stopMiddle; + stopHigh = stopMiddle; } } } @@ -368,7 +429,9 @@ getIthJsonbValueFromContainer(JsonbContainer *container, uint32 i) result = palloc(sizeof(JsonbValue)); - fillJsonbValue(container->children, i, base_addr, result); + fillJsonbValue(container, i, base_addr, + getJsonbOffset(container, i), + result); return result; } @@ -377,13 +440,20 @@ getIthJsonbValueFromContainer(JsonbContainer *container, uint32 i) * A helper function to fill in a JsonbValue to represent an element of an * array, or a key or value of an object. * + * The node's JEntry is at container->children[index], and its variable-length + * data is at base_addr + offset. We make the caller determine the offset + * since in many cases the caller can amortize that work across multiple + * children. When it can't, it can just call getJsonbOffset(). + * * A nested array or object will be returned as jbvBinary, ie. it won't be * expanded. */ static void -fillJsonbValue(JEntry *children, int index, char *base_addr, JsonbValue *result) +fillJsonbValue(JsonbContainer *container, int index, + char *base_addr, uint32 offset, + JsonbValue *result) { - JEntry entry = children[index]; + JEntry entry = container->children[index]; if (JBE_ISNULL(entry)) { @@ -392,14 +462,14 @@ fillJsonbValue(JEntry *children, int index, char *base_addr, JsonbValue *result) else if (JBE_ISSTRING(entry)) { result->type = jbvString; - result->val.string.val = base_addr + JBE_OFF(children, index); - result->val.string.len = JBE_LEN(children, index); + result->val.string.val = base_addr + offset; + result->val.string.len = getJsonbLength(container, index); Assert(result->val.string.len >= 0); } else if (JBE_ISNUMERIC(entry)) { result->type = jbvNumeric; - result->val.numeric = (Numeric) (base_addr + INTALIGN(JBE_OFF(children, index))); + result->val.numeric = (Numeric) (base_addr + INTALIGN(offset)); } else if (JBE_ISBOOL_TRUE(entry)) { @@ -415,8 +485,10 @@ fillJsonbValue(JEntry *children, int index, char *base_addr, JsonbValue *result) { Assert(JBE_ISCONTAINER(entry)); result->type = jbvBinary; - result->val.binary.data = (JsonbContainer *) (base_addr + INTALIGN(JBE_OFF(children, index))); - result->val.binary.len = JBE_LEN(children, index) - (INTALIGN(JBE_OFF(children, index)) - JBE_OFF(children, index)); + /* Remove alignment padding from data pointer and length */ + result->val.binary.data = (JsonbContainer *) (base_addr + INTALIGN(offset)); + result->val.binary.len = getJsonbLength(container, index) - + (INTALIGN(offset) - offset); } } @@ -668,13 +740,15 @@ recurse: * a full conversion */ val->val.array.rawScalar = (*it)->isScalar; - (*it)->i = 0; + (*it)->curIndex = 0; + (*it)->curDataOffset = 0; + (*it)->curValueOffset = 0; /* not actually used */ /* Set state for next call */ (*it)->state = JBI_ARRAY_ELEM; return WJB_BEGIN_ARRAY; case JBI_ARRAY_ELEM: - if ((*it)->i >= (*it)->nElems) + if ((*it)->curIndex >= (*it)->nElems) { /* * All elements within array already processed. Report this @@ -686,7 +760,13 @@ recurse: return WJB_END_ARRAY; } - fillJsonbValue((*it)->children, (*it)->i++, (*it)->dataProper, val); + fillJsonbValue((*it)->container, (*it)->curIndex, + (*it)->dataProper, (*it)->curDataOffset, + val); + + JBE_ADVANCE_OFFSET((*it)->curDataOffset, + (*it)->children[(*it)->curIndex]); + (*it)->curIndex++; if (!IsAJsonbScalar(val) && !skipNested) { @@ -697,8 +777,8 @@ recurse: else { /* - * Scalar item in array, or a container and caller didn't - * want us to recurse into it. + * Scalar item in array, or a container and caller didn't want + * us to recurse into it. */ return WJB_ELEM; } @@ -712,13 +792,16 @@ recurse: * v->val.object.pairs is not actually set, because we aren't * doing a full conversion */ - (*it)->i = 0; + (*it)->curIndex = 0; + (*it)->curDataOffset = 0; + (*it)->curValueOffset = getJsonbOffset((*it)->container, + (*it)->nElems); /* Set state for next call */ (*it)->state = JBI_OBJECT_KEY; return WJB_BEGIN_OBJECT; case JBI_OBJECT_KEY: - if ((*it)->i >= (*it)->nElems) + if ((*it)->curIndex >= (*it)->nElems) { /* * All pairs within object already processed. Report this to @@ -732,7 +815,9 @@ recurse: else { /* Return key of a key/value pair. */ - fillJsonbValue((*it)->children, (*it)->i * 2, (*it)->dataProper, val); + fillJsonbValue((*it)->container, (*it)->curIndex, + (*it)->dataProper, (*it)->curDataOffset, + val); if (val->type != jbvString) elog(ERROR, "unexpected jsonb type as object key"); @@ -745,8 +830,15 @@ recurse: /* Set state for next call */ (*it)->state = JBI_OBJECT_KEY; - fillJsonbValue((*it)->children, ((*it)->i++) * 2 + 1, - (*it)->dataProper, val); + fillJsonbValue((*it)->container, (*it)->curIndex + (*it)->nElems, + (*it)->dataProper, (*it)->curValueOffset, + val); + + JBE_ADVANCE_OFFSET((*it)->curDataOffset, + (*it)->children[(*it)->curIndex]); + JBE_ADVANCE_OFFSET((*it)->curValueOffset, + (*it)->children[(*it)->curIndex + (*it)->nElems]); + (*it)->curIndex++; /* * Value may be a container, in which case we recurse with new, @@ -795,11 +887,6 @@ iteratorFromContainer(JsonbContainer *container, JsonbIterator *parent) break; case JB_FOBJECT: - - /* - * Offset reflects that nElems indicates JsonbPairs in an object. - * Each key and each value contain Jentry metadata just the same. - */ it->dataProper = (char *) it->children + it->nElems * sizeof(JEntry) * 2; it->state = JBI_OBJECT_START; @@ -1209,8 +1296,8 @@ reserveFromBuffer(StringInfo buffer, int len) buffer->len += len; /* - * Keep a trailing null in place, even though it's not useful for us; - * it seems best to preserve the invariants of StringInfos. + * Keep a trailing null in place, even though it's not useful for us; it + * seems best to preserve the invariants of StringInfos. */ buffer->data[buffer->len] = '\0'; @@ -1284,8 +1371,8 @@ convertToJsonb(JsonbValue *val) /* * Note: the JEntry of the root is discarded. Therefore the root - * JsonbContainer struct must contain enough information to tell what - * kind of value it is. + * JsonbContainer struct must contain enough information to tell what kind + * of value it is. */ res = (Jsonb *) buffer.data; @@ -1298,10 +1385,10 @@ convertToJsonb(JsonbValue *val) /* * Subroutine of convertJsonb: serialize a single JsonbValue into buffer. * - * The JEntry header for this node is returned in *header. It is filled in - * with the length of this value, but if it is stored in an array or an - * object (which is always, except for the root node), it is the caller's - * responsibility to adjust it with the offset within the container. + * The JEntry header for this node is returned in *header. It is filled in + * with the length of this value and appropriate type bits. If we wish to + * store an end offset rather than a length, it is the caller's responsibility + * to adjust for that. * * If the value is an array or an object, this recurses. 'level' is only used * for debugging purposes. @@ -1315,10 +1402,10 @@ convertJsonbValue(StringInfo buffer, JEntry *header, JsonbValue *val, int level) return; /* - * A JsonbValue passed as val should never have a type of jbvBinary, - * and neither should any of its sub-components. Those values will be - * produced by convertJsonbArray and convertJsonbObject, the results of - * which will not be passed back to this function as an argument. + * A JsonbValue passed as val should never have a type of jbvBinary, and + * neither should any of its sub-components. Those values will be produced + * by convertJsonbArray and convertJsonbObject, the results of which will + * not be passed back to this function as an argument. */ if (IsAJsonbScalar(val)) @@ -1334,124 +1421,200 @@ convertJsonbValue(StringInfo buffer, JEntry *header, JsonbValue *val, int level) static void convertJsonbArray(StringInfo buffer, JEntry *pheader, JsonbValue *val, int level) { - int offset; - int metaoffset; + int base_offset; + int jentry_offset; int i; int totallen; uint32 header; + int nElems = val->val.array.nElems; - /* Initialize pointer into conversion buffer at this level */ - offset = buffer->len; + /* Remember where in the buffer this array starts. */ + base_offset = buffer->len; + /* Align to 4-byte boundary (any padding counts as part of my data) */ padBufferToInt(buffer); /* - * Construct the header Jentry, stored in the beginning of the variable- - * length payload. + * Construct the header Jentry and store it in the beginning of the + * variable-length payload. */ - header = val->val.array.nElems | JB_FARRAY; + header = nElems | JB_FARRAY; if (val->val.array.rawScalar) { - Assert(val->val.array.nElems == 1); + Assert(nElems == 1); Assert(level == 0); header |= JB_FSCALAR; } appendToBuffer(buffer, (char *) &header, sizeof(uint32)); - /* reserve space for the JEntries of the elements. */ - metaoffset = reserveFromBuffer(buffer, sizeof(JEntry) * val->val.array.nElems); + + /* Reserve space for the JEntries of the elements. */ + jentry_offset = reserveFromBuffer(buffer, sizeof(JEntry) * nElems); totallen = 0; - for (i = 0; i < val->val.array.nElems; i++) + for (i = 0; i < nElems; i++) { JsonbValue *elem = &val->val.array.elems[i]; int len; JEntry meta; + /* + * Convert element, producing a JEntry and appending its + * variable-length data to buffer + */ convertJsonbValue(buffer, &meta, elem, level + 1); - len = meta & JENTRY_POSMASK; + + len = JBE_OFFLENFLD(meta); totallen += len; - if (totallen > JENTRY_POSMASK) + /* + * Bail out if total variable-length data exceeds what will fit in a + * JEntry length field. We check this in each iteration, not just + * once at the end, to forestall possible integer overflow. + */ + if (totallen > JENTRY_OFFLENMASK) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("total size of jsonb array elements exceeds the maximum of %u bytes", - JENTRY_POSMASK))); + JENTRY_OFFLENMASK))); + + /* + * Convert each JB_OFFSET_STRIDE'th length to an offset. + */ + if ((i % JB_OFFSET_STRIDE) == 0) + meta = (meta & JENTRY_TYPEMASK) | totallen | JENTRY_HAS_OFF; - if (i > 0) - meta = (meta & ~JENTRY_POSMASK) | totallen; - copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry)); - metaoffset += sizeof(JEntry); + copyToBuffer(buffer, jentry_offset, (char *) &meta, sizeof(JEntry)); + jentry_offset += sizeof(JEntry); } - totallen = buffer->len - offset; + /* Total data size is everything we've appended to buffer */ + totallen = buffer->len - base_offset; + + /* Check length again, since we didn't include the metadata above */ + if (totallen > JENTRY_OFFLENMASK) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("total size of jsonb array elements exceeds the maximum of %u bytes", + JENTRY_OFFLENMASK))); - /* Initialize the header of this node, in the container's JEntry array */ + /* Initialize the header of this node in the container's JEntry array */ *pheader = JENTRY_ISCONTAINER | totallen; } static void convertJsonbObject(StringInfo buffer, JEntry *pheader, JsonbValue *val, int level) { - uint32 header; - int offset; - int metaoffset; + int base_offset; + int jentry_offset; int i; int totallen; + uint32 header; + int nPairs = val->val.object.nPairs; - /* Initialize pointer into conversion buffer at this level */ - offset = buffer->len; + /* Remember where in the buffer this object starts. */ + base_offset = buffer->len; + /* Align to 4-byte boundary (any padding counts as part of my data) */ padBufferToInt(buffer); - /* Initialize header */ - header = val->val.object.nPairs | JB_FOBJECT; + /* + * Construct the header Jentry and store it in the beginning of the + * variable-length payload. + */ + header = nPairs | JB_FOBJECT; appendToBuffer(buffer, (char *) &header, sizeof(uint32)); - /* reserve space for the JEntries of the keys and values */ - metaoffset = reserveFromBuffer(buffer, sizeof(JEntry) * val->val.object.nPairs * 2); + /* Reserve space for the JEntries of the keys and values. */ + jentry_offset = reserveFromBuffer(buffer, sizeof(JEntry) * nPairs * 2); + /* + * Iterate over the keys, then over the values, since that is the ordering + * we want in the on-disk representation. + */ totallen = 0; - for (i = 0; i < val->val.object.nPairs; i++) + for (i = 0; i < nPairs; i++) { - JsonbPair *pair = &val->val.object.pairs[i]; - int len; - JEntry meta; + JsonbPair *pair = &val->val.object.pairs[i]; + int len; + JEntry meta; - /* put key */ + /* + * Convert key, producing a JEntry and appending its variable-length + * data to buffer + */ convertJsonbScalar(buffer, &meta, &pair->key); - len = meta & JENTRY_POSMASK; + len = JBE_OFFLENFLD(meta); totallen += len; - if (totallen > JENTRY_POSMASK) + /* + * Bail out if total variable-length data exceeds what will fit in a + * JEntry length field. We check this in each iteration, not just + * once at the end, to forestall possible integer overflow. + */ + if (totallen > JENTRY_OFFLENMASK) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("total size of jsonb array elements exceeds the maximum of %u bytes", - JENTRY_POSMASK))); + errmsg("total size of jsonb object elements exceeds the maximum of %u bytes", + JENTRY_OFFLENMASK))); + + /* + * Convert each JB_OFFSET_STRIDE'th length to an offset. + */ + if ((i % JB_OFFSET_STRIDE) == 0) + meta = (meta & JENTRY_TYPEMASK) | totallen | JENTRY_HAS_OFF; + + copyToBuffer(buffer, jentry_offset, (char *) &meta, sizeof(JEntry)); + jentry_offset += sizeof(JEntry); + } + for (i = 0; i < nPairs; i++) + { + JsonbPair *pair = &val->val.object.pairs[i]; + int len; + JEntry meta; - if (i > 0) - meta = (meta & ~JENTRY_POSMASK) | totallen; - copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry)); - metaoffset += sizeof(JEntry); + /* + * Convert value, producing a JEntry and appending its variable-length + * data to buffer + */ + convertJsonbValue(buffer, &meta, &pair->value, level + 1); - convertJsonbValue(buffer, &meta, &pair->value, level); - len = meta & JENTRY_POSMASK; + len = JBE_OFFLENFLD(meta); totallen += len; - if (totallen > JENTRY_POSMASK) + /* + * Bail out if total variable-length data exceeds what will fit in a + * JEntry length field. We check this in each iteration, not just + * once at the end, to forestall possible integer overflow. + */ + if (totallen > JENTRY_OFFLENMASK) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), - errmsg("total size of jsonb array elements exceeds the maximum of %u bytes", - JENTRY_POSMASK))); + errmsg("total size of jsonb object elements exceeds the maximum of %u bytes", + JENTRY_OFFLENMASK))); - meta = (meta & ~JENTRY_POSMASK) | totallen; - copyToBuffer(buffer, metaoffset, (char *) &meta, sizeof(JEntry)); - metaoffset += sizeof(JEntry); + /* + * Convert each JB_OFFSET_STRIDE'th length to an offset. + */ + if (((i + nPairs) % JB_OFFSET_STRIDE) == 0) + meta = (meta & JENTRY_TYPEMASK) | totallen | JENTRY_HAS_OFF; + + copyToBuffer(buffer, jentry_offset, (char *) &meta, sizeof(JEntry)); + jentry_offset += sizeof(JEntry); } - totallen = buffer->len - offset; + /* Total data size is everything we've appended to buffer */ + totallen = buffer->len - base_offset; + + /* Check length again, since we didn't include the metadata above */ + if (totallen > JENTRY_OFFLENMASK) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("total size of jsonb object elements exceeds the maximum of %u bytes", + JENTRY_OFFLENMASK))); + /* Initialize the header of this node in the container's JEntry array */ *pheader = JENTRY_ISCONTAINER | totallen; } diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 78d1c6d1d4..7424499451 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 201409161 +#define CATALOG_VERSION_NO 201409291 #endif diff --git a/src/include/utils/jsonb.h b/src/include/utils/jsonb.h index 91e3e140b4..b89e4cb92c 100644 --- a/src/include/utils/jsonb.h +++ b/src/include/utils/jsonb.h @@ -83,9 +83,9 @@ typedef struct JsonbValue JsonbValue; * buffer is accessed, but they can also be deep copied and passed around. * * Jsonb is a tree structure. Each node in the tree consists of a JEntry - * header, and a variable-length content. The JEntry header indicates what - * kind of a node it is, e.g. a string or an array, and the offset and length - * of its variable-length portion within the container. + * header and a variable-length content (possibly of zero size). The JEntry + * header indicates what kind of a node it is, e.g. a string or an array, + * and provides the length of its variable-length portion. * * The JEntry and the content of a node are not stored physically together. * Instead, the container array or object has an array that holds the JEntrys @@ -95,40 +95,52 @@ typedef struct JsonbValue JsonbValue; * hold its JEntry. Hence, no JEntry header is stored for the root node. It * is implicitly known that the root node must be an array or an object, * so we can get away without the type indicator as long as we can distinguish - * the two. For that purpose, both an array and an object begins with a uint32 + * the two. For that purpose, both an array and an object begin with a uint32 * header field, which contains an JB_FOBJECT or JB_FARRAY flag. When a naked * scalar value needs to be stored as a Jsonb value, what we actually store is * an array with one element, with the flags in the array's header field set * to JB_FSCALAR | JB_FARRAY. * - * To encode the length and offset of the variable-length portion of each - * node in a compact way, the JEntry stores only the end offset within the - * variable-length portion of the container node. For the first JEntry in the - * container's JEntry array, that equals to the length of the node data. The - * begin offset and length of the rest of the entries can be calculated using - * the end offset of the previous JEntry in the array. - * * Overall, the Jsonb struct requires 4-bytes alignment. Within the struct, * the variable-length portion of some node types is aligned to a 4-byte * boundary, while others are not. When alignment is needed, the padding is * in the beginning of the node that requires it. For example, if a numeric * node is stored after a string node, so that the numeric node begins at * offset 3, the variable-length portion of the numeric node will begin with - * one padding byte. + * one padding byte so that the actual numeric data is 4-byte aligned. */ /* - * Jentry format. + * JEntry format. + * + * The least significant 28 bits store either the data length of the entry, + * or its end+1 offset from the start of the variable-length portion of the + * containing object. The next three bits store the type of the entry, and + * the high-order bit tells whether the least significant bits store a length + * or an offset. + * + * The reason for the offset-or-length complication is to compromise between + * access speed and data compressibility. In the initial design each JEntry + * always stored an offset, but this resulted in JEntry arrays with horrible + * compressibility properties, so that TOAST compression of a JSONB did not + * work well. Storing only lengths would greatly improve compressibility, + * but it makes random access into large arrays expensive (O(N) not O(1)). + * So what we do is store an offset in every JB_OFFSET_STRIDE'th JEntry and + * a length in the rest. This results in reasonably compressible data (as + * long as the stride isn't too small). We may have to examine as many as + * JB_OFFSET_STRIDE JEntrys in order to find out the offset or length of any + * given item, but that's still O(1) no matter how large the container is. * - * The least significant 28 bits store the end offset of the entry (see - * JBE_ENDPOS, JBE_OFF, JBE_LEN macros below). The next three bits - * are used to store the type of the entry. The most significant bit - * is unused, and should be set to zero. + * We could avoid eating a flag bit for this purpose if we were to store + * the stride in the container header, or if we were willing to treat the + * stride as an unchangeable constant. Neither of those options is very + * attractive though. */ typedef uint32 JEntry; -#define JENTRY_POSMASK 0x0FFFFFFF +#define JENTRY_OFFLENMASK 0x0FFFFFFF #define JENTRY_TYPEMASK 0x70000000 +#define JENTRY_HAS_OFF 0x80000000 /* values stored in the type bits */ #define JENTRY_ISSTRING 0x00000000 @@ -138,7 +150,9 @@ typedef uint32 JEntry; #define JENTRY_ISNULL 0x40000000 #define JENTRY_ISCONTAINER 0x50000000 /* array or object */ -/* Note possible multiple evaluations */ +/* Access macros. Note possible multiple evaluations */ +#define JBE_OFFLENFLD(je_) ((je_) & JENTRY_OFFLENMASK) +#define JBE_HAS_OFF(je_) (((je_) & JENTRY_HAS_OFF) != 0) #define JBE_ISSTRING(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISSTRING) #define JBE_ISNUMERIC(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISNUMERIC) #define JBE_ISCONTAINER(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISCONTAINER) @@ -147,20 +161,34 @@ typedef uint32 JEntry; #define JBE_ISBOOL_FALSE(je_) (((je_) & JENTRY_TYPEMASK) == JENTRY_ISBOOL_FALSE) #define JBE_ISBOOL(je_) (JBE_ISBOOL_TRUE(je_) || JBE_ISBOOL_FALSE(je_)) +/* Macro for advancing an offset variable to the next JEntry */ +#define JBE_ADVANCE_OFFSET(offset, je) \ + do { \ + JEntry je_ = (je); \ + if (JBE_HAS_OFF(je_)) \ + (offset) = JBE_OFFLENFLD(je_); \ + else \ + (offset) += JBE_OFFLENFLD(je_); \ + } while(0) + /* - * Macros for getting the offset and length of an element. Note multiple - * evaluations and access to prior array element. + * We store an offset, not a length, every JB_OFFSET_STRIDE children. + * Caution: this macro should only be referenced when creating a JSONB + * value. When examining an existing value, pay attention to the HAS_OFF + * bits instead. This allows changes in the offset-placement heuristic + * without breaking on-disk compatibility. */ -#define JBE_ENDPOS(je_) ((je_) & JENTRY_POSMASK) -#define JBE_OFF(ja, i) ((i) == 0 ? 0 : JBE_ENDPOS((ja)[i - 1])) -#define JBE_LEN(ja, i) ((i) == 0 ? JBE_ENDPOS((ja)[i]) \ - : JBE_ENDPOS((ja)[i]) - JBE_ENDPOS((ja)[i - 1])) +#define JB_OFFSET_STRIDE 32 /* * A jsonb array or object node, within a Jsonb Datum. * - * An array has one child for each element. An object has two children for - * each key/value pair. + * An array has one child for each element, stored in array order. + * + * An object has two children for each key/value pair. The keys all appear + * first, in key sort order; then the values appear, in an order matching the + * key order. This arrangement keeps the keys compact in memory, making a + * search for a particular key more cache-friendly. */ typedef struct JsonbContainer { @@ -172,8 +200,8 @@ typedef struct JsonbContainer } JsonbContainer; /* flags for the header-field in JsonbContainer */ -#define JB_CMASK 0x0FFFFFFF -#define JB_FSCALAR 0x10000000 +#define JB_CMASK 0x0FFFFFFF /* mask for count field */ +#define JB_FSCALAR 0x10000000 /* flag bits */ #define JB_FOBJECT 0x20000000 #define JB_FARRAY 0x40000000 @@ -248,18 +276,20 @@ struct JsonbValue (jsonbval)->type <= jbvBool) /* - * Pair within an Object. + * Key/value pair within an Object. * - * Pairs with duplicate keys are de-duplicated. We store the order for the - * benefit of doing so in a well-defined way with respect to the original - * observed order (which is "last observed wins"). This is only used briefly - * when originally constructing a Jsonb. + * This struct type is only used briefly while constructing a Jsonb; it is + * *not* the on-disk representation. + * + * Pairs with duplicate keys are de-duplicated. We store the originally + * observed pair ordering for the purpose of removing duplicates in a + * well-defined way (which is "last observed wins"). */ struct JsonbPair { JsonbValue key; /* Must be a jbvString */ JsonbValue value; /* May be of any type */ - uint32 order; /* preserves order of pairs with equal keys */ + uint32 order; /* Pair's index in original sequence */ }; /* Conversion state used when parsing Jsonb from text, or for type coercion */ @@ -287,20 +317,25 @@ typedef struct JsonbIterator { /* Container being iterated */ JsonbContainer *container; - uint32 nElems; /* Number of elements in children array (will be - * nPairs for objects) */ + uint32 nElems; /* Number of elements in children array (will + * be nPairs for objects) */ bool isScalar; /* Pseudo-array scalar value? */ - JEntry *children; + JEntry *children; /* JEntrys for child nodes */ + /* Data proper. This points to the beginning of the variable-length data */ + char *dataProper; - /* Current item in buffer (up to nElems, but must * 2 for objects) */ - int i; + /* Current item in buffer (up to nElems) */ + int curIndex; + + /* Data offset corresponding to current item */ + uint32 curDataOffset; /* - * Data proper. This points just past end of children array. - * We use the JBE_OFF() macro on the Jentrys to find offsets of each - * child in this area. + * If the container is an object, we want to return keys and values + * alternately; so curDataOffset points to the current key, and + * curValueOffset points to the current value. */ - char *dataProper; + uint32 curValueOffset; /* Private state */ JsonbIterState state; @@ -344,6 +379,8 @@ extern Datum gin_consistent_jsonb_path(PG_FUNCTION_ARGS); extern Datum gin_triconsistent_jsonb_path(PG_FUNCTION_ARGS); /* Support functions */ +extern uint32 getJsonbOffset(const JsonbContainer *jc, int index); +extern uint32 getJsonbLength(const JsonbContainer *jc, int index); extern int compareJsonbContainers(JsonbContainer *a, JsonbContainer *b); extern JsonbValue *findJsonbValueFromContainer(JsonbContainer *sheader, uint32 flags, -- 2.40.0