1 /*-------------------------------------------------------------------------
4 * utilities routines for the postgres GiST index access method.
7 * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/backend/access/gist/gistutil.c
12 *-------------------------------------------------------------------------
18 #include "access/gist_private.h"
19 #include "access/reloptions.h"
20 #include "storage/indexfsm.h"
21 #include "storage/lmgr.h"
22 #include "utils/builtins.h"
25 * static *S used for temporary storage (saves stack and palloc() call)
/*
 * Scratch arrays reused by gistunion() and gistgetadjusted() below.
 * File-scope statics: NOT reentrant/thread-safe, but backend code is
 * single-threaded so sharing one buffer per process is safe.
 */
28 static Datum attrS[INDEX_MAX_KEYS];
29 static bool isnullS[INDEX_MAX_KEYS];
32 * Write itup vector to page, has no control of free space.
/*
 * Append each of the len tuples in itup[] to "page", starting at offset
 * "off".  Caller is responsible for having checked free space beforehand
 * (see gistnospace()); a PageAddItem failure here is treated as corruption
 * and raised via elog(ERROR).
 */
35 gistfillbuffer(Page page, IndexTuple *itup, int len, OffsetNumber off)
37 OffsetNumber l = InvalidOffsetNumber;
/* InvalidOffsetNumber means "append after the current last item" */
40 if (off == InvalidOffsetNumber)
41 off = (PageIsEmpty(page)) ? FirstOffsetNumber :
42 OffsetNumberNext(PageGetMaxOffsetNumber(page));
44 for (i = 0; i < len; i++)
46 Size sz = IndexTupleSize(itup[i]);
48 l = PageAddItem(page, (Item) itup[i], sz, off, false, false);
49 if (l == InvalidOffsetNumber)
50 elog(ERROR, "failed to add item to GiST index page, item %d out of %d, size %d bytes",
57 * Check space for itup vector on page
/*
 * Return true if the len tuples in itvec[] (plus "freespace" slop) would
 * NOT fit on "page".  If "todelete" is a valid offset, that existing item
 * is assumed to be removed first, so its space counts as available.
 */
60 gistnospace(Page page, IndexTuple *itvec, int len, OffsetNumber todelete, Size freespace)
/* total bytes needed: requested slop + each tuple + its line pointer */
62 unsigned int size = freespace,
66 for (i = 0; i < len; i++)
67 size += IndexTupleSize(itvec[i]) + sizeof(ItemIdData);
69 if (todelete != InvalidOffsetNumber)
71 IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, todelete));
73 deleted = IndexTupleSize(itup) + sizeof(ItemIdData);
/* true => does not fit */
76 return (PageGetFreeSpace(page) + deleted < size);
/*
 * Return true if the len tuples in itvec[] (tuple bytes plus one line
 * pointer each) fit on an empty GiST page of size GiSTPageSize.
 */
80 gistfitpage(IndexTuple *itvec, int len)
85 for (i = 0; i < len; i++)
86 size += IndexTupleSize(itvec[i]) + sizeof(ItemIdData);
88 /* TODO: Consider fillfactor */
89 return (size <= GiSTPageSize);
93 * Read buffer into itup vector
/*
 * Build a palloc'd array of IndexTuple pointers for every item on "page",
 * returning the item count via *len.  NB: the pointers reference tuples
 * in place on the page; they are only valid while the page stays pinned
 * and unmodified.
 */
96 gistextractpage(Page page, int *len /* out */ )
102 maxoff = PageGetMaxOffsetNumber(page);
104 itvec = palloc(sizeof(IndexTuple) * maxoff);
105 for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
106 itvec[i - FirstOffsetNumber] = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
112 * join two vectors into one
/*
 * Grow itvec to hold addlen more entries and append additvec's pointers.
 * repalloc may move the array, so callers must use the returned pointer,
 * not the one passed in.  *len is updated to the combined length.
 */
115 gistjoinvector(IndexTuple *itvec, int *len, IndexTuple *additvec, int addlen)
117 itvec = (IndexTuple *) repalloc((void *) itvec, sizeof(IndexTuple) * ((*len) + addlen));
118 memmove(&itvec[*len], additvec, sizeof(IndexTuple) * addlen);
124 * make plain IndexTupleVector
/*
 * Flatten a vector of IndexTuple pointers into one contiguous palloc'd
 * buffer of concatenated tuples, returning the buffer; total byte size is
 * accumulated into *memlen.  Two passes: size first, then copy.
 */
128 gistfillitupvec(IndexTuple *vec, int veclen, int *memlen)
136 for (i = 0; i < veclen; i++)
137 *memlen += IndexTupleSize(vec[i]);
139 ptr = ret = palloc(*memlen);
141 for (i = 0; i < veclen; i++)
143 memcpy(ptr, vec[i], IndexTupleSize(vec[i]));
144 ptr += IndexTupleSize(vec[i]);
147 return (IndexTupleData *) ret;
151 * Make unions of keys in IndexTuple vector.
152 * Resulting Datums aren't compressed.
/*
 * For each index column from "startkey" onward, decompress that column's
 * key from every tuple in itvec[] into a GistEntryVector and invoke the
 * opclass "union" support function, storing the resulting (uncompressed)
 * Datum in attr[i] / isnull[i].
 */
156 gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len, int startkey,
157 Datum *attr, bool *isnull)
160 GistEntryVector *evec;
/* room for len entries plus 2 (see the one-entry duplication below) */
163 evec = (GistEntryVector *) palloc((len + 2) * sizeof(GISTENTRY) + GEVHDRSZ);
165 for (i = startkey; i < giststate->tupdesc->natts; i++)
172 gistentryinit(evec->vector[evec->n], attr[i],
173 NULL, NULL, (OffsetNumber) 0,
/* collect the non-null decompressed keys of column i from each tuple */
178 for (j = 0; j < len; j++)
183 datum = index_getattr(itvec[j], i + 1, giststate->tupdesc, &IsNull);
187 gistdentryinit(giststate, i,
188 evec->vector + evec->n,
190 NULL, NULL, (OffsetNumber) 0,
195 /* If this tuple vector was all NULLs, the union is NULL */
/* some union functions dislike a one-entry vector: duplicate the entry */
206 evec->vector[1] = evec->vector[0];
209 /* Make union and store in attr array */
210 attr[i] = FunctionCall2Coll(&giststate->unionFn[i],
211 giststate->supportCollation[i],
212 PointerGetDatum(evec),
213 PointerGetDatum(&attrsize));
221 * Return an IndexTuple containing the result of applying the "union"
222 * method to the specified IndexTuple vector.
/*
 * Thin wrapper over gistMakeUnionItVec(): unions all columns of all len
 * tuples into the shared attrS/isnullS scratch arrays, then forms a new
 * (compressed) index tuple from the result.
 */
225 gistunion(Relation r, IndexTuple *itvec, int len, GISTSTATE *giststate)
/* start from "all NULL"; gistMakeUnionItVec clears entries it fills */
227 memset(isnullS, TRUE, sizeof(bool) * giststate->tupdesc->natts);
229 gistMakeUnionItVec(giststate, itvec, len, 0, attrS, isnullS);
231 return gistFormTuple(giststate, r, attrS, isnullS, false);
235 * makes union of two key
/*
 * Union two already-decompressed keys of column "attno" into *dst /
 * *dstisnull.  NULL handling: both NULL => NULL result; exactly one NULL
 * => the union function is fed the non-null key twice (unioning a key
 * with itself), so a strict union function never sees a NULL.
 */
238 gistMakeUnionKey(GISTSTATE *giststate, int attno,
239 GISTENTRY *entry1, bool isnull1,
240 GISTENTRY *entry2, bool isnull2,
241 Datum *dst, bool *dstisnull)
/* static buffer: a 2-entry GistEntryVector, avoids a palloc per call */
246 static char storage[2 * sizeof(GISTENTRY) + GEVHDRSZ];
247 GistEntryVector *evec = (GistEntryVector *) storage;
251 if (isnull1 && isnull2)
258 if (isnull1 == FALSE && isnull2 == FALSE)
260 evec->vector[0] = *entry1;
261 evec->vector[1] = *entry2;
263 else if (isnull1 == FALSE)
/* only entry1 is valid: union it with itself */
265 evec->vector[0] = *entry1;
266 evec->vector[1] = *entry1;
/* only entry2 is valid: union it with itself */
270 evec->vector[0] = *entry2;
271 evec->vector[1] = *entry2;
275 *dst = FunctionCall2Coll(&giststate->unionFn[attno],
276 giststate->supportCollation[attno],
277 PointerGetDatum(evec),
278 PointerGetDatum(&dstsize));
/*
 * Compare two (uncompressed) keys of column "attno" using the opclass
 * "equal" support function; result is reported through the by-reference
 * bool that the function fills in.
 */
283 gistKeyIsEQ(GISTSTATE *giststate, int attno, Datum a, Datum b)
287 FunctionCall3Coll(&giststate->equalFn[attno],
288 giststate->supportCollation[attno],
290 PointerGetDatum(&result));
295 * Decompress all keys in tuple
/*
 * Fetch every attribute of "tuple" and run it through gistdentryinit(),
 * filling attdata[0..natts-1] and isnull[0..natts-1].  p/o identify the
 * source page/offset for the entries (may be NULL/0 for a free-standing
 * tuple, as gistgetadjusted() does).
 */
298 gistDeCompressAtt(GISTSTATE *giststate, Relation r, IndexTuple tuple, Page p,
299 OffsetNumber o, GISTENTRY *attdata, bool *isnull)
303 for (i = 0; i < r->rd_att->natts; i++)
305 Datum datum = index_getattr(tuple, i + 1, giststate->tupdesc, &isnull[i]);
307 gistdentryinit(giststate, i, &attdata[i],
314 * Forms union of oldtup and addtup, if union == oldtup then return NULL
/*
 * Compute the per-column union of oldtup and addtup (into the shared
 * attrS/isnullS scratch arrays).  If any column's union differs from
 * oldtup's key, build and return a replacement tuple carrying oldtup's
 * t_tid; otherwise return NULL, meaning the parent key needs no update.
 */
317 gistgetadjusted(Relation r, IndexTuple oldtup, IndexTuple addtup, GISTSTATE *giststate)
319 bool neednew = FALSE;
320 GISTENTRY oldentries[INDEX_MAX_KEYS],
321 addentries[INDEX_MAX_KEYS];
322 bool oldisnull[INDEX_MAX_KEYS],
323 addisnull[INDEX_MAX_KEYS];
324 IndexTuple newtup = NULL;
327 gistDeCompressAtt(giststate, r, oldtup, NULL,
328 (OffsetNumber) 0, oldentries, oldisnull);
330 gistDeCompressAtt(giststate, r, addtup, NULL,
331 (OffsetNumber) 0, addentries, addisnull);
333 for (i = 0; i < r->rd_att->natts; i++)
335 gistMakeUnionKey(giststate, i,
336 oldentries + i, oldisnull[i],
337 addentries + i, addisnull[i],
338 attrS + i, isnullS + i);
341 /* we already need new key, so we can skip check */
345 /* union of key may be NULL if and only if both keys are NULL */
/* old key NULL (union wasn't) or union != old key => must update */
350 if (oldisnull[i] || gistKeyIsEQ(giststate, i, oldentries[i].key, attrS[i]) == false)
357 /* need to update key */
358 newtup = gistFormTuple(giststate, r, attrS, isnullS, false);
/* keep the old tuple's downlink pointer */
359 newtup->t_tid = oldtup->t_tid;
366 * Search an upper index page for the entry with lowest penalty for insertion
367 * of the new index key contained in "it".
369 * Returns the index of the page entry to insert into.
/*
 * Penalties are compared column-by-column in index order (earlier columns
 * dominate); exact ties between tuples are broken by a coin flip, as the
 * long comment below explains.  "it" must already be in compressed form.
 */
372 gistchoose(Relation r, Page p, IndexTuple it, /* it has compressed entry */
373 GISTSTATE *giststate)
378 float best_penalty[INDEX_MAX_KEYS];
380 identry[INDEX_MAX_KEYS];
381 bool isnull[INDEX_MAX_KEYS];
382 int keep_current_best;
/* only internal pages have downlinks to choose among */
384 Assert(!GistPageIsLeaf(p));
/* decompress the new key once, outside the per-tuple loop */
386 gistDeCompressAtt(giststate, r,
387 it, NULL, (OffsetNumber) 0,
390 /* we'll return FirstOffsetNumber if page is empty (shouldn't happen) */
391 result = FirstOffsetNumber;
394 * The index may have multiple columns, and there's a penalty value for
395 * each column. The penalty associated with a column that appears earlier
396 * in the index definition is strictly more important than the penalty of
397 * a column that appears later in the index definition.
399 * best_penalty[j] is the best penalty we have seen so far for column j,
400 * or -1 when we haven't yet examined column j. Array entries to the
401 * right of the first -1 are undefined.
403 best_penalty[0] = -1;
406 * If we find a tuple that's exactly as good as the currently best one, we
407 * could use either one. When inserting a lot of tuples with the same or
408 * similar keys, it's preferable to descend down the same path when
409 * possible, as that's more cache-friendly. On the other hand, if all
410 * inserts land on the same leaf page after a split, we're never going to
411 * insert anything to the other half of the split, and will end up using
412 * only 50% of the available space. Distributing the inserts evenly would
413 * lead to better space usage, but that hurts cache-locality during
414 * insertion. To get the best of both worlds, when we find a tuple that's
415 * exactly as good as the previous best, choose randomly whether to stick
416 * to the old best, or use the new one. Once we decide to stick to the
417 * old best, we keep sticking to it for any subsequent equally good tuples
418 * we might find. This favors tuples with low offsets, but still allows
419 * some inserts to go to other equally-good subtrees.
421 * keep_current_best is -1 if we haven't yet had to make a random choice
422 * whether to keep the current best tuple. If we have done so, and
423 * decided to keep it, keep_current_best is 1; if we've decided to
424 * replace, keep_current_best is 0. (This state will be reset to -1 as
425 * soon as we've made the replacement, but sometimes we make the choice in
426 * advance of actually finding a replacement best tuple.)
428 keep_current_best = -1;
431 * Loop over tuples on page.
433 maxoff = PageGetMaxOffsetNumber(p);
434 Assert(maxoff >= FirstOffsetNumber);
436 for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
438 IndexTuple itup = (IndexTuple) PageGetItem(p, PageGetItemId(p, i));
444 /* Loop over index attributes. */
445 for (j = 0; j < r->rd_att->natts; j++)
451 /* Compute penalty for this column. */
452 datum = index_getattr(itup, j + 1, giststate->tupdesc, &IsNull);
453 gistdentryinit(giststate, j, &entry, datum, r, p, i,
455 usize = gistpenalty(giststate, j, &entry, IsNull,
456 &identry[j], isnull[j]);
458 zero_penalty = false;
460 if (best_penalty[j] < 0 || usize < best_penalty[j])
463 * New best penalty for column. Tentatively select this tuple
464 * as the target, and record the best penalty. Then reset the
465 * next column's penalty to "unknown" (and indirectly, the
466 * same for all the ones to its right). This will force us to
467 * adopt this tuple's penalty values as the best for all the
468 * remaining columns during subsequent loop iterations.
471 best_penalty[j] = usize;
473 if (j < r->rd_att->natts - 1)
474 best_penalty[j + 1] = -1;
476 /* we have new best, so reset keep-it decision */
477 keep_current_best = -1;
479 else if (best_penalty[j] == usize)
482 * The current tuple is exactly as good for this column as the
483 * best tuple seen so far. The next iteration of this loop
484 * will compare the next column.
490 * The current tuple is worse for this column than the best
491 * tuple seen so far. Skip the remaining columns and move on
492 * to the next tuple, if any.
494 zero_penalty = false; /* so outer loop won't exit */
500 * If we looped past the last column, and did not update "result",
501 * then this tuple is exactly as good as the prior best tuple.
503 if (j == r->rd_att->natts && result != i)
505 if (keep_current_best == -1)
507 /* we didn't make the random choice yet for this old best */
508 keep_current_best = (random() <= (MAX_RANDOM_VALUE / 2)) ? 1 : 0;
510 if (keep_current_best == 0)
512 /* we choose to use the new tuple */
514 /* choose again if there are even more exactly-as-good ones */
515 keep_current_best = -1;
520 * If we find a tuple with zero penalty for all columns, and we've
521 * decided we don't want to search for another tuple with equal
522 * penalty, there's no need to examine remaining tuples; just break
523 * out of the loop and return it.
527 if (keep_current_best == -1)
529 /* we didn't make the random choice yet for this old best */
530 keep_current_best = (random() <= (MAX_RANDOM_VALUE / 2)) ? 1 : 0;
532 if (keep_current_best == 1)
541 * initialize a GiST entry with a decompressed version of key
/*
 * Initialize *e from key k (plus its rel/page/offset context), then run
 * the opclass "decompress" support function on it.  NULL keys skip the
 * support function and produce a zero-Datum entry.
 */
544 gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e,
545 Datum k, Relation r, Page pg, OffsetNumber o,
552 gistentryinit(*e, k, r, pg, o, l);
554 DatumGetPointer(FunctionCall1Coll(&giststate->decompressFn[nkey],
555 giststate->supportCollation[nkey],
556 PointerGetDatum(e)));
557 /* decompressFn may just return the given pointer */
/* a new GISTENTRY was returned: copy its contents into *e */
559 gistentryinit(*e, dep->key, dep->rel, dep->page, dep->offset,
/* NULL key case: entry with a zero Datum */
563 gistentryinit(*e, (Datum) 0, r, pg, o, l);
568 * initialize a GiST entry with a compressed version of key
/*
 * Mirror image of gistdentryinit(): initialize *e from key k and run the
 * opclass "compress" support function.  NULL keys skip the support
 * function and produce a zero-Datum entry.
 */
571 gistcentryinit(GISTSTATE *giststate, int nkey,
572 GISTENTRY *e, Datum k, Relation r,
573 Page pg, OffsetNumber o, bool l, bool isNull)
579 gistentryinit(*e, k, r, pg, o, l);
581 DatumGetPointer(FunctionCall1Coll(&giststate->compressFn[nkey],
582 giststate->supportCollation[nkey],
583 PointerGetDatum(e)));
584 /* compressFn may just return the given pointer */
/* a new GISTENTRY was returned: copy its contents into *e */
586 gistentryinit(*e, cep->key, cep->rel, cep->page, cep->offset,
/* NULL key case: entry with a zero Datum */
590 gistentryinit(*e, (Datum) 0, r, pg, o, l);
/*
 * Build an IndexTuple from per-column key Datums: each non-null key is
 * run through gistcentryinit() (opclass "compress"), then the compressed
 * values are assembled with index_form_tuple().
 *
 * Fix: the third argument to gistcentryinit() had been corrupted by an
 * HTML-entity mis-decoding ("&centry[i]" -> the cent-sign garble); the
 * intended address-of expression is restored.
 */
594 gistFormTuple(GISTSTATE *giststate, Relation r,
595 Datum attdata[], bool isnull[], bool newValues)
597 GISTENTRY centry[INDEX_MAX_KEYS];
598 Datum compatt[INDEX_MAX_KEYS];
/* compress each column's key; NULL columns get a zero Datum */
602 for (i = 0; i < r->rd_att->natts; i++)
605 compatt[i] = (Datum) 0;
608 gistcentryinit(giststate, i, &centry[i], attdata[i],
609 r, NULL, (OffsetNumber) 0,
612 compatt[i] = centry[i].key;
616 res = index_form_tuple(giststate->tupdesc, compatt, isnull);
619 * The offset number on tuples on internal pages is unused. For historical
620 * reasons, it is set 0xffff.
622 ItemPointerSetOffsetNumber(&(res->t_tid), 0xffff);
/*
 * Compute the penalty of adding key "add" to existing key "orig" for
 * column attno via the opclass "penalty" support function.  NULL rules:
 * a strict penalty function is only called when both keys are non-null;
 * both-NULL costs nothing, while mixing NULL and non-NULL is priced at
 * +infinity to discourage it.
 */
627 gistpenalty(GISTSTATE *giststate, int attno,
628 GISTENTRY *orig, bool isNullOrig,
629 GISTENTRY *add, bool isNullAdd)
/* call the support function unless it is strict and an input is NULL */
633 if (giststate->penaltyFn[attno].fn_strict == FALSE ||
634 (isNullOrig == FALSE && isNullAdd == FALSE))
636 FunctionCall3Coll(&giststate->penaltyFn[attno],
637 giststate->supportCollation[attno],
638 PointerGetDatum(orig),
639 PointerGetDatum(add),
640 PointerGetDatum(&penalty));
641 /* disallow negative or NaN penalty */
642 if (isnan(penalty) || penalty < 0.0)
645 else if (isNullOrig && isNullAdd)
649 /* try to prevent mixing null and non-null values */
650 penalty = get_float4_infinity();
657 * Initialize a new index page
/*
 * PageInit the buffer's page with a GISTPageOpaqueData special area, then
 * fill in the opaque fields (flags "f", no right link, page-id marker).
 * The GIST_PAGE_ID stamp lets tools distinguish GiST pages from other
 * index types sharing the same special-area size.
 */
660 GISTInitBuffer(Buffer b, uint32 f)
662 GISTPageOpaque opaque;
666 pageSize = BufferGetPageSize(b);
667 page = BufferGetPage(b);
668 PageInit(page, pageSize, sizeof(GISTPageOpaqueData));
670 opaque = GistPageGetOpaque(page);
671 /* page was already zeroed by PageInit, so this is not needed: */
672 /* memset(&(opaque->nsn), 0, sizeof(GistNSN)); */
673 opaque->rightlink = InvalidBlockNumber;
675 opaque->gist_page_id = GIST_PAGE_ID;
679 * Verify that a freshly-read page looks sane.
/*
 * Sanity-check a just-read GiST page: reject all-zero pages (never
 * initialized) and pages whose special-area size does not match
 * GISTPageOpaqueData.  Both failures report ERRCODE_INDEX_CORRUPTED
 * with a REINDEX hint.
 */
682 gistcheckpage(Relation rel, Buffer buf)
684 Page page = BufferGetPage(buf);
687 * ReadBuffer verifies that every newly-read page passes
688 * PageHeaderIsValid, which means it either contains a reasonably sane
689 * page header or is all-zero. We have to defend against the all-zero
694 (errcode(ERRCODE_INDEX_CORRUPTED),
695 errmsg("index \"%s\" contains unexpected zero page at block %u",
696 RelationGetRelationName(rel),
697 BufferGetBlockNumber(buf)),
698 errhint("Please REINDEX it.")));
701 * Additionally check that the special area looks sane.
703 if (PageGetSpecialSize(page) != MAXALIGN(sizeof(GISTPageOpaqueData)))
705 (errcode(ERRCODE_INDEX_CORRUPTED),
706 errmsg("index \"%s\" contains corrupted page at block %u",
707 RelationGetRelationName(rel),
708 BufferGetBlockNumber(buf)),
709 errhint("Please REINDEX it.")));
714 * Allocate a new page (either by recycling, or by extending the index file)
716 * The returned buffer is already pinned and exclusive-locked
718 * Caller is responsible for initializing the page by calling GISTInitBuffer
/*
 * Strategy: loop pulling candidate blocks from the free-space map; a
 * candidate is usable if we can conditionally lock it and it is either
 * never-initialized or marked deleted.  If the FSM runs dry, extend the
 * relation instead, taking the relation extension lock unless the
 * relation is backend-local.
 */
721 gistNewBuffer(Relation r)
726 /* First, try to get a page from FSM */
729 BlockNumber blkno = GetFreeIndexPage(r);
731 if (blkno == InvalidBlockNumber)
732 break; /* nothing left in FSM */
734 buffer = ReadBuffer(r, blkno);
737 * We have to guard against the possibility that someone else already
738 * recycled this page; the buffer may be locked if so.
740 if (ConditionalLockBuffer(buffer))
742 Page page = BufferGetPage(buffer);
745 return buffer; /* OK to use, if never initialized */
747 gistcheckpage(r, buffer);
749 if (GistPageIsDeleted(page))
750 return buffer; /* OK to use */
/* page is live: drop the lock and keep scanning the FSM */
752 LockBuffer(buffer, GIST_UNLOCK);
755 /* Can't use it, so release buffer and try again */
756 ReleaseBuffer(buffer);
759 /* Must extend the file */
760 needLock = !RELATION_IS_LOCAL(r);
763 LockRelationForExtension(r, ExclusiveLock);
765 buffer = ReadBuffer(r, P_NEW);
766 LockBuffer(buffer, GIST_EXCLUSIVE);
769 UnlockRelationForExtension(r, ExclusiveLock);
/*
 * amoptions support function: parse and validate GiST reloptions
 * (fillfactor, buffering) into a palloc'd GiSTOptions struct, returned
 * as a bytea Datum.
 */
775 gistoptions(PG_FUNCTION_ARGS)
777 Datum reloptions = PG_GETARG_DATUM(0);
778 bool validate = PG_GETARG_BOOL(1);
779 relopt_value *options;
/* maps option names to GiSTOptions struct fields */
782 static const relopt_parse_elt tab[] = {
783 {"fillfactor", RELOPT_TYPE_INT, offsetof(GiSTOptions, fillfactor)},
784 {"buffering", RELOPT_TYPE_STRING, offsetof(GiSTOptions, bufferingModeOffset)}
787 options = parseRelOptions(reloptions, validate, RELOPT_KIND_GIST,
790 /* if none set, we're done */
794 rdopts = allocateReloptStruct(sizeof(GiSTOptions), options, numoptions);
796 fillRelOptions((void *) rdopts, sizeof(GiSTOptions), options, numoptions,
797 validate, tab, lengthof(tab));
801 PG_RETURN_BYTEA_P(rdopts);
806 * Temporary GiST indexes are not WAL-logged, but we need LSNs to detect
807 * concurrent page splits anyway. GetXLogRecPtrForTemp() provides a fake
808 * sequence of LSNs for that purpose. Each call generates an LSN that is
809 * greater than any previous value returned by this function in the same
813 GetXLogRecPtrForTemp(void)
815 static XLogRecPtr counter = 1;