/*-------------------------------------------------------------------------
 *
 * gistutil.c
 *	  utilities routines for the postgres GiST index access method.
 *
 * Portions Copyright (c) 1996-2013, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * IDENTIFICATION
 *	  src/backend/access/gist/gistutil.c
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include <math.h>

#include "access/gist_private.h"
#include "access/reloptions.h"
#include "storage/indexfsm.h"
#include "storage/lmgr.h"
#include "utils/builtins.h"
26 * Write itup vector to page, has no control of free space.
29 gistfillbuffer(Page page, IndexTuple *itup, int len, OffsetNumber off)
31 OffsetNumber l = InvalidOffsetNumber;
34 if (off == InvalidOffsetNumber)
35 off = (PageIsEmpty(page)) ? FirstOffsetNumber :
36 OffsetNumberNext(PageGetMaxOffsetNumber(page));
38 for (i = 0; i < len; i++)
40 Size sz = IndexTupleSize(itup[i]);
42 l = PageAddItem(page, (Item) itup[i], sz, off, false, false);
43 if (l == InvalidOffsetNumber)
44 elog(ERROR, "failed to add item to GiST index page, item %d out of %d, size %d bytes",
51 * Check space for itup vector on page
54 gistnospace(Page page, IndexTuple *itvec, int len, OffsetNumber todelete, Size freespace)
56 unsigned int size = freespace,
60 for (i = 0; i < len; i++)
61 size += IndexTupleSize(itvec[i]) + sizeof(ItemIdData);
63 if (todelete != InvalidOffsetNumber)
65 IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, todelete));
67 deleted = IndexTupleSize(itup) + sizeof(ItemIdData);
70 return (PageGetFreeSpace(page) + deleted < size);
74 gistfitpage(IndexTuple *itvec, int len)
79 for (i = 0; i < len; i++)
80 size += IndexTupleSize(itvec[i]) + sizeof(ItemIdData);
82 /* TODO: Consider fillfactor */
83 return (size <= GiSTPageSize);
87 * Read buffer into itup vector
90 gistextractpage(Page page, int *len /* out */ )
96 maxoff = PageGetMaxOffsetNumber(page);
98 itvec = palloc(sizeof(IndexTuple) * maxoff);
99 for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
100 itvec[i - FirstOffsetNumber] = (IndexTuple) PageGetItem(page, PageGetItemId(page, i));
106 * join two vectors into one
109 gistjoinvector(IndexTuple *itvec, int *len, IndexTuple *additvec, int addlen)
111 itvec = (IndexTuple *) repalloc((void *) itvec, sizeof(IndexTuple) * ((*len) + addlen));
112 memmove(&itvec[*len], additvec, sizeof(IndexTuple) * addlen);
118 * make plain IndexTupleVector
122 gistfillitupvec(IndexTuple *vec, int veclen, int *memlen)
130 for (i = 0; i < veclen; i++)
131 *memlen += IndexTupleSize(vec[i]);
133 ptr = ret = palloc(*memlen);
135 for (i = 0; i < veclen; i++)
137 memcpy(ptr, vec[i], IndexTupleSize(vec[i]));
138 ptr += IndexTupleSize(vec[i]);
141 return (IndexTupleData *) ret;
145 * Make unions of keys in IndexTuple vector (one union datum per index column).
146 * Union Datums are returned into the attr/isnull arrays.
147 * Resulting Datums aren't compressed.
150 gistMakeUnionItVec(GISTSTATE *giststate, IndexTuple *itvec, int len,
151 Datum *attr, bool *isnull)
154 GistEntryVector *evec;
157 evec = (GistEntryVector *) palloc((len + 2) * sizeof(GISTENTRY) + GEVHDRSZ);
159 for (i = 0; i < giststate->tupdesc->natts; i++)
163 /* Collect non-null datums for this column */
165 for (j = 0; j < len; j++)
170 datum = index_getattr(itvec[j], i + 1, giststate->tupdesc, &IsNull);
174 gistdentryinit(giststate, i,
175 evec->vector + evec->n,
177 NULL, NULL, (OffsetNumber) 0,
182 /* If this column was all NULLs, the union is NULL */
192 /* unionFn may expect at least two inputs */
194 evec->vector[1] = evec->vector[0];
197 /* Make union and store in attr array */
198 attr[i] = FunctionCall2Coll(&giststate->unionFn[i],
199 giststate->supportCollation[i],
200 PointerGetDatum(evec),
201 PointerGetDatum(&attrsize));
209 * Return an IndexTuple containing the result of applying the "union"
210 * method to the specified IndexTuple vector.
213 gistunion(Relation r, IndexTuple *itvec, int len, GISTSTATE *giststate)
215 Datum attr[INDEX_MAX_KEYS];
216 bool isnull[INDEX_MAX_KEYS];
218 gistMakeUnionItVec(giststate, itvec, len, attr, isnull);
220 return gistFormTuple(giststate, r, attr, isnull, false);
224 * makes union of two key
227 gistMakeUnionKey(GISTSTATE *giststate, int attno,
228 GISTENTRY *entry1, bool isnull1,
229 GISTENTRY *entry2, bool isnull2,
230 Datum *dst, bool *dstisnull)
232 /* we need a GistEntryVector with room for exactly 2 elements */
236 char padding[2 * sizeof(GISTENTRY) + GEVHDRSZ];
238 GistEntryVector *evec = &storage.gev;
243 if (isnull1 && isnull2)
250 if (isnull1 == FALSE && isnull2 == FALSE)
252 evec->vector[0] = *entry1;
253 evec->vector[1] = *entry2;
255 else if (isnull1 == FALSE)
257 evec->vector[0] = *entry1;
258 evec->vector[1] = *entry1;
262 evec->vector[0] = *entry2;
263 evec->vector[1] = *entry2;
267 *dst = FunctionCall2Coll(&giststate->unionFn[attno],
268 giststate->supportCollation[attno],
269 PointerGetDatum(evec),
270 PointerGetDatum(&dstsize));
275 gistKeyIsEQ(GISTSTATE *giststate, int attno, Datum a, Datum b)
279 FunctionCall3Coll(&giststate->equalFn[attno],
280 giststate->supportCollation[attno],
282 PointerGetDatum(&result));
287 * Decompress all keys in tuple
290 gistDeCompressAtt(GISTSTATE *giststate, Relation r, IndexTuple tuple, Page p,
291 OffsetNumber o, GISTENTRY *attdata, bool *isnull)
295 for (i = 0; i < r->rd_att->natts; i++)
297 Datum datum = index_getattr(tuple, i + 1, giststate->tupdesc, &isnull[i]);
299 gistdentryinit(giststate, i, &attdata[i],
306 * Forms union of oldtup and addtup, if union == oldtup then return NULL
309 gistgetadjusted(Relation r, IndexTuple oldtup, IndexTuple addtup, GISTSTATE *giststate)
311 bool neednew = FALSE;
312 GISTENTRY oldentries[INDEX_MAX_KEYS],
313 addentries[INDEX_MAX_KEYS];
314 bool oldisnull[INDEX_MAX_KEYS],
315 addisnull[INDEX_MAX_KEYS];
316 Datum attr[INDEX_MAX_KEYS];
317 bool isnull[INDEX_MAX_KEYS];
318 IndexTuple newtup = NULL;
321 gistDeCompressAtt(giststate, r, oldtup, NULL,
322 (OffsetNumber) 0, oldentries, oldisnull);
324 gistDeCompressAtt(giststate, r, addtup, NULL,
325 (OffsetNumber) 0, addentries, addisnull);
327 for (i = 0; i < r->rd_att->natts; i++)
329 gistMakeUnionKey(giststate, i,
330 oldentries + i, oldisnull[i],
331 addentries + i, addisnull[i],
332 attr + i, isnull + i);
335 /* we already need new key, so we can skip check */
339 /* union of key may be NULL if and only if both keys are NULL */
345 !gistKeyIsEQ(giststate, i, oldentries[i].key, attr[i]))
352 /* need to update key */
353 newtup = gistFormTuple(giststate, r, attr, isnull, false);
354 newtup->t_tid = oldtup->t_tid;
361 * Search an upper index page for the entry with lowest penalty for insertion
362 * of the new index key contained in "it".
364 * Returns the index of the page entry to insert into.
367 gistchoose(Relation r, Page p, IndexTuple it, /* it has compressed entry */
368 GISTSTATE *giststate)
373 float best_penalty[INDEX_MAX_KEYS];
375 identry[INDEX_MAX_KEYS];
376 bool isnull[INDEX_MAX_KEYS];
377 int keep_current_best;
379 Assert(!GistPageIsLeaf(p));
381 gistDeCompressAtt(giststate, r,
382 it, NULL, (OffsetNumber) 0,
385 /* we'll return FirstOffsetNumber if page is empty (shouldn't happen) */
386 result = FirstOffsetNumber;
389 * The index may have multiple columns, and there's a penalty value for
390 * each column. The penalty associated with a column that appears earlier
391 * in the index definition is strictly more important than the penalty of
392 * a column that appears later in the index definition.
394 * best_penalty[j] is the best penalty we have seen so far for column j,
395 * or -1 when we haven't yet examined column j. Array entries to the
396 * right of the first -1 are undefined.
398 best_penalty[0] = -1;
401 * If we find a tuple that's exactly as good as the currently best one, we
402 * could use either one. When inserting a lot of tuples with the same or
403 * similar keys, it's preferable to descend down the same path when
404 * possible, as that's more cache-friendly. On the other hand, if all
405 * inserts land on the same leaf page after a split, we're never going to
406 * insert anything to the other half of the split, and will end up using
407 * only 50% of the available space. Distributing the inserts evenly would
408 * lead to better space usage, but that hurts cache-locality during
409 * insertion. To get the best of both worlds, when we find a tuple that's
410 * exactly as good as the previous best, choose randomly whether to stick
411 * to the old best, or use the new one. Once we decide to stick to the
412 * old best, we keep sticking to it for any subsequent equally good tuples
413 * we might find. This favors tuples with low offsets, but still allows
414 * some inserts to go to other equally-good subtrees.
416 * keep_current_best is -1 if we haven't yet had to make a random choice
417 * whether to keep the current best tuple. If we have done so, and
418 * decided to keep it, keep_current_best is 1; if we've decided to
419 * replace, keep_current_best is 0. (This state will be reset to -1 as
420 * soon as we've made the replacement, but sometimes we make the choice in
421 * advance of actually finding a replacement best tuple.)
423 keep_current_best = -1;
426 * Loop over tuples on page.
428 maxoff = PageGetMaxOffsetNumber(p);
429 Assert(maxoff >= FirstOffsetNumber);
431 for (i = FirstOffsetNumber; i <= maxoff; i = OffsetNumberNext(i))
433 IndexTuple itup = (IndexTuple) PageGetItem(p, PageGetItemId(p, i));
439 /* Loop over index attributes. */
440 for (j = 0; j < r->rd_att->natts; j++)
446 /* Compute penalty for this column. */
447 datum = index_getattr(itup, j + 1, giststate->tupdesc, &IsNull);
448 gistdentryinit(giststate, j, &entry, datum, r, p, i,
450 usize = gistpenalty(giststate, j, &entry, IsNull,
451 &identry[j], isnull[j]);
453 zero_penalty = false;
455 if (best_penalty[j] < 0 || usize < best_penalty[j])
458 * New best penalty for column. Tentatively select this tuple
459 * as the target, and record the best penalty. Then reset the
460 * next column's penalty to "unknown" (and indirectly, the
461 * same for all the ones to its right). This will force us to
462 * adopt this tuple's penalty values as the best for all the
463 * remaining columns during subsequent loop iterations.
466 best_penalty[j] = usize;
468 if (j < r->rd_att->natts - 1)
469 best_penalty[j + 1] = -1;
471 /* we have new best, so reset keep-it decision */
472 keep_current_best = -1;
474 else if (best_penalty[j] == usize)
477 * The current tuple is exactly as good for this column as the
478 * best tuple seen so far. The next iteration of this loop
479 * will compare the next column.
485 * The current tuple is worse for this column than the best
486 * tuple seen so far. Skip the remaining columns and move on
487 * to the next tuple, if any.
489 zero_penalty = false; /* so outer loop won't exit */
495 * If we looped past the last column, and did not update "result",
496 * then this tuple is exactly as good as the prior best tuple.
498 if (j == r->rd_att->natts && result != i)
500 if (keep_current_best == -1)
502 /* we didn't make the random choice yet for this old best */
503 keep_current_best = (random() <= (MAX_RANDOM_VALUE / 2)) ? 1 : 0;
505 if (keep_current_best == 0)
507 /* we choose to use the new tuple */
509 /* choose again if there are even more exactly-as-good ones */
510 keep_current_best = -1;
515 * If we find a tuple with zero penalty for all columns, and we've
516 * decided we don't want to search for another tuple with equal
517 * penalty, there's no need to examine remaining tuples; just break
518 * out of the loop and return it.
522 if (keep_current_best == -1)
524 /* we didn't make the random choice yet for this old best */
525 keep_current_best = (random() <= (MAX_RANDOM_VALUE / 2)) ? 1 : 0;
527 if (keep_current_best == 1)
536 * initialize a GiST entry with a decompressed version of key
539 gistdentryinit(GISTSTATE *giststate, int nkey, GISTENTRY *e,
540 Datum k, Relation r, Page pg, OffsetNumber o,
547 gistentryinit(*e, k, r, pg, o, l);
549 DatumGetPointer(FunctionCall1Coll(&giststate->decompressFn[nkey],
550 giststate->supportCollation[nkey],
551 PointerGetDatum(e)));
552 /* decompressFn may just return the given pointer */
554 gistentryinit(*e, dep->key, dep->rel, dep->page, dep->offset,
558 gistentryinit(*e, (Datum) 0, r, pg, o, l);
563 * initialize a GiST entry with a compressed version of key
566 gistcentryinit(GISTSTATE *giststate, int nkey,
567 GISTENTRY *e, Datum k, Relation r,
568 Page pg, OffsetNumber o, bool l, bool isNull)
574 gistentryinit(*e, k, r, pg, o, l);
576 DatumGetPointer(FunctionCall1Coll(&giststate->compressFn[nkey],
577 giststate->supportCollation[nkey],
578 PointerGetDatum(e)));
579 /* compressFn may just return the given pointer */
581 gistentryinit(*e, cep->key, cep->rel, cep->page, cep->offset,
585 gistentryinit(*e, (Datum) 0, r, pg, o, l);
589 gistFormTuple(GISTSTATE *giststate, Relation r,
590 Datum attdata[], bool isnull[], bool newValues)
592 GISTENTRY centry[INDEX_MAX_KEYS];
593 Datum compatt[INDEX_MAX_KEYS];
597 for (i = 0; i < r->rd_att->natts; i++)
600 compatt[i] = (Datum) 0;
603 gistcentryinit(giststate, i, ¢ry[i], attdata[i],
604 r, NULL, (OffsetNumber) 0,
607 compatt[i] = centry[i].key;
611 res = index_form_tuple(giststate->tupdesc, compatt, isnull);
614 * The offset number on tuples on internal pages is unused. For historical
615 * reasons, it is set 0xffff.
617 ItemPointerSetOffsetNumber(&(res->t_tid), 0xffff);
622 gistpenalty(GISTSTATE *giststate, int attno,
623 GISTENTRY *orig, bool isNullOrig,
624 GISTENTRY *add, bool isNullAdd)
628 if (giststate->penaltyFn[attno].fn_strict == FALSE ||
629 (isNullOrig == FALSE && isNullAdd == FALSE))
631 FunctionCall3Coll(&giststate->penaltyFn[attno],
632 giststate->supportCollation[attno],
633 PointerGetDatum(orig),
634 PointerGetDatum(add),
635 PointerGetDatum(&penalty));
636 /* disallow negative or NaN penalty */
637 if (isnan(penalty) || penalty < 0.0)
640 else if (isNullOrig && isNullAdd)
644 /* try to prevent mixing null and non-null values */
645 penalty = get_float4_infinity();
652 * Initialize a new index page
655 GISTInitBuffer(Buffer b, uint32 f)
657 GISTPageOpaque opaque;
661 pageSize = BufferGetPageSize(b);
662 page = BufferGetPage(b);
663 PageInit(page, pageSize, sizeof(GISTPageOpaqueData));
665 opaque = GistPageGetOpaque(page);
666 /* page was already zeroed by PageInit, so this is not needed: */
667 /* memset(&(opaque->nsn), 0, sizeof(GistNSN)); */
668 opaque->rightlink = InvalidBlockNumber;
670 opaque->gist_page_id = GIST_PAGE_ID;
674 * Verify that a freshly-read page looks sane.
677 gistcheckpage(Relation rel, Buffer buf)
679 Page page = BufferGetPage(buf);
682 * ReadBuffer verifies that every newly-read page passes
683 * PageHeaderIsValid, which means it either contains a reasonably sane
684 * page header or is all-zero. We have to defend against the all-zero
689 (errcode(ERRCODE_INDEX_CORRUPTED),
690 errmsg("index \"%s\" contains unexpected zero page at block %u",
691 RelationGetRelationName(rel),
692 BufferGetBlockNumber(buf)),
693 errhint("Please REINDEX it.")));
696 * Additionally check that the special area looks sane.
698 if (PageGetSpecialSize(page) != MAXALIGN(sizeof(GISTPageOpaqueData)))
700 (errcode(ERRCODE_INDEX_CORRUPTED),
701 errmsg("index \"%s\" contains corrupted page at block %u",
702 RelationGetRelationName(rel),
703 BufferGetBlockNumber(buf)),
704 errhint("Please REINDEX it.")));
709 * Allocate a new page (either by recycling, or by extending the index file)
711 * The returned buffer is already pinned and exclusive-locked
713 * Caller is responsible for initializing the page by calling GISTInitBuffer
716 gistNewBuffer(Relation r)
721 /* First, try to get a page from FSM */
724 BlockNumber blkno = GetFreeIndexPage(r);
726 if (blkno == InvalidBlockNumber)
727 break; /* nothing left in FSM */
729 buffer = ReadBuffer(r, blkno);
732 * We have to guard against the possibility that someone else already
733 * recycled this page; the buffer may be locked if so.
735 if (ConditionalLockBuffer(buffer))
737 Page page = BufferGetPage(buffer);
740 return buffer; /* OK to use, if never initialized */
742 gistcheckpage(r, buffer);
744 if (GistPageIsDeleted(page))
745 return buffer; /* OK to use */
747 LockBuffer(buffer, GIST_UNLOCK);
750 /* Can't use it, so release buffer and try again */
751 ReleaseBuffer(buffer);
754 /* Must extend the file */
755 needLock = !RELATION_IS_LOCAL(r);
758 LockRelationForExtension(r, ExclusiveLock);
760 buffer = ReadBuffer(r, P_NEW);
761 LockBuffer(buffer, GIST_EXCLUSIVE);
764 UnlockRelationForExtension(r, ExclusiveLock);
770 gistoptions(PG_FUNCTION_ARGS)
772 Datum reloptions = PG_GETARG_DATUM(0);
773 bool validate = PG_GETARG_BOOL(1);
774 relopt_value *options;
777 static const relopt_parse_elt tab[] = {
778 {"fillfactor", RELOPT_TYPE_INT, offsetof(GiSTOptions, fillfactor)},
779 {"buffering", RELOPT_TYPE_STRING, offsetof(GiSTOptions, bufferingModeOffset)}
782 options = parseRelOptions(reloptions, validate, RELOPT_KIND_GIST,
785 /* if none set, we're done */
789 rdopts = allocateReloptStruct(sizeof(GiSTOptions), options, numoptions);
791 fillRelOptions((void *) rdopts, sizeof(GiSTOptions), options, numoptions,
792 validate, tab, lengthof(tab));
796 PG_RETURN_BYTEA_P(rdopts);
801 * Temporary GiST indexes are not WAL-logged, but we need LSNs to detect
802 * concurrent page splits anyway. GetXLogRecPtrForTemp() provides a fake
803 * sequence of LSNs for that purpose. Each call generates an LSN that is
804 * greater than any previous value returned by this function in the same
808 GetXLogRecPtrForTemp(void)
810 static XLogRecPtr counter = 1;