1 /*-------------------------------------------------------------------------
4 * fetch tuples from a GIN scan.
7 * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
11 * src/backend/access/gin/ginget.c
12 *-------------------------------------------------------------------------
17 #include "access/gin_private.h"
18 #include "access/relscan.h"
19 #include "miscadmin.h"
20 #include "utils/datum.h"
21 #include "utils/memutils.h"
/*
 * Soft upper bound on the number of results a GIN scan should return.
 * Tested in startScan() below; 0 (the default) disables the fuzzy-limit
 * machinery entirely.
 * NOTE(review): presumably assigned externally (configuration parameter) —
 * no assignment is visible in this extract; confirm against the rest of
 * the file.
 */
24 int GinFuzzySearchLimit = 0;
/*
 * Cursor state used while scanning the pending list.  firstOffset and
 * lastOffset delimit the range of pending-list tuples belonging to the
 * current heap row (see the pending-list scan description at the end of
 * this extract).
 * NOTE(review): additional members of this struct are not visible in this
 * extract.
 */
26 typedef struct pendingPosition
29 OffsetNumber firstOffset;
30 OffsetNumber lastOffset;
/*
 * If stack->off has run past the last item on the current entry-tree page,
 * step right to the sibling page and reset stack->off to FirstOffsetNumber.
 * Returns false when the current page is the rightmost one, i.e. there is
 * nothing further to scan.
 * NOTE(review): the successful "return true" path is not visible in this
 * extract.
 */
37  * Goes to the next page if current offset is outside of bounds
40 moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack)
42 Page page = BufferGetPage(stack->buffer);
44 if (stack->off > PageGetMaxOffsetNumber(page))
47 * We scanned the whole page, so we should take right page
49 if (GinPageRightMost(page))
50 return false; /* no more pages */
/* ginStepRight releases the old buffer and hands back the locked sibling */
52 stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE);
53 stack->blkno = BufferGetBlockNumber(stack->buffer);
54 stack->off = FirstOffsetNumber;
/*
 * Walk every leaf page of the posting tree rooted at rootPostingTree and
 * add all of its heap ItemPointers to scanEntry->matchBitmap, bumping
 * predictNumberResult by the number of items found on each page.
 */
61 * Scan all pages of a posting tree and save all its heap ItemPointers
62 * in scanEntry->matchBitmap
65 scanPostingTree(Relation index, GinScanEntry scanEntry,
66 BlockNumber rootPostingTree)
73 /* Descend to the leftmost leaf page */
74 stack = ginScanBeginPostingTree(&btree, index, rootPostingTree);
75 buffer = stack->buffer;
76 IncrBufferRefCount(buffer); /* prevent unpin in freeGinBtreeStack */
78 freeGinBtreeStack(stack);
/* Walk rightward over the leaf level, harvesting TIDs from each page */
81 * Loop iterates through all leaf pages of posting tree
85 page = BufferGetPage(buffer);
/* Skip pages that a concurrent vacuum has marked deleted */
86 if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0)
88 int n = GinDataLeafPageGetItemsToTbm(page, scanEntry->matchBitmap);
90 scanEntry->predictNumberResult += n;
93 if (GinPageRightMost(page))
94 break; /* no more pages */
96 buffer = ginStepRight(buffer, index, GIN_SHARE);
99 UnlockReleaseBuffer(buffer);
/*
 * Starting from the entry-tree position in 'stack', scan forward and add
 * the TIDs of every matching entry to scanEntry->matchBitmap (created
 * here).  Stops when the attribute number changes, or per the mode-specific
 * rules described below.  Posting-tree entries are handled by temporarily
 * releasing the entry-page lock and calling scanPostingTree(); the saved
 * key value is used to re-find our position afterwards.
 */
103 * Collects TIDs into scanEntry->matchBitmap for all heap tuples that
104 * match the search entry. This supports three different match modes:
106 * 1. Partial-match support: scan from current point until the
107 * comparePartialFn says we're done.
108 * 2. SEARCH_MODE_ALL: scan from current point (which should be first
109 * key for the current attnum) until we hit null items or end of attnum
110 * 3. SEARCH_MODE_EVERYTHING: scan from current point (which should be first
111 * key for the current attnum) until we hit end of attnum
113 * Returns true if done, false if it's necessary to restart scan from scratch
116 collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
117 GinScanEntry scanEntry)
120 Form_pg_attribute attr;
122 /* Initialize empty bitmap result */
123 scanEntry->matchBitmap = tbm_create(work_mem * 1024L);
125 /* Null query cannot partial-match anything */
126 if (scanEntry->isPartialMatch &&
127 scanEntry->queryCategory != GIN_CAT_NORM_KEY)
130 /* Locate tupdesc entry for key column (for attbyval/attlen data) */
131 attnum = scanEntry->attnum;
132 attr = btree->ginstate->origTupdesc->attrs[attnum - 1];
139 GinNullCategory icategory;
142 * stack->off points to the interested entry, buffer is already locked
144 if (moveRightIfItNeeded(btree, stack) == false)
147 page = BufferGetPage(stack->buffer);
148 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
151 * If tuple stores another attribute then stop scan
153 if (gintuple_get_attrnum(btree->ginstate, itup) != attnum)
156 /* Safe to fetch attribute value */
157 idatum = gintuple_get_key(btree->ginstate, itup, &icategory);
160 * Check for appropriate scan stop conditions
162 if (scanEntry->isPartialMatch)
167 * In partial match, stop scan at any null (including
168 * placeholders); partial matches never match nulls
170 if (icategory != GIN_CAT_NORM_KEY)
174 * Check of partial match.
175 * case cmp == 0 => match
176 * case cmp > 0 => not match and finish scan
177 * case cmp < 0 => not match and continue scan
180 cmp = DatumGetInt32(FunctionCall4Coll(&btree->ginstate->comparePartialFn[attnum - 1],
181 btree->ginstate->supportCollation[attnum - 1],
184 UInt16GetDatum(scanEntry->strategy),
185 PointerGetDatum(scanEntry->extra_data)));
195 else if (scanEntry->searchMode == GIN_SEARCH_MODE_ALL)
198 * In ALL mode, we are not interested in null items, so we can
199 * stop if we get to a null-item placeholder (which will be the
200 * last entry for a given attnum). We do want to include NULL_KEY
201 * and EMPTY_ITEM entries, though.
203 if (icategory == GIN_CAT_NULL_ITEM)
208 * OK, we want to return the TIDs listed in this entry.
210 if (GinIsPostingTree(itup))
212 BlockNumber rootPostingTree = GinGetPostingTree(itup);
215 * We should unlock current page (but not unpin) during tree scan
216 * to prevent deadlock with vacuum processes.
218 * We save current entry value (idatum) to be able to re-find our
219 * tuple after re-locking
/* Copy the key out of the page: the page contents may move while unlocked */
221 if (icategory == GIN_CAT_NORM_KEY)
222 idatum = datumCopy(idatum, attr->attbyval, attr->attlen);
224 LockBuffer(stack->buffer, GIN_UNLOCK);
226 /* Collect all the TIDs in this entry's posting tree */
227 scanPostingTree(btree->index, scanEntry, rootPostingTree);
230 * We lock again the entry page and while it was unlocked insert
231 * might have occurred, so we need to re-find our position.
233 LockBuffer(stack->buffer, GIN_SHARE);
234 page = BufferGetPage(stack->buffer);
235 if (!GinPageIsLeaf(page))
238 * Root page becomes non-leaf while we unlock it. We will
239 * start again, this situation doesn't occur often - root can
240 * became a non-leaf only once per life of index.
245 /* Search forward to re-find idatum */
249 GinNullCategory newCategory;
251 if (moveRightIfItNeeded(btree, stack) == false)
252 elog(ERROR, "lost saved point in index"); /* must not happen !!! */
254 page = BufferGetPage(stack->buffer);
255 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
257 if (gintuple_get_attrnum(btree->ginstate, itup) != attnum)
258 elog(ERROR, "lost saved point in index"); /* must not happen !!! */
259 newDatum = gintuple_get_key(btree->ginstate, itup,
262 if (ginCompareEntries(btree->ginstate, attnum,
263 newDatum, newCategory,
264 idatum, icategory) == 0)
/* Release the datumCopy'd key now that the position is re-found */
270 if (icategory == GIN_CAT_NORM_KEY && !attr->attbyval)
271 pfree(DatumGetPointer(idatum));
/* Plain posting list stored in the entry tuple itself */
278 ipd = ginReadTuple(btree->ginstate, scanEntry->attnum, itup, &nipd);
279 tbm_add_tuples(scanEntry->matchBitmap, ipd, nipd, false);
280 scanEntry->predictNumberResult += GinGetNPosting(itup);
284 * Done with this entry, go to the next
/*
 * Initialize one scan entry: reset its cursor state, locate its key in the
 * entry tree, and either (a) collect a whole match bitmap (partial-match /
 * EMPTY_QUERY cases), (b) begin a posting-tree scan with the first leaf
 * page's items loaded into entry->list, or (c) copy the in-tuple posting
 * list into memory.  entry->isFinished is left TRUE when nothing matched.
 */
291 * Start* functions setup beginning state of searches: finds correct buffer and pins it.
294 startScanEntry(GinState *ginstate, GinScanEntry entry)
296 GinBtreeData btreeEntry;
297 GinBtreeStack *stackEntry;
/* Reset per-scan cursor state before (re)starting */
302 entry->buffer = InvalidBuffer;
303 ItemPointerSetMin(&entry->curItem);
304 entry->offset = InvalidOffsetNumber;
307 entry->matchBitmap = NULL;
308 entry->matchResult = NULL;
309 entry->reduceResult = FALSE;
310 entry->predictNumberResult = 0;
313 * we should find entry, and begin scan of posting tree or just store
314 * posting list in memory
316 ginPrepareEntryScan(&btreeEntry, entry->attnum,
317 entry->queryKey, entry->queryCategory,
319 stackEntry = ginFindLeafPage(&btreeEntry, true);
320 page = BufferGetPage(stackEntry->buffer);
/* Assume finished until we actually find something to return */
323 entry->isFinished = TRUE;
325 if (entry->isPartialMatch ||
326 entry->queryCategory == GIN_CAT_EMPTY_QUERY)
329 * btreeEntry.findItem locates the first item >= given search key.
330 * (For GIN_CAT_EMPTY_QUERY, it will find the leftmost index item
331 * because of the way the GIN_CAT_EMPTY_QUERY category code is
332 * assigned.) We scan forward from there and collect all TIDs needed
333 * for the entry type.
335 btreeEntry.findItem(&btreeEntry, stackEntry);
336 if (collectMatchBitmap(&btreeEntry, stackEntry, entry) == false)
339 * GIN tree was seriously restructured, so we will cleanup all
340 * found data and rescan. See comments near 'return false' in
341 * collectMatchBitmap()
343 if (entry->matchBitmap)
345 if (entry->matchIterator)
346 tbm_end_iterate(entry->matchIterator);
347 entry->matchIterator = NULL;
348 tbm_free(entry->matchBitmap);
349 entry->matchBitmap = NULL;
351 LockBuffer(stackEntry->buffer, GIN_UNLOCK);
352 freeGinBtreeStack(stackEntry);
353 goto restartScanEntry;
356 if (entry->matchBitmap && !tbm_is_empty(entry->matchBitmap))
358 entry->matchIterator = tbm_begin_iterate(entry->matchBitmap)
359 entry->isFinished = FALSE;
362 else if (btreeEntry.findItem(&btreeEntry, stackEntry))
364 IndexTuple itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off));
366 if (GinIsPostingTree(itup))
368 BlockNumber rootPostingTree = GinGetPostingTree(itup);
369 GinBtreeStack *stack;
371 ItemPointerData minItem;
374 * We should unlock entry page before touching posting tree to
375 * prevent deadlocks with vacuum processes. Because entry is never
376 * deleted from page and posting tree is never reduced to the
377 * posting list, we can unlock page after getting BlockNumber of
378 * root of posting tree.
380 LockBuffer(stackEntry->buffer, GIN_UNLOCK);
383 stack = ginScanBeginPostingTree(&entry->btree, ginstate->index,
385 entry->buffer = stack->buffer;
388 * We keep buffer pinned because we need to prevent deletion of
389 * page during scan. See GIN's vacuum implementation. RefCount is
390 * increased to keep buffer pinned after freeGinBtreeStack() call.
392 IncrBufferRefCount(entry->buffer);
394 page = BufferGetPage(entry->buffer);
397 * Load the first page into memory.
399 ItemPointerSetMin(&minItem);
400 entry->list = GinDataLeafPageGetItems(page, &entry->nlist, minItem);
402 entry->predictNumberResult = stack->predictNumber * entry->nlist;
404 LockBuffer(entry->buffer, GIN_UNLOCK);
405 freeGinBtreeStack(stack);
406 entry->isFinished = FALSE;
408 else if (GinGetNPosting(itup) > 0)
/* Small posting list: copy it straight out of the entry tuple */
410 entry->list = ginReadTuple(ginstate, entry->attnum, itup,
412 entry->predictNumberResult = entry->nlist;
414 entry->isFinished = FALSE;
419 LockBuffer(stackEntry->buffer, GIN_UNLOCK);
420 freeGinBtreeStack(stackEntry);
/*
 * qsort_arg comparator over indexes into key->scanEntry[]; orders by each
 * entry's predictNumberResult so that the least frequent (most selective)
 * entries sort first.
 * NOTE(review): the actual comparison/return statement is not visible in
 * this extract.
 */
424 * Comparison function for scan entry indexes. Sorts by predictNumberResult,
425 * least frequent items first.
428 entryIndexByFrequencyCmp(const void *a1, const void *a2, void *arg)
430 const GinScanKey key = (const GinScanKey) arg;
431 int i1 = *(const int *) a1;
432 int i2 = *(const int *) a2;
433 uint32 n1 = key->scanEntry[i1]->predictNumberResult;
434 uint32 n2 = key->scanEntry[i2]->predictNumberResult;
/*
 * Finish initializing one scan key: reset its cursor fields, then split its
 * entries into the "required" and "additional" sets (see the long comment
 * below).  Single-entry keys take the trivial path: one required entry, no
 * additional entries.  Consistent-function scratch work happens in
 * so->tempCtx; the result arrays are allocated in so->keyCtx.
 */
445 startScanKey(GinState *ginstate, GinScanOpaque so, GinScanKey key)
447 MemoryContext oldCtx = CurrentMemoryContext;
452 ItemPointerSetMin(&key->curItem);
453 key->curItemMatches = false;
454 key->recheckCurItem = false;
455 key->isFinished = false;
458 * Divide the entries into two distinct sets: required and additional.
459 * Additional entries are not enough for a match alone, without any items
460 * from the required set, but are needed by the consistent function to
461 * decide if an item matches. When scanning, we can skip over items from
462 * additional entries that have no corresponding matches in any of the
463 * required entries. That speeds up queries like "frequent & rare"
464 * considerably, if the frequent term can be put in the additional set.
466 * There can be many legal ways to divide them entries into these two
467 * sets. A conservative division is to just put everything in the required
468 * set, but the more you can put in the additional set, the more you can
469 * skip during the scan. To maximize skipping, we try to put as many
470 * frequent items as possible into additional, and less frequent ones into
471 * required. To do that, sort the entries by frequency
472 * (predictNumberResult), and put entries into the required set in that
473 * order, until the consistent function says that none of the remaining
474 * entries can form a match, without any items from the required set. The
475 * rest go to the additional set.
477 if (key->nentries > 1)
479 MemoryContextSwitchTo(so->tempCtx);
481 entryIndexes = (int *) palloc(sizeof(int) * key->nentries);
482 for (i = 0; i < key->nentries; i++)
484 qsort_arg(entryIndexes, key->nentries, sizeof(int),
485 entryIndexByFrequencyCmp, key);
/* Grow the candidate required set one entry at a time, rarest first */
487 for (i = 0; i < key->nentries - 1; i++)
489 /* Pass all entries <= i as FALSE, and the rest as MAYBE */
490 for (j = 0; j <= i; j++)
491 key->entryRes[entryIndexes[j]] = GIN_FALSE;
492 for (j = i + 1; j < key->nentries; j++)
493 key->entryRes[entryIndexes[j]] = GIN_MAYBE;
495 if (key->triConsistentFn(key) == GIN_FALSE)
498 /* i is now the last required entry. */
500 MemoryContextSwitchTo(so->keyCtx);
502 key->nrequired = i + 1;
503 key->nadditional = key->nentries - key->nrequired;
504 key->requiredEntries = palloc(key->nrequired * sizeof(GinScanEntry));
505 key->additionalEntries = palloc(key->nadditional * sizeof(GinScanEntry));
/* Required set = rarest nrequired entries; additional = the rest */
508 for (i = 0; i < key->nrequired; i++)
509 key->requiredEntries[i] = key->scanEntry[entryIndexes[j++]];
510 for (i = 0; i < key->nadditional; i++)
511 key->additionalEntries[i] = key->scanEntry[entryIndexes[j++]];
513 /* clean up after consistentFn calls (also frees entryIndexes) */
514 MemoryContextReset(so->tempCtx);
518 MemoryContextSwitchTo(so->keyCtx);
/* Single-entry key: it must be required */
521 key->nadditional = 0;
522 key->requiredEntries = palloc(1 * sizeof(GinScanEntry));
523 key->requiredEntries[0] = key->scanEntry[0];
525 MemoryContextSwitchTo(oldCtx);
/*
 * Initialize every scan entry, optionally arm the fuzzy-limit result
 * reduction, then finish initializing each scan key (which relies on the
 * per-entry frequency estimates gathered by startScanEntry).
 */
529 startScan(IndexScanDesc scan)
531 GinScanOpaque so = (GinScanOpaque) scan->opaque;
532 GinState *ginstate = &so->ginstate;
535 for (i = 0; i < so->totalentries; i++)
536 startScanEntry(ginstate, so->entries[i]);
538 if (GinFuzzySearchLimit > 0)
541 * If every entry's estimated result count exceeds the threshold, try
542 * to reduce the result.  We hope (and only hope -- for an
543 * intersection over array entries the supposition need not hold)
544 * that the total result will be no more than the minimal
545 * predictNumberResult.
/* Any entry already under the threshold => skip reduction entirely */
548 for (i = 0; i < so->totalentries; i++)
550 if (so->entries[i]->predictNumberResult <= so->totalentries * GinFuzzySearchLimit)
/* Arm probabilistic item dropping (see dropItem macro below) */
558 for (i = 0; i < so->totalentries; i++)
560 so->entries[i]->predictNumberResult /= so->totalentries;
561 so->entries[i]->reduceResult = TRUE;
567 * Now that we have the estimates for the entry frequencies, finish
568 * initializing the scan keys.
570 for (i = 0; i < so->nkeys; i++)
571 startScanKey(ginstate, so, so->keys + i);
/*
 * Refill entry->list with the next batch of item pointers > advancePast
 * from the entry's posting tree.  Either steps right from the currently
 * pinned page (fast path when advancePast == curItem) or re-descends from
 * the root.  Sets entry->isFinished and releases the buffer when the tree
 * is exhausted.
 */
575 * Load the next batch of item pointers from a posting tree.
577 * Note that we copy the page into GinScanEntry->list array and unlock it, but
578 * keep it pinned to prevent interference with vacuum.
581 entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, ItemPointerData advancePast)
587 if (!BufferIsValid(entry->buffer))
589 entry->isFinished = true;
594 * We have two strategies for finding the correct page: step right from
595 * the current page, or descend the tree again from the root. If
596 * advancePast equals the current item, the next matching item should be
597 * on the next page, so we step right. Otherwise, descend from root.
599 if (ginCompareItemPointers(&entry->curItem, &advancePast) == 0)
602 LockBuffer(entry->buffer, GIN_SHARE);
606 GinBtreeStack *stack;
608 ReleaseBuffer(entry->buffer);
611 * Set the search key, and find the correct leaf page.
/* For a lossy page pointer, search for the first item on the next block */
613 if (ItemPointerIsLossyPage(&advancePast))
615 ItemPointerSet(&entry->btree.itemptr,
616 GinItemPointerGetBlockNumber(&advancePast) + 1,
/* Exact pointer: search for the very next TID after advancePast */
621 entry->btree.itemptr = advancePast;
622 entry->btree.itemptr.ip_posid++;
624 entry->btree.fullScan = false;
625 stack = ginFindLeafPage(&entry->btree, true);
627 /* we don't need the stack, just the buffer. */
628 entry->buffer = stack->buffer;
629 IncrBufferRefCount(entry->buffer);
630 freeGinBtreeStack(stack);
634 elog(DEBUG2, "entryLoadMoreItems, %u/%u, skip: %d",
635 GinItemPointerGetBlockNumber(&advancePast),
636 GinItemPointerGetOffsetNumber(&advancePast),
639 page = BufferGetPage(entry->buffer);
642 entry->offset = InvalidOffsetNumber;
653 * We've processed all the entries on this page. If it was the
654 * last page in the tree, we're done.
656 if (GinPageRightMost(page))
658 UnlockReleaseBuffer(entry->buffer);
659 entry->buffer = InvalidBuffer;
660 entry->isFinished = TRUE;
665 * Step to next page, following the right link. then find the
666 * first ItemPointer greater than advancePast.
668 entry->buffer = ginStepRight(entry->buffer,
671 page = BufferGetPage(entry->buffer);
675 if (GinPageGetOpaque(page)->flags & GIN_DELETED)
676 continue; /* page was deleted by concurrent vacuum */
679 * The first item > advancePast might not be on this page, but
680 * somewhere to the right, if the page was split, or a non-match from
681 * another key in the query allowed us to skip some items from this
682 * entry. Keep following the right-links until we re-find the correct
685 if (!GinPageRightMost(page) &&
686 ginCompareItemPointers(&advancePast, GinDataPageGetRightBound(page)) >= 0)
689 * the item we're looking is > the right bound of the page, so it
690 * can't be on this page.
695 entry->list = GinDataLeafPageGetItems(page, &entry->nlist, advancePast);
/* Position entry->offset at the first copied item > advancePast */
697 for (i = 0; i < entry->nlist; i++)
699 if (ginCompareItemPointers(&advancePast, &entry->list[i]) < 0)
703 if (GinPageRightMost(page))
705 /* after processing the copied items, we're done. */
706 UnlockReleaseBuffer(entry->buffer);
707 entry->buffer = InvalidBuffer;
710 LockBuffer(entry->buffer, GIN_UNLOCK);
/*
 * gin_rand() yields a uniform double in [0, 1].  dropItem(e) is true with
 * probability 1 - GinFuzzySearchLimit/predictNumberResult, so that on
 * average about GinFuzzySearchLimit items survive from an entry stream of
 * the estimated size; used only when entry->reduceResult is set.
 */
717 #define gin_rand() (((double) random()) / ((double) MAX_RANDOM_VALUE))
718 #define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) )
/*
 * Advance one entry stream past advancePast.  Three sources are handled:
 * a tidbitmap produced by collectMatchBitmap (first branch), an in-memory
 * posting list / final page already copied into entry->list (second), and
 * an active posting-tree scan that refills entry->list on demand (third).
 */
721 * Sets entry->curItem to next heap item pointer > advancePast, for one entry
722 * of one scan key, or sets entry->isFinished to TRUE if there are no more.
724 * Item pointers are returned in ascending order.
726 * Note: this can return a "lossy page" item pointer, indicating that the
727 * entry potentially matches all items on that heap page. However, it is
728 * not allowed to return both a lossy page pointer and exact (regular)
729 * item pointers for the same page. (Doing so would break the key-combination
730 * logic in keyGetItem and scanGetItem; see comment in scanGetItem.) In the
731 * current implementation this is guaranteed by the behavior of tidbitmaps.
734 entryGetItem(GinState *ginstate, GinScanEntry entry,
735 ItemPointerData advancePast)
737 Assert(!entry->isFinished);
739 Assert(!ItemPointerIsValid(&entry->curItem) ||
740 ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
742 if (entry->matchBitmap)
744 /* A bitmap result */
745 BlockNumber advancePastBlk = GinItemPointerGetBlockNumber(&advancePast);
746 OffsetNumber advancePastOff = GinItemPointerGetOffsetNumber(&advancePast);
747 bool gotitem = false;
752 * If we've exhausted all items on this block, move to next block
755 while (entry->matchResult == NULL ||
756 (entry->matchResult->ntuples >= 0 &&
757 entry->offset >= entry->matchResult->ntuples) ||
758 entry->matchResult->blockno < advancePastBlk ||
759 (ItemPointerIsLossyPage(&advancePast) &&
760 entry->matchResult->blockno == advancePastBlk))
762 entry->matchResult = tbm_iterate(entry->matchIterator);
/* Iterator exhausted: tear it down and finish the stream */
764 if (entry->matchResult == NULL)
766 ItemPointerSetInvalid(&entry->curItem);
767 tbm_end_iterate(entry->matchIterator);
768 entry->matchIterator = NULL;
769 entry->isFinished = TRUE;
774 * Reset counter to the beginning of entry->matchResult. Note:
775 * entry->offset is still greater than matchResult->ntuples if
776 * matchResult is lossy. So, on next call we will get next
777 * result from TIDBitmap.
781 if (entry->isFinished)
785 * We're now on the first page after advancePast which has any
786 * items on it. If it's a lossy result, return that.
788 if (entry->matchResult->ntuples < 0)
790 ItemPointerSetLossyPage(&entry->curItem,
791 entry->matchResult->blockno);
794 * We might as well fall out of the loop; we could not
795 * estimate number of results on this page to support correct
796 * reducing of result even if it's enabled.
803 * Not a lossy page. Skip over any offsets <= advancePast, and
806 if (entry->matchResult->blockno == advancePastBlk)
809 * First, do a quick check against the last offset on the
810 * page. If that's > advancePast, so are all the other
813 if (entry->matchResult->offsets[entry->matchResult->ntuples - 1] <= advancePastOff)
815 entry->offset = entry->matchResult->ntuples;
819 /* Otherwise scan to find the first item > advancePast */
820 while (entry->matchResult->offsets[entry->offset] <= advancePastOff)
824 ItemPointerSet(&entry->curItem,
825 entry->matchResult->blockno,
826 entry->matchResult->offsets[entry->offset]);
/* Keep looping if fuzzy-limit reduction randomly drops this item */
829 } while (!gotitem || (entry->reduceResult == TRUE && dropItem(entry)));
831 else if (!BufferIsValid(entry->buffer))
834 * A posting list from an entry tuple, or the last page of a posting
/* Exhausted the in-memory list: the stream is done */
839 if (entry->offset >= entry->nlist)
841 ItemPointerSetInvalid(&entry->curItem);
842 entry->isFinished = TRUE;
846 entry->curItem = entry->list[entry->offset++];
847 } while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
848 /* XXX: shouldn't we apply the fuzzy search limit here? */
855 /* If we've processed the current batch, load more items */
856 while (entry->offset >= entry->nlist)
858 entryLoadMoreItems(ginstate, entry, advancePast);
860 if (entry->isFinished)
862 ItemPointerSetInvalid(&entry->curItem);
867 entry->curItem = entry->list[entry->offset++];
869 } while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0 ||
870 (entry->reduceResult == TRUE && dropItem(entry)));
/*
 * Advance one scan key to the smallest entry-stream TID > advancePast,
 * then run the (ternary) consistent function to decide whether that TID
 * matches.  Lossy-page entry pointers are handled by a two-phase
 * consistent-function protocol described in the long comment below.
 */
875 * Identify the "current" item among the input entry streams for this scan key
876 * that is greater than advancePast, and test whether it passes the scan key
879 * The current item is the smallest curItem among the inputs. key->curItem
880 * is set to that value. key->curItemMatches is set to indicate whether that
881 * TID passes the consistentFn test. If so, key->recheckCurItem is set true
882 * iff recheck is needed for this item pointer (including the case where the
883 * item pointer is a lossy page pointer).
885 * If all entry streams are exhausted, sets key->isFinished to TRUE.
887 * Item pointers must be returned in ascending order.
889 * Note: this can return a "lossy page" item pointer, indicating that the
890 * key potentially matches all items on that heap page. However, it is
891 * not allowed to return both a lossy page pointer and exact (regular)
892 * item pointers for the same page. (Doing so would break the key-combination
893 * logic in scanGetItem.)
896 keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key,
897 ItemPointerData advancePast)
899 ItemPointerData minItem;
900 ItemPointerData curPageLossy;
905 MemoryContext oldCtx;
908 Assert(!key->isFinished);
911 * We might have already tested this item; if so, no need to repeat work.
912 * (Note: the ">" case can happen, if advancePast is exact but we
913 * previously had to set curItem to a lossy-page pointer.)
915 if (ginCompareItemPointers(&key->curItem, &advancePast) > 0)
919 * Find the minimum item > advancePast among the active entry streams.
921 * Note: a lossy-page entry is encoded by a ItemPointer with max value for
922 * offset (0xffff), so that it will sort after any exact entries for the
923 * same page. So we'll prefer to return exact pointers not lossy
924 * pointers, which is good.
926 ItemPointerSetMax(&minItem);
928 for (i = 0; i < key->nrequired; i++)
930 entry = key->requiredEntries[i];
932 if (entry->isFinished)
936 * Advance this stream if necessary.
938 * In particular, since entry->curItem was initialized with
939 * ItemPointerSetMin, this ensures we fetch the first item for each
940 * entry on the first call.
942 if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
944 entryGetItem(ginstate, entry, advancePast);
945 if (entry->isFinished)
950 if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
951 minItem = entry->curItem;
956 /* all entries are finished */
957 key->isFinished = TRUE;
962 * Ok, we now know that there are no matches < minItem.
964 * If minItem is lossy, it means that there were no exact items on the
965 * page among requiredEntries, because lossy pointers sort after exact
966 * items. However, there might be exact items for the same page among
967 * additionalEntries, so we mustn't advance past them.
969 if (ItemPointerIsLossyPage(&minItem))
971 if (GinItemPointerGetBlockNumber(&advancePast) <
972 GinItemPointerGetBlockNumber(&minItem))
/* Advance only to the start of minItem's block, not past it */
974 advancePast.ip_blkid = minItem.ip_blkid;
975 advancePast.ip_posid = 0;
/* Exact pointer: position advancePast immediately before minItem */
980 Assert(minItem.ip_posid > 0);
981 advancePast = minItem;
982 advancePast.ip_posid--;
986 * We might not have loaded all the entry streams for this TID yet. We
987 * could call the consistent function, passing MAYBE for those entries, to
988 * see if it can decide if this TID matches based on the information we
989 * have. But if the consistent-function is expensive, and cannot in fact
990 * decide with partial information, that could be a big loss. So, load all
991 * the additional entries, before calling the consistent function.
993 for (i = 0; i < key->nadditional; i++)
995 entry = key->additionalEntries[i];
997 if (entry->isFinished)
1000 if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
1002 entryGetItem(ginstate, entry, advancePast);
1003 if (entry->isFinished)
1008 * Normally, none of the items in additionalEntries can have a curItem
1009 * larger than minItem. But if minItem is a lossy page, then there
1010 * might be exact items on the same page among additionalEntries.
1012 if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
1014 Assert(ItemPointerIsLossyPage(&minItem));
1015 minItem = entry->curItem;
1020 * Ok, we've advanced all the entries up to minItem now. Set key->curItem,
1021 * and perform consistentFn test.
1023 * Lossy-page entries pose a problem, since we don't know the correct
1024 * entryRes state to pass to the consistentFn, and we also don't know what
1025 * its combining logic will be (could be AND, OR, or even NOT). If the
1026 * logic is OR then the consistentFn might succeed for all items in the
1027 * lossy page even when none of the other entries match.
1029 * Our strategy is to call the tri-state consistent function, with the
1030 * lossy-page entries set to MAYBE, and all the other entries FALSE. If it
1031 * returns FALSE, none of the lossy items alone are enough for a match, so
1032 * we don't need to return a lossy-page pointer. Otherwise, return a
1033 * lossy-page pointer to indicate that the whole heap page must be
1034 * checked. (On subsequent calls, we'll do nothing until minItem is past
1035 * the page altogether, thus ensuring that we never return both regular
1036 * and lossy pointers for the same page.)
1038 * An exception is that it doesn't matter what we pass for lossy pointers
1039 * in "hidden" entries, because the consistentFn's result can't depend on
1040 * them. We could pass them as MAYBE as well, but if we're using the
1041 * "shim" implementation of a tri-state consistent function (see
1042 * ginlogic.c), it's better to pass as few MAYBEs as possible. So pass
1045 * Note that only lossy-page entries pointing to the current item's page
1046 * should trigger this processing; we might have future lossy pages in the
1047 * entry array, but they aren't relevant yet.
1049 key->curItem = minItem;
1050 ItemPointerSetLossyPage(&curPageLossy,
1051 GinItemPointerGetBlockNumber(&key->curItem));
1052 haveLossyEntry = false;
/* Phase 1 setup: lossy-on-this-page entries MAYBE (TRUE if hidden), rest FALSE */
1053 for (i = 0; i < key->nentries; i++)
1055 entry = key->scanEntry[i];
1056 if (entry->isFinished == FALSE &&
1057 ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
1059 if (i < key->nuserentries)
1060 key->entryRes[i] = GIN_MAYBE;
1062 key->entryRes[i] = GIN_TRUE;
1063 haveLossyEntry = true;
1066 key->entryRes[i] = GIN_FALSE;
1069 /* prepare for calling consistentFn in temp context */
1070 oldCtx = MemoryContextSwitchTo(tempCtx);
1074 /* Have lossy-page entries, so see if whole page matches */
1075 res = key->triConsistentFn(key);
1077 if (res == GIN_TRUE || res == GIN_MAYBE)
1079 /* Yes, so clean up ... */
1080 MemoryContextSwitchTo(oldCtx);
1081 MemoryContextReset(tempCtx);
1083 /* and return lossy pointer for whole page */
1084 key->curItem = curPageLossy;
1085 key->curItemMatches = true;
1086 key->recheckCurItem = true;
1092 * At this point we know that we don't need to return a lossy whole-page
1093 * pointer, but we might have matches for individual exact item pointers,
1094 * possibly in combination with a lossy pointer. Pass lossy pointers as
1095 * MAYBE to the ternary consistent function, to let it decide if this
1096 * tuple satisfies the overall key, even though we don't know if the lossy
1099 * Prepare entryRes array to be passed to consistentFn.
1101 for (i = 0; i < key->nentries; i++)
1103 entry = key->scanEntry[i];
1104 if (entry->isFinished)
1105 key->entryRes[i] = GIN_FALSE;
1109 * This case can't currently happen, because we loaded all the entries
1110 * for this item earlier.
1112 else if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
1113 key->entryRes[i] = GIN_MAYBE;
1115 else if (ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
1116 key->entryRes[i] = GIN_MAYBE;
1117 else if (ginCompareItemPointers(&entry->curItem, &minItem) == 0)
1118 key->entryRes[i] = GIN_TRUE;
1120 key->entryRes[i] = GIN_FALSE;
1123 res = key->triConsistentFn(key);
1128 key->curItemMatches = true;
1129 /* triConsistentFn set recheckCurItem */
1133 key->curItemMatches = false;
/* GIN_MAYBE: match, but heap tuple must be rechecked */
1137 key->curItemMatches = true;
1138 key->recheckCurItem = true;
1144 * the 'default' case shouldn't happen, but if the consistent
1145 * function returns something bogus, this is the safe result
1147 key->curItemMatches = true;
1148 key->recheckCurItem = true;
1153 * We have a tuple, and we know if it matches or not. If it's a non-match,
1154 * we could continue to find the next matching tuple, but let's break out
1155 * and give scanGetItem a chance to advance the other keys. They might be
1156 * able to skip past to a much higher TID, allowing us to save work.
1159 /* clean up after consistentFn calls */
1160 MemoryContextSwitchTo(oldCtx);
1161 MemoryContextReset(tempCtx);
/*
 * Top-level item fetch: advance all scan keys in lock-step (AND semantics)
 * until every key agrees on the same TID (or the same lossy page), then
 * report it together with the OR of the keys' recheck flags.
 */
1165 * Get next heap item pointer (after advancePast) from scan.
1166 * Returns true if anything found.
1167 * On success, *item and *recheck are set.
1169 * Note: this is very nearly the same logic as in keyGetItem(), except
1170 * that we know the keys are to be combined with AND logic, whereas in
1171 * keyGetItem() the combination logic is known only to the consistentFn.
1174 scanGetItem(IndexScanDesc scan, ItemPointerData advancePast,
1175 ItemPointerData *item, bool *recheck)
1177 GinScanOpaque so = (GinScanOpaque) scan->opaque;
1182 * Advance the scan keys in lock-step, until we find an item that matches
1183 * all the keys. If any key reports isFinished, meaning its subset of the
1184 * entries is exhausted, we can stop. Otherwise, set *item to the next
1187 * This logic works only if a keyGetItem stream can never contain both
1188 * exact and lossy pointers for the same page. Else we could have a
1197 * We would conclude that 42/6 is not a match and advance stream 1,
1198 * thus never detecting the match to the lossy pointer in stream 2.
1199 * (keyGetItem has a similar problem versus entryGetItem.)
1204 ItemPointerSetMin(item);
1206 for (i = 0; i < so->nkeys && match; i++)
1208 GinScanKey key = so->keys + i;
1210 /* Fetch the next item for this key that is > advancePast. */
1211 keyGetItem(&so->ginstate, so->tempCtx, key, advancePast);
/* Any exhausted key finishes the whole AND-combined scan */
1213 if (key->isFinished)
1217 * If it's not a match, we can immediately conclude that nothing
1218 * <= this item matches, without checking the rest of the keys.
1220 if (!key->curItemMatches)
1222 advancePast = key->curItem;
1228 * It's a match. We can conclude that nothing < matches, so the
1229 * other key streams can skip to this item.
1231 * Beware of lossy pointers, though; from a lossy pointer, we can
1232 * only conclude that nothing smaller than this *block* matches.
1234 if (ItemPointerIsLossyPage(&key->curItem))
1236 if (GinItemPointerGetBlockNumber(&advancePast) <
1237 GinItemPointerGetBlockNumber(&key->curItem))
1239 advancePast.ip_blkid = key->curItem.ip_blkid;
1240 advancePast.ip_posid = 0;
1245 Assert(key->curItem.ip_posid > 0);
1246 advancePast = key->curItem;
1247 advancePast.ip_posid--;
1251 * If this is the first key, remember this location as a potential
1252 * match, and proceed to check the rest of the keys.
1254 * Otherwise, check if this is the same item that we checked the
1255 * previous keys for (or a lossy pointer for the same page). If
1256 * not, loop back to check the previous keys for this item (we
1257 * will check this key again too, but keyGetItem returns quickly
1262 *item = key->curItem;
/* Lossy on either side: same block counts as the same "item" */
1266 if (ItemPointerIsLossyPage(&key->curItem) ||
1267 ItemPointerIsLossyPage(item))
1269 Assert(GinItemPointerGetBlockNumber(&key->curItem) >= GinItemPointerGetBlockNumber(item));
1270 match = (GinItemPointerGetBlockNumber(&key->curItem) ==
1271 GinItemPointerGetBlockNumber(item));
1275 Assert(ginCompareItemPointers(&key->curItem, item) >= 0);
1276 match = (ginCompareItemPointers(&key->curItem, item) == 0);
1282 Assert(!ItemPointerIsMin(item));
1285 * Now *item contains the first ItemPointer after previous result that
1286 * satisfied all the keys for that exact TID, or a lossy reference to the
1289 * We must return recheck = true if any of the keys are marked recheck.
1292 for (i = 0; i < so->nkeys; i++)
1294 GinScanKey key = so->keys + i;
1296 if (key->recheckCurItem)
1308 * Functions for scanning the pending list
1313 * Get ItemPointer of next heap row to be checked from pending list.
1314 * Returns false if there are no more. On pages with several heap rows
1315 * it returns each row separately, on page with part of heap row returns
1316 * per page data. pos->firstOffset and pos->lastOffset are set to identify
1317 * the range of pending-list tuples belonging to this heap row.
1319 * The pendingBuffer is presumed pinned and share-locked on entry, and is
1320 * pinned and share-locked on success exit. On failure exit it's released.
1323 scanGetCandidate(IndexScanDesc scan, pendingPosition *pos)
1325 OffsetNumber maxoff;
1329 ItemPointerSetInvalid(&pos->item);
1332 page = BufferGetPage(pos->pendingBuffer);
1334 maxoff = PageGetMaxOffsetNumber(page);
/* Past the last tuple on this page: follow the rightlink to the next one. */
1335 if (pos->firstOffset > maxoff)
1337 BlockNumber blkno = GinPageGetOpaque(page)->rightlink;
/* Pending list exhausted: release the buffer and signal no more rows. */
1339 if (blkno == InvalidBlockNumber)
1341 UnlockReleaseBuffer(pos->pendingBuffer);
1342 pos->pendingBuffer = InvalidBuffer;
1349 * Here we must prevent deletion of next page by insertcleanup
1350 * process, which may be trying to obtain exclusive lock on
1351 * current page. So, we lock next page before releasing the
1354 Buffer tmpbuf = ReadBuffer(scan->indexRelation, blkno);
1356 LockBuffer(tmpbuf, GIN_SHARE);
1357 UnlockReleaseBuffer(pos->pendingBuffer);
1359 pos->pendingBuffer = tmpbuf;
1360 pos->firstOffset = FirstOffsetNumber;
1365 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->firstOffset));
/* pos->item is the heap TID shared by the run of tuples starting here. */
1366 pos->item = itup->t_tid;
1367 if (GinPageHasFullRow(page))
1370 * find itempointer to the next row
1372 for (pos->lastOffset = pos->firstOffset + 1; pos->lastOffset <= maxoff; pos->lastOffset++)
1374 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->lastOffset));
1375 if (!ItemPointerEquals(&pos->item, &itup->t_tid))
1382 * All itempointers are the same on this page
1384 pos->lastOffset = maxoff + 1;
1388 * Now pos->firstOffset points to the first tuple of current heap
1389 * row, pos->lastOffset points to the first tuple of next heap row
1390 * (or to the end of page)
1400 * Scan pending-list page from current tuple (off) up till the first of:
1401 * - match is found (then returns true)
1402 * - no later match is possible
1403 * - tuple's attribute number is not equal to entry's attrnum
1404 * - reach end of page
1406 * datum[]/category[]/datumExtracted[] arrays are used to cache the results
1407 * of gintuple_get_key() on the current page.
1410 matchPartialInPendingList(GinState *ginstate, Page page,
1411 OffsetNumber off, OffsetNumber maxoff,
1413 Datum *datum, GinNullCategory *category,
1414 bool *datumExtracted)
1419 /* Partial match to a null is not possible */
1420 if (entry->queryCategory != GIN_CAT_NORM_KEY)
1423 while (off < maxoff)
1425 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
/* A different attribute number ends the run of interesting tuples. */
1427 if (gintuple_get_attrnum(ginstate, itup) != entry->attnum)
/* Extract and cache the key lazily; cache arrays are indexed by offset-1. */
1430 if (datumExtracted[off - 1] == false)
1432 datum[off - 1] = gintuple_get_key(ginstate, itup,
1433 &category[off - 1]);
1434 datumExtracted[off - 1] = true;
1437 /* Once we hit nulls, no further match is possible */
1438 if (category[off - 1] != GIN_CAT_NORM_KEY)
1442 * Check partial match.
1443 * case cmp == 0 => match
1444 * case cmp > 0 => not match and end scan (no later match possible)
1445 * case cmp < 0 => not match and continue scan
1448 cmp = DatumGetInt32(FunctionCall4Coll(&ginstate->comparePartialFn[entry->attnum - 1],
1449 ginstate->supportCollation[entry->attnum - 1],
1452 UInt16GetDatum(entry->strategy),
1453 PointerGetDatum(entry->extra_data)));
1466 * Set up the entryRes array for each key by looking at
1467 * every entry for current heap row in pending list.
1469 * Returns true if each scan key has at least one entryRes match.
1470 * This corresponds to the situations where the normal index search will
1471 * try to apply the key's consistentFn. (A tuple not meeting that requirement
1472 * cannot be returned by the normal search since no entry stream will
1475 * The pendingBuffer is presumed pinned and share-locked on entry.
1478 collectMatchesForHeapRow(IndexScanDesc scan, pendingPosition *pos)
1480 GinScanOpaque so = (GinScanOpaque) scan->opaque;
1481 OffsetNumber attrnum;
1488 * Reset all entryRes and hasMatchKey flags
1490 for (i = 0; i < so->nkeys; i++)
1492 GinScanKey key = so->keys + i;
1494 memset(key->entryRes, GIN_FALSE, key->nentries);
1496 memset(pos->hasMatchKey, FALSE, so->nkeys);
1499 * Outer loop iterates over multiple pending-list pages when a single heap
1500 * row has entries spanning those pages.
/* Per-offset caches of extracted keys, sized for the most tuples a page can hold. */
1504 Datum datum[BLCKSZ / sizeof(IndexTupleData)];
1505 GinNullCategory category[BLCKSZ / sizeof(IndexTupleData)];
1506 bool datumExtracted[BLCKSZ / sizeof(IndexTupleData)];
1508 Assert(pos->lastOffset > pos->firstOffset);
1509 memset(datumExtracted + pos->firstOffset - 1, 0,
1510 sizeof(bool) * (pos->lastOffset - pos->firstOffset));
1512 page = BufferGetPage(pos->pendingBuffer);
1514 for (i = 0; i < so->nkeys; i++)
1516 GinScanKey key = so->keys + i;
1518 for (j = 0; j < key->nentries; j++)
1520 GinScanEntry entry = key->scanEntry[j];
1521 OffsetNumber StopLow = pos->firstOffset,
1522 StopHigh = pos->lastOffset,
1525 /* If already matched on earlier page, do no extra work */
1526 if (key->entryRes[j])
1530 * Interesting tuples are from pos->firstOffset to
1531 * pos->lastOffset and they are ordered by (attnum, Datum) as
1532 * it's done in entry tree. So we can use binary search to
1533 * avoid linear scanning.
1535 while (StopLow < StopHigh)
1539 StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
1541 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, StopMiddle));
1543 attrnum = gintuple_get_attrnum(&so->ginstate, itup);
1545 if (key->attnum < attrnum)
1547 StopHigh = StopMiddle;
1550 if (key->attnum > attrnum)
1552 StopLow = StopMiddle + 1;
/* Same attribute: compare by key value; extract and cache it if not yet done. */
1556 if (datumExtracted[StopMiddle - 1] == false)
1558 datum[StopMiddle - 1] =
1559 gintuple_get_key(&so->ginstate, itup,
1560 &category[StopMiddle - 1]);
1561 datumExtracted[StopMiddle - 1] = true;
1564 if (entry->queryCategory == GIN_CAT_EMPTY_QUERY)
1566 /* special behavior depending on searchMode */
1567 if (entry->searchMode == GIN_SEARCH_MODE_ALL)
1569 /* match anything except NULL_ITEM */
1570 if (category[StopMiddle - 1] == GIN_CAT_NULL_ITEM)
1577 /* match everything */
1583 res = ginCompareEntries(&so->ginstate,
1586 entry->queryCategory,
1587 datum[StopMiddle - 1],
1588 category[StopMiddle - 1]);
1594 * Found exact match (there can be only one, except in
1595 * EMPTY_QUERY mode).
1597 * If doing partial match, scan forward from here to
1598 * end of page to check for matches.
1600 * See comment above about tuple's ordering.
1602 if (entry->isPartialMatch)
1604 matchPartialInPendingList(&so->ginstate,
1613 key->entryRes[j] = true;
1615 /* done with binary search */
1619 StopHigh = StopMiddle;
1621 StopLow = StopMiddle + 1;
1624 if (StopLow >= StopHigh && entry->isPartialMatch)
1627 * No exact match on this page. If doing partial match,
1628 * scan from the first tuple greater than target value to
1629 * end of page. Note that since we don't remember whether
1630 * the comparePartialFn told us to stop early on a
1631 * previous page, we will uselessly apply comparePartialFn
1632 * to the first tuple on each subsequent page.
1635 matchPartialInPendingList(&so->ginstate,
/* Key i matches the row if any one of its entries matched. */
1645 pos->hasMatchKey[i] |= key->entryRes[j];
1649 /* Advance firstOffset over the scanned tuples */
1650 pos->firstOffset = pos->lastOffset;
1652 if (GinPageHasFullRow(page))
1655 * We have examined all pending entries for the current heap row.
1656 * Break out of loop over pages.
1663 * Advance to next page of pending entries for the current heap
1664 * row. Complain if there isn't one.
1666 ItemPointerData item = pos->item;
1668 if (scanGetCandidate(scan, pos) == false ||
1669 !ItemPointerEquals(&pos->item, &item))
1670 elog(ERROR, "could not find additional pending pages for same heap tuple");
1675 * Now return "true" if all scan keys have at least one matching datum
1677 for (i = 0; i < so->nkeys; i++)
1679 if (pos->hasMatchKey[i] == false)
1687 * Collect all matched rows from pending list into bitmap
1690 scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
1692 GinScanOpaque so = (GinScanOpaque) scan->opaque;
1693 MemoryContext oldCtx;
1697 pendingPosition pos;
1698 Buffer metabuffer = ReadBuffer(scan->indexRelation, GIN_METAPAGE_BLKNO);
1703 LockBuffer(metabuffer, GIN_SHARE);
1704 blkno = GinPageGetMeta(BufferGetPage(metabuffer))->head;
1707 * fetch head of list before unlocking metapage. head page must be pinned
1708 * to prevent deletion by vacuum process
1710 if (blkno == InvalidBlockNumber)
1712 /* No pending list, so proceed with normal scan */
1713 UnlockReleaseBuffer(metabuffer)
1717 pos.pendingBuffer = ReadBuffer(scan->indexRelation, blkno);
1718 LockBuffer(pos.pendingBuffer, GIN_SHARE);
1719 pos.firstOffset = FirstOffsetNumber;
1720 UnlockReleaseBuffer(metabuffer);
/* One match flag per scan key, filled in by collectMatchesForHeapRow. */
1721 pos.hasMatchKey = palloc(sizeof(bool) * so->nkeys);
1724 * loop for each heap row. scanGetCandidate returns full row or row's
1725 * tuples from first page.
1727 while (scanGetCandidate(scan, &pos))
1730 * Check entries in tuple and set up entryRes array.
1732 * If pending tuples belonging to the current heap row are spread
1733 * across several pages, collectMatchesForHeapRow will read all of
1736 if (!collectMatchesForHeapRow(scan, &pos))
1740 * Matching of entries of one row is finished, so check row using
1741 * consistent functions.
1743 oldCtx = MemoryContextSwitchTo(so->tempCtx);
1747 for (i = 0; i < so->nkeys; i++)
1749 GinScanKey key = so->keys + i;
1751 if (!key->boolConsistentFn(key))
1756 recheck |= key->recheckCurItem;
1759 MemoryContextSwitchTo(oldCtx);
1760 MemoryContextReset(so->tempCtx);
/* Row survived the consistent-fn checks: add its heap TID to the bitmap. */
1764 tbm_add_tuples(tbm, &pos.item, 1, recheck);
1769 pfree(pos.hasMatchKey);
/*
 * NOTE(review): this macro ignores its parameter "s" and reads "scan" from
 * the caller's scope -- works for the call site below, but confirm before
 * reusing elsewhere.
 */
1773 #define GinIsVoidRes(s) ( ((GinScanOpaque) scan->opaque)->isVoidRes )
1776 gingetbitmap(PG_FUNCTION_ARGS)
1778 IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
1779 TIDBitmap *tbm = (TIDBitmap *) PG_GETARG_POINTER(1);
1780 GinScanOpaque so = (GinScanOpaque) scan->opaque;
1782 ItemPointerData iptr;
1786 * Set up the scan keys, and check for unsatisfiable query.
1788 ginFreeScanKeys(so); /* there should be no keys yet, but just to be sure */
1789 ginNewScanKey(scan);
1791 if (GinIsVoidRes(scan))
1797 * First, scan the pending list and collect any matching entries into the
1798 * bitmap. After we scan a pending item, some other backend could post it
1799 * into the main index, and so we might visit it a second time during the
1800 * main scan. This is okay because we'll just re-set the same bit in the
1801 * bitmap. (The possibility of duplicate visits is a major reason why GIN
1802 * can't support the amgettuple API, however.) Note that it would not do
1803 * to scan the main index before the pending list, since concurrent
1804 * cleanup could then make us miss entries entirely.
1806 scanPendingInsert(scan, tbm, &ntids);
1809 * Now scan the main index.
/* Start from before any valid TID; scanGetItem advances past it. */
1813 ItemPointerSetMin(&iptr);
1817 CHECK_FOR_INTERRUPTS();
1819 if (!scanGetItem(scan, iptr, &iptr, &recheck))
/* A lossy pointer covers a whole page; an exact pointer is a single tuple. */
1822 if (ItemPointerIsLossyPage(&iptr))
1823 tbm_add_page(tbm, ItemPointerGetBlockNumber(&iptr));
1825 tbm_add_tuples(tbm, &iptr, 1, recheck);
1829 PG_RETURN_INT64(ntids);