]> granicus.if.org Git - postgresql/blob - src/backend/access/gin/ginget.c
Fix initialization of fake LSN for unlogged relations
[postgresql] / src / backend / access / gin / ginget.c
1 /*-------------------------------------------------------------------------
2  *
3  * ginget.c
4  *        fetch tuples from a GIN scan.
5  *
6  *
7  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * IDENTIFICATION
11  *                      src/backend/access/gin/ginget.c
12  *-------------------------------------------------------------------------
13  */
14
15 #include "postgres.h"
16
17 #include "access/gin_private.h"
18 #include "access/relscan.h"
19 #include "miscadmin.h"
20 #include "storage/predicate.h"
21 #include "utils/datum.h"
22 #include "utils/memutils.h"
23 #include "utils/rel.h"
24
25 /* GUC parameter */
26 int                     GinFuzzySearchLimit = 0;
27
28 typedef struct pendingPosition
29 {
30         Buffer          pendingBuffer;
31         OffsetNumber firstOffset;
32         OffsetNumber lastOffset;
33         ItemPointerData item;
34         bool       *hasMatchKey;
35 } pendingPosition;
36
37
38 /*
39  * Goes to the next page if current offset is outside of bounds
40  */
41 static bool
42 moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack, Snapshot snapshot)
43 {
44         Page            page = BufferGetPage(stack->buffer);
45
46         if (stack->off > PageGetMaxOffsetNumber(page))
47         {
48                 /*
49                  * We scanned the whole page, so we should take right page
50                  */
51                 if (GinPageRightMost(page))
52                         return false;           /* no more pages */
53
54                 stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE);
55                 stack->blkno = BufferGetBlockNumber(stack->buffer);
56                 stack->off = FirstOffsetNumber;
57                 PredicateLockPage(btree->index, stack->blkno, snapshot);
58         }
59
60         return true;
61 }
62
63 /*
64  * Scan all pages of a posting tree and save all its heap ItemPointers
65  * in scanEntry->matchBitmap
66  */
67 static void
68 scanPostingTree(Relation index, GinScanEntry scanEntry,
69                                 BlockNumber rootPostingTree, Snapshot snapshot)
70 {
71         GinBtreeData btree;
72         GinBtreeStack *stack;
73         Buffer          buffer;
74         Page            page;
75
76         /* Descend to the leftmost leaf page */
77         stack = ginScanBeginPostingTree(&btree, index, rootPostingTree, snapshot);
78         buffer = stack->buffer;
79
80         IncrBufferRefCount(buffer); /* prevent unpin in freeGinBtreeStack */
81
82         freeGinBtreeStack(stack);
83
84         /*
85          * Loop iterates through all leaf pages of posting tree
86          */
87         for (;;)
88         {
89                 page = BufferGetPage(buffer);
90                 if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0)
91                 {
92                         int                     n = GinDataLeafPageGetItemsToTbm(page, scanEntry->matchBitmap);
93
94                         scanEntry->predictNumberResult += n;
95                 }
96
97                 if (GinPageRightMost(page))
98                         break;                          /* no more pages */
99
100                 buffer = ginStepRight(buffer, index, GIN_SHARE);
101         }
102
103         UnlockReleaseBuffer(buffer);
104 }
105
106 /*
107  * Collects TIDs into scanEntry->matchBitmap for all heap tuples that
108  * match the search entry.  This supports three different match modes:
109  *
110  * 1. Partial-match support: scan from current point until the
111  *        comparePartialFn says we're done.
112  * 2. SEARCH_MODE_ALL: scan from current point (which should be first
113  *        key for the current attnum) until we hit null items or end of attnum
114  * 3. SEARCH_MODE_EVERYTHING: scan from current point (which should be first
115  *        key for the current attnum) until we hit end of attnum
116  *
117  * Returns true if done, false if it's necessary to restart scan from scratch
118  */
119 static bool
120 collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
121                                    GinScanEntry scanEntry, Snapshot snapshot)
122 {
123         OffsetNumber attnum;
124         Form_pg_attribute attr;
125
126         /* Initialize empty bitmap result */
127         scanEntry->matchBitmap = tbm_create(work_mem * 1024L, NULL);
128
129         /* Null query cannot partial-match anything */
130         if (scanEntry->isPartialMatch &&
131                 scanEntry->queryCategory != GIN_CAT_NORM_KEY)
132                 return true;
133
134         /* Locate tupdesc entry for key column (for attbyval/attlen data) */
135         attnum = scanEntry->attnum;
136         attr = TupleDescAttr(btree->ginstate->origTupdesc, attnum - 1);
137
138         /*
139          * Predicate lock entry leaf page, following pages will be locked by
140          * moveRightIfItNeeded()
141          */
142         PredicateLockPage(btree->index, stack->buffer, snapshot);
143
144         for (;;)
145         {
146                 Page            page;
147                 IndexTuple      itup;
148                 Datum           idatum;
149                 GinNullCategory icategory;
150
151                 /*
152                  * stack->off points to the interested entry, buffer is already locked
153                  */
154                 if (moveRightIfItNeeded(btree, stack, snapshot) == false)
155                         return true;
156
157                 page = BufferGetPage(stack->buffer);
158                 TestForOldSnapshot(snapshot, btree->index, page);
159                 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
160
161                 /*
162                  * If tuple stores another attribute then stop scan
163                  */
164                 if (gintuple_get_attrnum(btree->ginstate, itup) != attnum)
165                         return true;
166
167                 /* Safe to fetch attribute value */
168                 idatum = gintuple_get_key(btree->ginstate, itup, &icategory);
169
170                 /*
171                  * Check for appropriate scan stop conditions
172                  */
173                 if (scanEntry->isPartialMatch)
174                 {
175                         int32           cmp;
176
177                         /*
178                          * In partial match, stop scan at any null (including
179                          * placeholders); partial matches never match nulls
180                          */
181                         if (icategory != GIN_CAT_NORM_KEY)
182                                 return true;
183
184                         /*----------
185                          * Check of partial match.
186                          * case cmp == 0 => match
187                          * case cmp > 0 => not match and finish scan
188                          * case cmp < 0 => not match and continue scan
189                          *----------
190                          */
191                         cmp = DatumGetInt32(FunctionCall4Coll(&btree->ginstate->comparePartialFn[attnum - 1],
192                                                                                                   btree->ginstate->supportCollation[attnum - 1],
193                                                                                                   scanEntry->queryKey,
194                                                                                                   idatum,
195                                                                                                   UInt16GetDatum(scanEntry->strategy),
196                                                                                                   PointerGetDatum(scanEntry->extra_data)));
197
198                         if (cmp > 0)
199                                 return true;
200                         else if (cmp < 0)
201                         {
202                                 stack->off++;
203                                 continue;
204                         }
205                 }
206                 else if (scanEntry->searchMode == GIN_SEARCH_MODE_ALL)
207                 {
208                         /*
209                          * In ALL mode, we are not interested in null items, so we can
210                          * stop if we get to a null-item placeholder (which will be the
211                          * last entry for a given attnum).  We do want to include NULL_KEY
212                          * and EMPTY_ITEM entries, though.
213                          */
214                         if (icategory == GIN_CAT_NULL_ITEM)
215                                 return true;
216                 }
217
218                 /*
219                  * OK, we want to return the TIDs listed in this entry.
220                  */
221                 if (GinIsPostingTree(itup))
222                 {
223                         BlockNumber rootPostingTree = GinGetPostingTree(itup);
224
225                         /*
226                          * We should unlock current page (but not unpin) during tree scan
227                          * to prevent deadlock with vacuum processes.
228                          *
229                          * We save current entry value (idatum) to be able to re-find our
230                          * tuple after re-locking
231                          */
232                         if (icategory == GIN_CAT_NORM_KEY)
233                                 idatum = datumCopy(idatum, attr->attbyval, attr->attlen);
234
235                         LockBuffer(stack->buffer, GIN_UNLOCK);
236
237                         /*
238                          * Acquire predicate lock on the posting tree.  We already hold a
239                          * lock on the entry page, but insertions to the posting tree
240                          * don't check for conflicts on that level.
241                          */
242                         PredicateLockPage(btree->index, rootPostingTree, snapshot);
243
244                         /* Collect all the TIDs in this entry's posting tree */
245                         scanPostingTree(btree->index, scanEntry, rootPostingTree,
246                                                         snapshot);
247
248                         /*
249                          * We lock again the entry page and while it was unlocked insert
250                          * might have occurred, so we need to re-find our position.
251                          */
252                         LockBuffer(stack->buffer, GIN_SHARE);
253                         page = BufferGetPage(stack->buffer);
254                         if (!GinPageIsLeaf(page))
255                         {
256                                 /*
257                                  * Root page becomes non-leaf while we unlock it. We will
258                                  * start again, this situation doesn't occur often - root can
259                                  * became a non-leaf only once per life of index.
260                                  */
261                                 return false;
262                         }
263
264                         /* Search forward to re-find idatum */
265                         for (;;)
266                         {
267                                 Datum           newDatum;
268                                 GinNullCategory newCategory;
269
270                                 if (moveRightIfItNeeded(btree, stack, snapshot) == false)
271                                         elog(ERROR, "lost saved point in index");       /* must not happen !!! */
272
273                                 page = BufferGetPage(stack->buffer);
274                                 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
275
276                                 if (gintuple_get_attrnum(btree->ginstate, itup) != attnum)
277                                         elog(ERROR, "lost saved point in index");       /* must not happen !!! */
278                                 newDatum = gintuple_get_key(btree->ginstate, itup,
279                                                                                         &newCategory);
280
281                                 if (ginCompareEntries(btree->ginstate, attnum,
282                                                                           newDatum, newCategory,
283                                                                           idatum, icategory) == 0)
284                                         break;          /* Found! */
285
286                                 stack->off++;
287                         }
288
289                         if (icategory == GIN_CAT_NORM_KEY && !attr->attbyval)
290                                 pfree(DatumGetPointer(idatum));
291                 }
292                 else
293                 {
294                         ItemPointer ipd;
295                         int                     nipd;
296
297                         ipd = ginReadTuple(btree->ginstate, scanEntry->attnum, itup, &nipd);
298                         tbm_add_tuples(scanEntry->matchBitmap, ipd, nipd, false);
299                         scanEntry->predictNumberResult += GinGetNPosting(itup);
300                         pfree(ipd);
301                 }
302
303                 /*
304                  * Done with this entry, go to the next
305                  */
306                 stack->off++;
307         }
308 }
309
310 /*
311  * Start* functions setup beginning state of searches: finds correct buffer and pins it.
312  */
313 static void
314 startScanEntry(GinState *ginstate, GinScanEntry entry, Snapshot snapshot)
315 {
316         GinBtreeData btreeEntry;
317         GinBtreeStack *stackEntry;
318         Page            page;
319         bool            needUnlock;
320
321 restartScanEntry:
322         entry->buffer = InvalidBuffer;
323         ItemPointerSetMin(&entry->curItem);
324         entry->offset = InvalidOffsetNumber;
325         if (entry->list)
326                 pfree(entry->list);
327         entry->list = NULL;
328         entry->nlist = 0;
329         entry->matchBitmap = NULL;
330         entry->matchResult = NULL;
331         entry->reduceResult = false;
332         entry->predictNumberResult = 0;
333
334         /*
335          * we should find entry, and begin scan of posting tree or just store
336          * posting list in memory
337          */
338         ginPrepareEntryScan(&btreeEntry, entry->attnum,
339                                                 entry->queryKey, entry->queryCategory,
340                                                 ginstate);
341         stackEntry = ginFindLeafPage(&btreeEntry, true, false, snapshot);
342         page = BufferGetPage(stackEntry->buffer);
343
344         /* ginFindLeafPage() will have already checked snapshot age. */
345         needUnlock = true;
346
347         entry->isFinished = true;
348
349         if (entry->isPartialMatch ||
350                 entry->queryCategory == GIN_CAT_EMPTY_QUERY)
351         {
352                 /*
353                  * btreeEntry.findItem locates the first item >= given search key.
354                  * (For GIN_CAT_EMPTY_QUERY, it will find the leftmost index item
355                  * because of the way the GIN_CAT_EMPTY_QUERY category code is
356                  * assigned.)  We scan forward from there and collect all TIDs needed
357                  * for the entry type.
358                  */
359                 btreeEntry.findItem(&btreeEntry, stackEntry);
360                 if (collectMatchBitmap(&btreeEntry, stackEntry, entry, snapshot)
361                         == false)
362                 {
363                         /*
364                          * GIN tree was seriously restructured, so we will cleanup all
365                          * found data and rescan. See comments near 'return false' in
366                          * collectMatchBitmap()
367                          */
368                         if (entry->matchBitmap)
369                         {
370                                 if (entry->matchIterator)
371                                         tbm_end_iterate(entry->matchIterator);
372                                 entry->matchIterator = NULL;
373                                 tbm_free(entry->matchBitmap);
374                                 entry->matchBitmap = NULL;
375                         }
376                         LockBuffer(stackEntry->buffer, GIN_UNLOCK);
377                         freeGinBtreeStack(stackEntry);
378                         goto restartScanEntry;
379                 }
380
381                 if (entry->matchBitmap && !tbm_is_empty(entry->matchBitmap))
382                 {
383                         entry->matchIterator = tbm_begin_iterate(entry->matchBitmap);
384                         entry->isFinished = false;
385                 }
386         }
387         else if (btreeEntry.findItem(&btreeEntry, stackEntry))
388         {
389                 IndexTuple      itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off));
390
391                 if (GinIsPostingTree(itup))
392                 {
393                         BlockNumber rootPostingTree = GinGetPostingTree(itup);
394                         GinBtreeStack *stack;
395                         Page            page;
396                         ItemPointerData minItem;
397
398                         /*
399                          * This is an equality scan, so lock the root of the posting tree.
400                          * It represents a lock on the exact key value, and covers all the
401                          * items in the posting tree.
402                          */
403                         PredicateLockPage(ginstate->index, rootPostingTree, snapshot);
404
405                         /*
406                          * We should unlock entry page before touching posting tree to
407                          * prevent deadlocks with vacuum processes. Because entry is never
408                          * deleted from page and posting tree is never reduced to the
409                          * posting list, we can unlock page after getting BlockNumber of
410                          * root of posting tree.
411                          */
412                         LockBuffer(stackEntry->buffer, GIN_UNLOCK);
413                         needUnlock = false;
414
415                         stack = ginScanBeginPostingTree(&entry->btree, ginstate->index,
416                                                                                         rootPostingTree, snapshot);
417                         entry->buffer = stack->buffer;
418
419                         /*
420                          * We keep buffer pinned because we need to prevent deletion of
421                          * page during scan. See GIN's vacuum implementation. RefCount is
422                          * increased to keep buffer pinned after freeGinBtreeStack() call.
423                          */
424                         IncrBufferRefCount(entry->buffer);
425
426                         page = BufferGetPage(entry->buffer);
427
428                         /*
429                          * Load the first page into memory.
430                          */
431                         ItemPointerSetMin(&minItem);
432                         entry->list = GinDataLeafPageGetItems(page, &entry->nlist, minItem);
433
434                         entry->predictNumberResult = stack->predictNumber * entry->nlist;
435
436                         LockBuffer(entry->buffer, GIN_UNLOCK);
437                         freeGinBtreeStack(stack);
438                         entry->isFinished = false;
439                 }
440                 else
441                 {
442                         /*
443                          * Lock the entry leaf page.  This is more coarse-grained than
444                          * necessary, because it will conflict with any insertions that
445                          * land on the same leaf page, not only the exact key we searched
446                          * for.  But locking an individual tuple would require updating
447                          * that lock whenever it moves because of insertions or vacuums,
448                          * which seems too complicated.
449                          */
450                         PredicateLockPage(ginstate->index,
451                                                           BufferGetBlockNumber(stackEntry->buffer),
452                                                           snapshot);
453                         if (GinGetNPosting(itup) > 0)
454                         {
455                                 entry->list = ginReadTuple(ginstate, entry->attnum, itup,
456                                                                                    &entry->nlist);
457                                 entry->predictNumberResult = entry->nlist;
458
459                                 entry->isFinished = false;
460                         }
461                 }
462         }
463         else
464         {
465                 /*
466                  * No entry found.  Predicate lock the leaf page, to lock the place
467                  * where the entry would've been, had there been one.
468                  */
469                 PredicateLockPage(ginstate->index,
470                                                   BufferGetBlockNumber(stackEntry->buffer), snapshot);
471         }
472
473         if (needUnlock)
474                 LockBuffer(stackEntry->buffer, GIN_UNLOCK);
475         freeGinBtreeStack(stackEntry);
476 }
477
478 /*
479  * Comparison function for scan entry indexes. Sorts by predictNumberResult,
480  * least frequent items first.
481  */
482 static int
483 entryIndexByFrequencyCmp(const void *a1, const void *a2, void *arg)
484 {
485         const GinScanKey key = (const GinScanKey) arg;
486         int                     i1 = *(const int *) a1;
487         int                     i2 = *(const int *) a2;
488         uint32          n1 = key->scanEntry[i1]->predictNumberResult;
489         uint32          n2 = key->scanEntry[i2]->predictNumberResult;
490
491         if (n1 < n2)
492                 return -1;
493         else if (n1 == n2)
494                 return 0;
495         else
496                 return 1;
497 }
498
499 static void
500 startScanKey(GinState *ginstate, GinScanOpaque so, GinScanKey key)
501 {
502         MemoryContext oldCtx = CurrentMemoryContext;
503         int                     i;
504         int                     j;
505         int                *entryIndexes;
506
507         ItemPointerSetMin(&key->curItem);
508         key->curItemMatches = false;
509         key->recheckCurItem = false;
510         key->isFinished = false;
511
512         /*
513          * Divide the entries into two distinct sets: required and additional.
514          * Additional entries are not enough for a match alone, without any items
515          * from the required set, but are needed by the consistent function to
516          * decide if an item matches. When scanning, we can skip over items from
517          * additional entries that have no corresponding matches in any of the
518          * required entries. That speeds up queries like "frequent & rare"
519          * considerably, if the frequent term can be put in the additional set.
520          *
521          * There can be many legal ways to divide them entries into these two
522          * sets. A conservative division is to just put everything in the required
523          * set, but the more you can put in the additional set, the more you can
524          * skip during the scan. To maximize skipping, we try to put as many
525          * frequent items as possible into additional, and less frequent ones into
526          * required. To do that, sort the entries by frequency
527          * (predictNumberResult), and put entries into the required set in that
528          * order, until the consistent function says that none of the remaining
529          * entries can form a match, without any items from the required set. The
530          * rest go to the additional set.
531          */
532         if (key->nentries > 1)
533         {
534                 MemoryContextSwitchTo(so->tempCtx);
535
536                 entryIndexes = (int *) palloc(sizeof(int) * key->nentries);
537                 for (i = 0; i < key->nentries; i++)
538                         entryIndexes[i] = i;
539                 qsort_arg(entryIndexes, key->nentries, sizeof(int),
540                                   entryIndexByFrequencyCmp, key);
541
542                 for (i = 0; i < key->nentries - 1; i++)
543                 {
544                         /* Pass all entries <= i as FALSE, and the rest as MAYBE */
545                         for (j = 0; j <= i; j++)
546                                 key->entryRes[entryIndexes[j]] = GIN_FALSE;
547                         for (j = i + 1; j < key->nentries; j++)
548                                 key->entryRes[entryIndexes[j]] = GIN_MAYBE;
549
550                         if (key->triConsistentFn(key) == GIN_FALSE)
551                                 break;
552                 }
553                 /* i is now the last required entry. */
554
555                 MemoryContextSwitchTo(so->keyCtx);
556
557                 key->nrequired = i + 1;
558                 key->nadditional = key->nentries - key->nrequired;
559                 key->requiredEntries = palloc(key->nrequired * sizeof(GinScanEntry));
560                 key->additionalEntries = palloc(key->nadditional * sizeof(GinScanEntry));
561
562                 j = 0;
563                 for (i = 0; i < key->nrequired; i++)
564                         key->requiredEntries[i] = key->scanEntry[entryIndexes[j++]];
565                 for (i = 0; i < key->nadditional; i++)
566                         key->additionalEntries[i] = key->scanEntry[entryIndexes[j++]];
567
568                 /* clean up after consistentFn calls (also frees entryIndexes) */
569                 MemoryContextReset(so->tempCtx);
570         }
571         else
572         {
573                 MemoryContextSwitchTo(so->keyCtx);
574
575                 key->nrequired = 1;
576                 key->nadditional = 0;
577                 key->requiredEntries = palloc(1 * sizeof(GinScanEntry));
578                 key->requiredEntries[0] = key->scanEntry[0];
579         }
580         MemoryContextSwitchTo(oldCtx);
581 }
582
583 static void
584 startScan(IndexScanDesc scan)
585 {
586         GinScanOpaque so = (GinScanOpaque) scan->opaque;
587         GinState   *ginstate = &so->ginstate;
588         uint32          i;
589
590         for (i = 0; i < so->totalentries; i++)
591                 startScanEntry(ginstate, so->entries[i], scan->xs_snapshot);
592
593         if (GinFuzzySearchLimit > 0)
594         {
595                 /*
596                  * If all of keys more than threshold we will try to reduce result, we
597                  * hope (and only hope, for intersection operation of array our
598                  * supposition isn't true), that total result will not more than
599                  * minimal predictNumberResult.
600                  */
601                 bool            reduce = true;
602
603                 for (i = 0; i < so->totalentries; i++)
604                 {
605                         if (so->entries[i]->predictNumberResult <= so->totalentries * GinFuzzySearchLimit)
606                         {
607                                 reduce = false;
608                                 break;
609                         }
610                 }
611                 if (reduce)
612                 {
613                         for (i = 0; i < so->totalentries; i++)
614                         {
615                                 so->entries[i]->predictNumberResult /= so->totalentries;
616                                 so->entries[i]->reduceResult = true;
617                         }
618                 }
619         }
620
621         /*
622          * Now that we have the estimates for the entry frequencies, finish
623          * initializing the scan keys.
624          */
625         for (i = 0; i < so->nkeys; i++)
626                 startScanKey(ginstate, so, so->keys + i);
627 }
628
629 /*
630  * Load the next batch of item pointers from a posting tree.
631  *
632  * Note that we copy the page into GinScanEntry->list array and unlock it, but
633  * keep it pinned to prevent interference with vacuum.
634  */
635 static void
636 entryLoadMoreItems(GinState *ginstate, GinScanEntry entry,
637                                    ItemPointerData advancePast, Snapshot snapshot)
638 {
639         Page            page;
640         int                     i;
641         bool            stepright;
642
643         if (!BufferIsValid(entry->buffer))
644         {
645                 entry->isFinished = true;
646                 return;
647         }
648
649         /*
650          * We have two strategies for finding the correct page: step right from
651          * the current page, or descend the tree again from the root. If
652          * advancePast equals the current item, the next matching item should be
653          * on the next page, so we step right. Otherwise, descend from root.
654          */
655         if (ginCompareItemPointers(&entry->curItem, &advancePast) == 0)
656         {
657                 stepright = true;
658                 LockBuffer(entry->buffer, GIN_SHARE);
659         }
660         else
661         {
662                 GinBtreeStack *stack;
663
664                 ReleaseBuffer(entry->buffer);
665
666                 /*
667                  * Set the search key, and find the correct leaf page.
668                  */
669                 if (ItemPointerIsLossyPage(&advancePast))
670                 {
671                         ItemPointerSet(&entry->btree.itemptr,
672                                                    GinItemPointerGetBlockNumber(&advancePast) + 1,
673                                                    FirstOffsetNumber);
674                 }
675                 else
676                 {
677                         ItemPointerSet(&entry->btree.itemptr,
678                                                    GinItemPointerGetBlockNumber(&advancePast),
679                                                    OffsetNumberNext(GinItemPointerGetOffsetNumber(&advancePast)));
680                 }
681                 entry->btree.fullScan = false;
682                 stack = ginFindLeafPage(&entry->btree, true, false, snapshot);
683
684                 /* we don't need the stack, just the buffer. */
685                 entry->buffer = stack->buffer;
686                 IncrBufferRefCount(entry->buffer);
687                 freeGinBtreeStack(stack);
688                 stepright = false;
689         }
690
691         elog(DEBUG2, "entryLoadMoreItems, %u/%u, skip: %d",
692                  GinItemPointerGetBlockNumber(&advancePast),
693                  GinItemPointerGetOffsetNumber(&advancePast),
694                  !stepright);
695
696         page = BufferGetPage(entry->buffer);
697         for (;;)
698         {
699                 entry->offset = InvalidOffsetNumber;
700                 if (entry->list)
701                 {
702                         pfree(entry->list);
703                         entry->list = NULL;
704                         entry->nlist = 0;
705                 }
706
707                 if (stepright)
708                 {
709                         /*
710                          * We've processed all the entries on this page. If it was the
711                          * last page in the tree, we're done.
712                          */
713                         if (GinPageRightMost(page))
714                         {
715                                 UnlockReleaseBuffer(entry->buffer);
716                                 entry->buffer = InvalidBuffer;
717                                 entry->isFinished = true;
718                                 return;
719                         }
720
721                         /*
722                          * Step to next page, following the right link. then find the
723                          * first ItemPointer greater than advancePast.
724                          */
725                         entry->buffer = ginStepRight(entry->buffer,
726                                                                                  ginstate->index,
727                                                                                  GIN_SHARE);
728                         page = BufferGetPage(entry->buffer);
729                 }
730                 stepright = true;
731
732                 if (GinPageGetOpaque(page)->flags & GIN_DELETED)
733                         continue;                       /* page was deleted by concurrent vacuum */
734
735                 /*
736                  * The first item > advancePast might not be on this page, but
737                  * somewhere to the right, if the page was split, or a non-match from
738                  * another key in the query allowed us to skip some items from this
739                  * entry. Keep following the right-links until we re-find the correct
740                  * page.
741                  */
742                 if (!GinPageRightMost(page) &&
743                         ginCompareItemPointers(&advancePast, GinDataPageGetRightBound(page)) >= 0)
744                 {
745                         /*
746                          * the item we're looking is > the right bound of the page, so it
747                          * can't be on this page.
748                          */
749                         continue;
750                 }
751
752                 entry->list = GinDataLeafPageGetItems(page, &entry->nlist, advancePast);
753
754                 for (i = 0; i < entry->nlist; i++)
755                 {
756                         if (ginCompareItemPointers(&advancePast, &entry->list[i]) < 0)
757                         {
758                                 entry->offset = i;
759
760                                 if (GinPageRightMost(page))
761                                 {
762                                         /* after processing the copied items, we're done. */
763                                         UnlockReleaseBuffer(entry->buffer);
764                                         entry->buffer = InvalidBuffer;
765                                 }
766                                 else
767                                         LockBuffer(entry->buffer, GIN_UNLOCK);
768                                 return;
769                         }
770                 }
771         }
772 }
773
774 #define gin_rand() (((double) random()) / ((double) MAX_RANDOM_VALUE))
775 #define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) )
776
777 /*
778  * Sets entry->curItem to next heap item pointer > advancePast, for one entry
779  * of one scan key, or sets entry->isFinished to true if there are no more.
780  *
781  * Item pointers are returned in ascending order.
782  *
783  * Note: this can return a "lossy page" item pointer, indicating that the
784  * entry potentially matches all items on that heap page.  However, it is
785  * not allowed to return both a lossy page pointer and exact (regular)
786  * item pointers for the same page.  (Doing so would break the key-combination
787  * logic in keyGetItem and scanGetItem; see comment in scanGetItem.)  In the
788  * current implementation this is guaranteed by the behavior of tidbitmaps.
789  */
790 static void
791 entryGetItem(GinState *ginstate, GinScanEntry entry,
792                          ItemPointerData advancePast, Snapshot snapshot)
793 {
794         Assert(!entry->isFinished);
795
796         Assert(!ItemPointerIsValid(&entry->curItem) ||
797                    ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
798
799         if (entry->matchBitmap)
800         {
801                 /* A bitmap result */
802                 BlockNumber advancePastBlk = GinItemPointerGetBlockNumber(&advancePast);
803                 OffsetNumber advancePastOff = GinItemPointerGetOffsetNumber(&advancePast);
804                 bool            gotitem = false;
805
806                 do
807                 {
808                         /*
809                          * If we've exhausted all items on this block, move to next block
810                          * in the bitmap.
811                          */
812                         while (entry->matchResult == NULL ||
813                                    (entry->matchResult->ntuples >= 0 &&
814                                         entry->offset >= entry->matchResult->ntuples) ||
815                                    entry->matchResult->blockno < advancePastBlk ||
816                                    (ItemPointerIsLossyPage(&advancePast) &&
817                                         entry->matchResult->blockno == advancePastBlk))
818                         {
819                                 entry->matchResult = tbm_iterate(entry->matchIterator);
820
821                                 if (entry->matchResult == NULL)
822                                 {
823                                         ItemPointerSetInvalid(&entry->curItem);
824                                         tbm_end_iterate(entry->matchIterator);
825                                         entry->matchIterator = NULL;
826                                         entry->isFinished = true;
827                                         break;
828                                 }
829
830                                 /*
831                                  * Reset counter to the beginning of entry->matchResult. Note:
832                                  * entry->offset is still greater than matchResult->ntuples if
833                                  * matchResult is lossy.  So, on next call we will get next
834                                  * result from TIDBitmap.
835                                  */
836                                 entry->offset = 0;
837                         }
838                         if (entry->isFinished)
839                                 break;
840
841                         /*
842                          * We're now on the first page after advancePast which has any
843                          * items on it. If it's a lossy result, return that.
844                          */
845                         if (entry->matchResult->ntuples < 0)
846                         {
847                                 ItemPointerSetLossyPage(&entry->curItem,
848                                                                                 entry->matchResult->blockno);
849
850                                 /*
851                                  * We might as well fall out of the loop; we could not
852                                  * estimate number of results on this page to support correct
853                                  * reducing of result even if it's enabled.
854                                  */
855                                 gotitem = true;
856                                 break;
857                         }
858
859                         /*
860                          * Not a lossy page. Skip over any offsets <= advancePast, and
861                          * return that.
862                          */
863                         if (entry->matchResult->blockno == advancePastBlk)
864                         {
865                                 /*
866                                  * First, do a quick check against the last offset on the
867                                  * page. If that's > advancePast, so are all the other
868                                  * offsets.
869                                  */
870                                 if (entry->matchResult->offsets[entry->matchResult->ntuples - 1] <= advancePastOff)
871                                 {
872                                         entry->offset = entry->matchResult->ntuples;
873                                         continue;
874                                 }
875
876                                 /* Otherwise scan to find the first item > advancePast */
877                                 while (entry->matchResult->offsets[entry->offset] <= advancePastOff)
878                                         entry->offset++;
879                         }
880
881                         ItemPointerSet(&entry->curItem,
882                                                    entry->matchResult->blockno,
883                                                    entry->matchResult->offsets[entry->offset]);
884                         entry->offset++;
885                         gotitem = true;
886                 } while (!gotitem || (entry->reduceResult == true && dropItem(entry)));
887         }
888         else if (!BufferIsValid(entry->buffer))
889         {
890                 /*
891                  * A posting list from an entry tuple, or the last page of a posting
892                  * tree.
893                  */
894                 do
895                 {
896                         if (entry->offset >= entry->nlist)
897                         {
898                                 ItemPointerSetInvalid(&entry->curItem);
899                                 entry->isFinished = true;
900                                 break;
901                         }
902
903                         entry->curItem = entry->list[entry->offset++];
904                 } while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
905                 /* XXX: shouldn't we apply the fuzzy search limit here? */
906         }
907         else
908         {
909                 /* A posting tree */
910                 do
911                 {
912                         /* If we've processed the current batch, load more items */
913                         while (entry->offset >= entry->nlist)
914                         {
915                                 entryLoadMoreItems(ginstate, entry, advancePast, snapshot);
916
917                                 if (entry->isFinished)
918                                 {
919                                         ItemPointerSetInvalid(&entry->curItem);
920                                         return;
921                                 }
922                         }
923
924                         entry->curItem = entry->list[entry->offset++];
925
926                 } while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0 ||
927                                  (entry->reduceResult == true && dropItem(entry)));
928         }
929 }
930
931 /*
932  * Identify the "current" item among the input entry streams for this scan key
933  * that is greater than advancePast, and test whether it passes the scan key
934  * qual condition.
935  *
936  * The current item is the smallest curItem among the inputs.  key->curItem
937  * is set to that value.  key->curItemMatches is set to indicate whether that
938  * TID passes the consistentFn test.  If so, key->recheckCurItem is set true
939  * iff recheck is needed for this item pointer (including the case where the
940  * item pointer is a lossy page pointer).
941  *
942  * If all entry streams are exhausted, sets key->isFinished to true.
943  *
944  * Item pointers must be returned in ascending order.
945  *
946  * Note: this can return a "lossy page" item pointer, indicating that the
947  * key potentially matches all items on that heap page.  However, it is
948  * not allowed to return both a lossy page pointer and exact (regular)
949  * item pointers for the same page.  (Doing so would break the key-combination
950  * logic in scanGetItem.)
951  */
952 static void
953 keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key,
954                    ItemPointerData advancePast, Snapshot snapshot)
955 {
956         ItemPointerData minItem;
957         ItemPointerData curPageLossy;
958         uint32          i;
959         bool            haveLossyEntry;
960         GinScanEntry entry;
961         GinTernaryValue res;
962         MemoryContext oldCtx;
963         bool            allFinished;
964
965         Assert(!key->isFinished);
966
967         /*
968          * We might have already tested this item; if so, no need to repeat work.
969          * (Note: the ">" case can happen, if advancePast is exact but we
970          * previously had to set curItem to a lossy-page pointer.)
971          */
972         if (ginCompareItemPointers(&key->curItem, &advancePast) > 0)
973                 return;
974
975         /*
976          * Find the minimum item > advancePast among the active entry streams.
977          *
978          * Note: a lossy-page entry is encoded by a ItemPointer with max value for
979          * offset (0xffff), so that it will sort after any exact entries for the
980          * same page.  So we'll prefer to return exact pointers not lossy
981          * pointers, which is good.
982          */
983         ItemPointerSetMax(&minItem);
984         allFinished = true;
985         for (i = 0; i < key->nrequired; i++)
986         {
987                 entry = key->requiredEntries[i];
988
989                 if (entry->isFinished)
990                         continue;
991
992                 /*
993                  * Advance this stream if necessary.
994                  *
995                  * In particular, since entry->curItem was initialized with
996                  * ItemPointerSetMin, this ensures we fetch the first item for each
997                  * entry on the first call.
998                  */
999                 if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
1000                 {
1001                         entryGetItem(ginstate, entry, advancePast, snapshot);
1002                         if (entry->isFinished)
1003                                 continue;
1004                 }
1005
1006                 allFinished = false;
1007                 if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
1008                         minItem = entry->curItem;
1009         }
1010
1011         if (allFinished)
1012         {
1013                 /* all entries are finished */
1014                 key->isFinished = true;
1015                 return;
1016         }
1017
1018         /*
1019          * Ok, we now know that there are no matches < minItem.
1020          *
1021          * If minItem is lossy, it means that there were no exact items on the
1022          * page among requiredEntries, because lossy pointers sort after exact
1023          * items. However, there might be exact items for the same page among
1024          * additionalEntries, so we mustn't advance past them.
1025          */
1026         if (ItemPointerIsLossyPage(&minItem))
1027         {
1028                 if (GinItemPointerGetBlockNumber(&advancePast) <
1029                         GinItemPointerGetBlockNumber(&minItem))
1030                 {
1031                         ItemPointerSet(&advancePast,
1032                                                    GinItemPointerGetBlockNumber(&minItem),
1033                                                    InvalidOffsetNumber);
1034                 }
1035         }
1036         else
1037         {
1038                 Assert(GinItemPointerGetOffsetNumber(&minItem) > 0);
1039                 ItemPointerSet(&advancePast,
1040                                            GinItemPointerGetBlockNumber(&minItem),
1041                                            OffsetNumberPrev(GinItemPointerGetOffsetNumber(&minItem)));
1042         }
1043
1044         /*
1045          * We might not have loaded all the entry streams for this TID yet. We
1046          * could call the consistent function, passing MAYBE for those entries, to
1047          * see if it can decide if this TID matches based on the information we
1048          * have. But if the consistent-function is expensive, and cannot in fact
1049          * decide with partial information, that could be a big loss. So, load all
1050          * the additional entries, before calling the consistent function.
1051          */
1052         for (i = 0; i < key->nadditional; i++)
1053         {
1054                 entry = key->additionalEntries[i];
1055
1056                 if (entry->isFinished)
1057                         continue;
1058
1059                 if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
1060                 {
1061                         entryGetItem(ginstate, entry, advancePast, snapshot);
1062                         if (entry->isFinished)
1063                                 continue;
1064                 }
1065
1066                 /*
1067                  * Normally, none of the items in additionalEntries can have a curItem
1068                  * larger than minItem. But if minItem is a lossy page, then there
1069                  * might be exact items on the same page among additionalEntries.
1070                  */
1071                 if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
1072                 {
1073                         Assert(ItemPointerIsLossyPage(&minItem));
1074                         minItem = entry->curItem;
1075                 }
1076         }
1077
1078         /*
1079          * Ok, we've advanced all the entries up to minItem now. Set key->curItem,
1080          * and perform consistentFn test.
1081          *
1082          * Lossy-page entries pose a problem, since we don't know the correct
1083          * entryRes state to pass to the consistentFn, and we also don't know what
1084          * its combining logic will be (could be AND, OR, or even NOT). If the
1085          * logic is OR then the consistentFn might succeed for all items in the
1086          * lossy page even when none of the other entries match.
1087          *
1088          * Our strategy is to call the tri-state consistent function, with the
1089          * lossy-page entries set to MAYBE, and all the other entries FALSE. If it
1090          * returns FALSE, none of the lossy items alone are enough for a match, so
1091          * we don't need to return a lossy-page pointer. Otherwise, return a
1092          * lossy-page pointer to indicate that the whole heap page must be
1093          * checked.  (On subsequent calls, we'll do nothing until minItem is past
1094          * the page altogether, thus ensuring that we never return both regular
1095          * and lossy pointers for the same page.)
1096          *
1097          * An exception is that it doesn't matter what we pass for lossy pointers
1098          * in "hidden" entries, because the consistentFn's result can't depend on
1099          * them. We could pass them as MAYBE as well, but if we're using the
1100          * "shim" implementation of a tri-state consistent function (see
1101          * ginlogic.c), it's better to pass as few MAYBEs as possible. So pass
1102          * them as true.
1103          *
1104          * Note that only lossy-page entries pointing to the current item's page
1105          * should trigger this processing; we might have future lossy pages in the
1106          * entry array, but they aren't relevant yet.
1107          */
1108         key->curItem = minItem;
1109         ItemPointerSetLossyPage(&curPageLossy,
1110                                                         GinItemPointerGetBlockNumber(&key->curItem));
1111         haveLossyEntry = false;
1112         for (i = 0; i < key->nentries; i++)
1113         {
1114                 entry = key->scanEntry[i];
1115                 if (entry->isFinished == false &&
1116                         ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
1117                 {
1118                         if (i < key->nuserentries)
1119                                 key->entryRes[i] = GIN_MAYBE;
1120                         else
1121                                 key->entryRes[i] = GIN_TRUE;
1122                         haveLossyEntry = true;
1123                 }
1124                 else
1125                         key->entryRes[i] = GIN_FALSE;
1126         }
1127
1128         /* prepare for calling consistentFn in temp context */
1129         oldCtx = MemoryContextSwitchTo(tempCtx);
1130
1131         if (haveLossyEntry)
1132         {
1133                 /* Have lossy-page entries, so see if whole page matches */
1134                 res = key->triConsistentFn(key);
1135
1136                 if (res == GIN_TRUE || res == GIN_MAYBE)
1137                 {
1138                         /* Yes, so clean up ... */
1139                         MemoryContextSwitchTo(oldCtx);
1140                         MemoryContextReset(tempCtx);
1141
1142                         /* and return lossy pointer for whole page */
1143                         key->curItem = curPageLossy;
1144                         key->curItemMatches = true;
1145                         key->recheckCurItem = true;
1146                         return;
1147                 }
1148         }
1149
1150         /*
1151          * At this point we know that we don't need to return a lossy whole-page
1152          * pointer, but we might have matches for individual exact item pointers,
1153          * possibly in combination with a lossy pointer. Pass lossy pointers as
1154          * MAYBE to the ternary consistent function, to let it decide if this
1155          * tuple satisfies the overall key, even though we don't know if the lossy
1156          * entries match.
1157          *
1158          * Prepare entryRes array to be passed to consistentFn.
1159          */
1160         for (i = 0; i < key->nentries; i++)
1161         {
1162                 entry = key->scanEntry[i];
1163                 if (entry->isFinished)
1164                         key->entryRes[i] = GIN_FALSE;
1165 #if 0
1166
1167                 /*
1168                  * This case can't currently happen, because we loaded all the entries
1169                  * for this item earlier.
1170                  */
1171                 else if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
1172                         key->entryRes[i] = GIN_MAYBE;
1173 #endif
1174                 else if (ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
1175                         key->entryRes[i] = GIN_MAYBE;
1176                 else if (ginCompareItemPointers(&entry->curItem, &minItem) == 0)
1177                         key->entryRes[i] = GIN_TRUE;
1178                 else
1179                         key->entryRes[i] = GIN_FALSE;
1180         }
1181
1182         res = key->triConsistentFn(key);
1183
1184         switch (res)
1185         {
1186                 case GIN_TRUE:
1187                         key->curItemMatches = true;
1188                         /* triConsistentFn set recheckCurItem */
1189                         break;
1190
1191                 case GIN_FALSE:
1192                         key->curItemMatches = false;
1193                         break;
1194
1195                 case GIN_MAYBE:
1196                         key->curItemMatches = true;
1197                         key->recheckCurItem = true;
1198                         break;
1199
1200                 default:
1201
1202                         /*
1203                          * the 'default' case shouldn't happen, but if the consistent
1204                          * function returns something bogus, this is the safe result
1205                          */
1206                         key->curItemMatches = true;
1207                         key->recheckCurItem = true;
1208                         break;
1209         }
1210
1211         /*
1212          * We have a tuple, and we know if it matches or not. If it's a non-match,
1213          * we could continue to find the next matching tuple, but let's break out
1214          * and give scanGetItem a chance to advance the other keys. They might be
1215          * able to skip past to a much higher TID, allowing us to save work.
1216          */
1217
1218         /* clean up after consistentFn calls */
1219         MemoryContextSwitchTo(oldCtx);
1220         MemoryContextReset(tempCtx);
1221 }
1222
1223 /*
1224  * Get next heap item pointer (after advancePast) from scan.
1225  * Returns true if anything found.
1226  * On success, *item and *recheck are set.
1227  *
1228  * Note: this is very nearly the same logic as in keyGetItem(), except
1229  * that we know the keys are to be combined with AND logic, whereas in
1230  * keyGetItem() the combination logic is known only to the consistentFn.
1231  */
1232 static bool
1233 scanGetItem(IndexScanDesc scan, ItemPointerData advancePast,
1234                         ItemPointerData *item, bool *recheck)
1235 {
1236         GinScanOpaque so = (GinScanOpaque) scan->opaque;
1237         uint32          i;
1238         bool            match;
1239
1240         /*----------
1241          * Advance the scan keys in lock-step, until we find an item that matches
1242          * all the keys. If any key reports isFinished, meaning its subset of the
1243          * entries is exhausted, we can stop.  Otherwise, set *item to the next
1244          * matching item.
1245          *
1246          * This logic works only if a keyGetItem stream can never contain both
1247          * exact and lossy pointers for the same page.  Else we could have a
1248          * case like
1249          *
1250          *              stream 1                stream 2
1251          *              ...             ...
1252          *              42/6                    42/7
1253          *              50/1                    42/0xffff
1254          *              ...             ...
1255          *
1256          * We would conclude that 42/6 is not a match and advance stream 1,
1257          * thus never detecting the match to the lossy pointer in stream 2.
1258          * (keyGetItem has a similar problem versus entryGetItem.)
1259          *----------
1260          */
1261         do
1262         {
1263                 ItemPointerSetMin(item);
1264                 match = true;
1265                 for (i = 0; i < so->nkeys && match; i++)
1266                 {
1267                         GinScanKey      key = so->keys + i;
1268
1269                         /* Fetch the next item for this key that is > advancePast. */
1270                         keyGetItem(&so->ginstate, so->tempCtx, key, advancePast,
1271                                            scan->xs_snapshot);
1272
1273                         if (key->isFinished)
1274                                 return false;
1275
1276                         /*
1277                          * If it's not a match, we can immediately conclude that nothing
1278                          * <= this item matches, without checking the rest of the keys.
1279                          */
1280                         if (!key->curItemMatches)
1281                         {
1282                                 advancePast = key->curItem;
1283                                 match = false;
1284                                 break;
1285                         }
1286
1287                         /*
1288                          * It's a match. We can conclude that nothing < matches, so the
1289                          * other key streams can skip to this item.
1290                          *
1291                          * Beware of lossy pointers, though; from a lossy pointer, we can
1292                          * only conclude that nothing smaller than this *block* matches.
1293                          */
1294                         if (ItemPointerIsLossyPage(&key->curItem))
1295                         {
1296                                 if (GinItemPointerGetBlockNumber(&advancePast) <
1297                                         GinItemPointerGetBlockNumber(&key->curItem))
1298                                 {
1299                                         ItemPointerSet(&advancePast,
1300                                                                    GinItemPointerGetBlockNumber(&key->curItem),
1301                                                                    InvalidOffsetNumber);
1302                                 }
1303                         }
1304                         else
1305                         {
1306                                 Assert(GinItemPointerGetOffsetNumber(&key->curItem) > 0);
1307                                 ItemPointerSet(&advancePast,
1308                                                            GinItemPointerGetBlockNumber(&key->curItem),
1309                                                            OffsetNumberPrev(GinItemPointerGetOffsetNumber(&key->curItem)));
1310                         }
1311
1312                         /*
1313                          * If this is the first key, remember this location as a potential
1314                          * match, and proceed to check the rest of the keys.
1315                          *
1316                          * Otherwise, check if this is the same item that we checked the
1317                          * previous keys for (or a lossy pointer for the same page). If
1318                          * not, loop back to check the previous keys for this item (we
1319                          * will check this key again too, but keyGetItem returns quickly
1320                          * for that)
1321                          */
1322                         if (i == 0)
1323                         {
1324                                 *item = key->curItem;
1325                         }
1326                         else
1327                         {
1328                                 if (ItemPointerIsLossyPage(&key->curItem) ||
1329                                         ItemPointerIsLossyPage(item))
1330                                 {
1331                                         Assert(GinItemPointerGetBlockNumber(&key->curItem) >= GinItemPointerGetBlockNumber(item));
1332                                         match = (GinItemPointerGetBlockNumber(&key->curItem) ==
1333                                                          GinItemPointerGetBlockNumber(item));
1334                                 }
1335                                 else
1336                                 {
1337                                         Assert(ginCompareItemPointers(&key->curItem, item) >= 0);
1338                                         match = (ginCompareItemPointers(&key->curItem, item) == 0);
1339                                 }
1340                         }
1341                 }
1342         } while (!match);
1343
1344         Assert(!ItemPointerIsMin(item));
1345
1346         /*
1347          * Now *item contains the first ItemPointer after previous result that
1348          * satisfied all the keys for that exact TID, or a lossy reference to the
1349          * same page.
1350          *
1351          * We must return recheck = true if any of the keys are marked recheck.
1352          */
1353         *recheck = false;
1354         for (i = 0; i < so->nkeys; i++)
1355         {
1356                 GinScanKey      key = so->keys + i;
1357
1358                 if (key->recheckCurItem)
1359                 {
1360                         *recheck = true;
1361                         break;
1362                 }
1363         }
1364
1365         return true;
1366 }
1367
1368
1369 /*
1370  * Functions for scanning the pending list
1371  */
1372
1373
1374 /*
1375  * Get ItemPointer of next heap row to be checked from pending list.
1376  * Returns false if there are no more. On pages with several heap rows
1377  * it returns each row separately, on page with part of heap row returns
1378  * per page data.  pos->firstOffset and pos->lastOffset are set to identify
1379  * the range of pending-list tuples belonging to this heap row.
1380  *
1381  * The pendingBuffer is presumed pinned and share-locked on entry, and is
1382  * pinned and share-locked on success exit.  On failure exit it's released.
1383  */
1384 static bool
1385 scanGetCandidate(IndexScanDesc scan, pendingPosition *pos)
1386 {
1387         OffsetNumber maxoff;
1388         Page            page;
1389         IndexTuple      itup;
1390
1391         ItemPointerSetInvalid(&pos->item);
1392         for (;;)
1393         {
1394                 page = BufferGetPage(pos->pendingBuffer);
1395                 TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
1396
1397                 maxoff = PageGetMaxOffsetNumber(page);
1398                 if (pos->firstOffset > maxoff)
1399                 {
1400                         BlockNumber blkno = GinPageGetOpaque(page)->rightlink;
1401
1402                         if (blkno == InvalidBlockNumber)
1403                         {
1404                                 UnlockReleaseBuffer(pos->pendingBuffer);
1405                                 pos->pendingBuffer = InvalidBuffer;
1406
1407                                 return false;
1408                         }
1409                         else
1410                         {
1411                                 /*
1412                                  * Here we must prevent deletion of next page by insertcleanup
1413                                  * process, which may be trying to obtain exclusive lock on
1414                                  * current page.  So, we lock next page before releasing the
1415                                  * current one
1416                                  */
1417                                 Buffer          tmpbuf = ReadBuffer(scan->indexRelation, blkno);
1418
1419                                 LockBuffer(tmpbuf, GIN_SHARE);
1420                                 UnlockReleaseBuffer(pos->pendingBuffer);
1421
1422                                 pos->pendingBuffer = tmpbuf;
1423                                 pos->firstOffset = FirstOffsetNumber;
1424                         }
1425                 }
1426                 else
1427                 {
1428                         itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->firstOffset));
1429                         pos->item = itup->t_tid;
1430                         if (GinPageHasFullRow(page))
1431                         {
1432                                 /*
1433                                  * find itempointer to the next row
1434                                  */
1435                                 for (pos->lastOffset = pos->firstOffset + 1; pos->lastOffset <= maxoff; pos->lastOffset++)
1436                                 {
1437                                         itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->lastOffset));
1438                                         if (!ItemPointerEquals(&pos->item, &itup->t_tid))
1439                                                 break;
1440                                 }
1441                         }
1442                         else
1443                         {
1444                                 /*
1445                                  * All itempointers are the same on this page
1446                                  */
1447                                 pos->lastOffset = maxoff + 1;
1448                         }
1449
1450                         /*
1451                          * Now pos->firstOffset points to the first tuple of current heap
1452                          * row, pos->lastOffset points to the first tuple of next heap row
1453                          * (or to the end of page)
1454                          */
1455                         break;
1456                 }
1457         }
1458
1459         return true;
1460 }
1461
1462 /*
1463  * Scan pending-list page from current tuple (off) up till the first of:
1464  * - match is found (then returns true)
1465  * - no later match is possible
1466  * - tuple's attribute number is not equal to entry's attrnum
1467  * - reach end of page
1468  *
1469  * datum[]/category[]/datumExtracted[] arrays are used to cache the results
1470  * of gintuple_get_key() on the current page.
1471  */
1472 static bool
1473 matchPartialInPendingList(GinState *ginstate, Page page,
1474                                                   OffsetNumber off, OffsetNumber maxoff,
1475                                                   GinScanEntry entry,
1476                                                   Datum *datum, GinNullCategory *category,
1477                                                   bool *datumExtracted)
1478 {
1479         IndexTuple      itup;
1480         int32           cmp;
1481
1482         /* Partial match to a null is not possible */
1483         if (entry->queryCategory != GIN_CAT_NORM_KEY)
1484                 return false;
1485
1486         while (off < maxoff)
1487         {
1488                 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
1489
1490                 if (gintuple_get_attrnum(ginstate, itup) != entry->attnum)
1491                         return false;
1492
1493                 if (datumExtracted[off - 1] == false)
1494                 {
1495                         datum[off - 1] = gintuple_get_key(ginstate, itup,
1496                                                                                           &category[off - 1]);
1497                         datumExtracted[off - 1] = true;
1498                 }
1499
1500                 /* Once we hit nulls, no further match is possible */
1501                 if (category[off - 1] != GIN_CAT_NORM_KEY)
1502                         return false;
1503
1504                 /*----------
1505                  * Check partial match.
1506                  * case cmp == 0 => match
1507                  * case cmp > 0 => not match and end scan (no later match possible)
1508                  * case cmp < 0 => not match and continue scan
1509                  *----------
1510                  */
1511                 cmp = DatumGetInt32(FunctionCall4Coll(&ginstate->comparePartialFn[entry->attnum - 1],
1512                                                                                           ginstate->supportCollation[entry->attnum - 1],
1513                                                                                           entry->queryKey,
1514                                                                                           datum[off - 1],
1515                                                                                           UInt16GetDatum(entry->strategy),
1516                                                                                           PointerGetDatum(entry->extra_data)));
1517                 if (cmp == 0)
1518                         return true;
1519                 else if (cmp > 0)
1520                         return false;
1521
1522                 off++;
1523         }
1524
1525         return false;
1526 }
1527
1528 /*
1529  * Set up the entryRes array for each key by looking at
1530  * every entry for current heap row in pending list.
1531  *
1532  * Returns true if each scan key has at least one entryRes match.
1533  * This corresponds to the situations where the normal index search will
1534  * try to apply the key's consistentFn.  (A tuple not meeting that requirement
1535  * cannot be returned by the normal search since no entry stream will
1536  * source its TID.)
1537  *
1538  * The pendingBuffer is presumed pinned and share-locked on entry.
1539  */
1540 static bool
1541 collectMatchesForHeapRow(IndexScanDesc scan, pendingPosition *pos)
1542 {
1543         GinScanOpaque so = (GinScanOpaque) scan->opaque;
1544         OffsetNumber attrnum;
1545         Page            page;
1546         IndexTuple      itup;
1547         int                     i,
1548                                 j;
1549
1550         /*
1551          * Reset all entryRes and hasMatchKey flags
1552          */
1553         for (i = 0; i < so->nkeys; i++)
1554         {
1555                 GinScanKey      key = so->keys + i;
1556
1557                 memset(key->entryRes, GIN_FALSE, key->nentries);
1558         }
1559         memset(pos->hasMatchKey, false, so->nkeys);
1560
1561         /*
1562          * Outer loop iterates over multiple pending-list pages when a single heap
1563          * row has entries spanning those pages.
1564          */
1565         for (;;)
1566         {
1567                 Datum           datum[BLCKSZ / sizeof(IndexTupleData)];
1568                 GinNullCategory category[BLCKSZ / sizeof(IndexTupleData)];
1569                 bool            datumExtracted[BLCKSZ / sizeof(IndexTupleData)];
1570
1571                 Assert(pos->lastOffset > pos->firstOffset);
1572                 memset(datumExtracted + pos->firstOffset - 1, 0,
1573                            sizeof(bool) * (pos->lastOffset - pos->firstOffset));
1574
1575                 page = BufferGetPage(pos->pendingBuffer);
1576                 TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
1577
1578                 for (i = 0; i < so->nkeys; i++)
1579                 {
1580                         GinScanKey      key = so->keys + i;
1581
1582                         for (j = 0; j < key->nentries; j++)
1583                         {
1584                                 GinScanEntry entry = key->scanEntry[j];
1585                                 OffsetNumber StopLow = pos->firstOffset,
1586                                                         StopHigh = pos->lastOffset,
1587                                                         StopMiddle;
1588
1589                                 /* If already matched on earlier page, do no extra work */
1590                                 if (key->entryRes[j])
1591                                         continue;
1592
1593                                 /*
1594                                  * Interesting tuples are from pos->firstOffset to
1595                                  * pos->lastOffset and they are ordered by (attnum, Datum) as
1596                                  * it's done in entry tree.  So we can use binary search to
1597                                  * avoid linear scanning.
1598                                  */
1599                                 while (StopLow < StopHigh)
1600                                 {
1601                                         int                     res;
1602
1603                                         StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
1604
1605                                         itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, StopMiddle));
1606
1607                                         attrnum = gintuple_get_attrnum(&so->ginstate, itup);
1608
1609                                         if (key->attnum < attrnum)
1610                                         {
1611                                                 StopHigh = StopMiddle;
1612                                                 continue;
1613                                         }
1614                                         if (key->attnum > attrnum)
1615                                         {
1616                                                 StopLow = StopMiddle + 1;
1617                                                 continue;
1618                                         }
1619
1620                                         if (datumExtracted[StopMiddle - 1] == false)
1621                                         {
1622                                                 datum[StopMiddle - 1] =
1623                                                         gintuple_get_key(&so->ginstate, itup,
1624                                                                                          &category[StopMiddle - 1]);
1625                                                 datumExtracted[StopMiddle - 1] = true;
1626                                         }
1627
1628                                         if (entry->queryCategory == GIN_CAT_EMPTY_QUERY)
1629                                         {
1630                                                 /* special behavior depending on searchMode */
1631                                                 if (entry->searchMode == GIN_SEARCH_MODE_ALL)
1632                                                 {
1633                                                         /* match anything except NULL_ITEM */
1634                                                         if (category[StopMiddle - 1] == GIN_CAT_NULL_ITEM)
1635                                                                 res = -1;
1636                                                         else
1637                                                                 res = 0;
1638                                                 }
1639                                                 else
1640                                                 {
1641                                                         /* match everything */
1642                                                         res = 0;
1643                                                 }
1644                                         }
1645                                         else
1646                                         {
1647                                                 res = ginCompareEntries(&so->ginstate,
1648                                                                                                 entry->attnum,
1649                                                                                                 entry->queryKey,
1650                                                                                                 entry->queryCategory,
1651                                                                                                 datum[StopMiddle - 1],
1652                                                                                                 category[StopMiddle - 1]);
1653                                         }
1654
1655                                         if (res == 0)
1656                                         {
1657                                                 /*
1658                                                  * Found exact match (there can be only one, except in
1659                                                  * EMPTY_QUERY mode).
1660                                                  *
1661                                                  * If doing partial match, scan forward from here to
1662                                                  * end of page to check for matches.
1663                                                  *
1664                                                  * See comment above about tuple's ordering.
1665                                                  */
1666                                                 if (entry->isPartialMatch)
1667                                                         key->entryRes[j] =
1668                                                                 matchPartialInPendingList(&so->ginstate,
1669                                                                                                                   page,
1670                                                                                                                   StopMiddle,
1671                                                                                                                   pos->lastOffset,
1672                                                                                                                   entry,
1673                                                                                                                   datum,
1674                                                                                                                   category,
1675                                                                                                                   datumExtracted);
1676                                                 else
1677                                                         key->entryRes[j] = true;
1678
1679                                                 /* done with binary search */
1680                                                 break;
1681                                         }
1682                                         else if (res < 0)
1683                                                 StopHigh = StopMiddle;
1684                                         else
1685                                                 StopLow = StopMiddle + 1;
1686                                 }
1687
1688                                 if (StopLow >= StopHigh && entry->isPartialMatch)
1689                                 {
1690                                         /*
1691                                          * No exact match on this page.  If doing partial match,
1692                                          * scan from the first tuple greater than target value to
1693                                          * end of page.  Note that since we don't remember whether
1694                                          * the comparePartialFn told us to stop early on a
1695                                          * previous page, we will uselessly apply comparePartialFn
1696                                          * to the first tuple on each subsequent page.
1697                                          */
1698                                         key->entryRes[j] =
1699                                                 matchPartialInPendingList(&so->ginstate,
1700                                                                                                   page,
1701                                                                                                   StopHigh,
1702                                                                                                   pos->lastOffset,
1703                                                                                                   entry,
1704                                                                                                   datum,
1705                                                                                                   category,
1706                                                                                                   datumExtracted);
1707                                 }
1708
1709                                 pos->hasMatchKey[i] |= key->entryRes[j];
1710                         }
1711                 }
1712
1713                 /* Advance firstOffset over the scanned tuples */
1714                 pos->firstOffset = pos->lastOffset;
1715
1716                 if (GinPageHasFullRow(page))
1717                 {
1718                         /*
1719                          * We have examined all pending entries for the current heap row.
1720                          * Break out of loop over pages.
1721                          */
1722                         break;
1723                 }
1724                 else
1725                 {
1726                         /*
1727                          * Advance to next page of pending entries for the current heap
1728                          * row.  Complain if there isn't one.
1729                          */
1730                         ItemPointerData item = pos->item;
1731
1732                         if (scanGetCandidate(scan, pos) == false ||
1733                                 !ItemPointerEquals(&pos->item, &item))
1734                                 elog(ERROR, "could not find additional pending pages for same heap tuple");
1735                 }
1736         }
1737
1738         /*
1739          * Now return "true" if all scan keys have at least one matching datum
1740          */
1741         for (i = 0; i < so->nkeys; i++)
1742         {
1743                 if (pos->hasMatchKey[i] == false)
1744                         return false;
1745         }
1746
1747         return true;
1748 }
1749
1750 /*
1751  * Collect all matched rows from pending list into bitmap.
1752  */
1753 static void
1754 scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
1755 {
1756         GinScanOpaque so = (GinScanOpaque) scan->opaque;
1757         MemoryContext oldCtx;
1758         bool            recheck,
1759                                 match;
1760         int                     i;
1761         pendingPosition pos;
1762         Buffer          metabuffer = ReadBuffer(scan->indexRelation, GIN_METAPAGE_BLKNO);
1763         Page            page;
1764         BlockNumber blkno;
1765
1766         *ntids = 0;
1767
1768         /*
1769          * Acquire predicate lock on the metapage, to conflict with any fastupdate
1770          * insertions.
1771          */
1772         PredicateLockPage(scan->indexRelation, GIN_METAPAGE_BLKNO, scan->xs_snapshot);
1773
1774         LockBuffer(metabuffer, GIN_SHARE);
1775         page = BufferGetPage(metabuffer);
1776         TestForOldSnapshot(scan->xs_snapshot, scan->indexRelation, page);
1777         blkno = GinPageGetMeta(page)->head;
1778
1779         /*
1780          * fetch head of list before unlocking metapage. head page must be pinned
1781          * to prevent deletion by vacuum process
1782          */
1783         if (blkno == InvalidBlockNumber)
1784         {
1785                 /* No pending list, so proceed with normal scan */
1786                 UnlockReleaseBuffer(metabuffer);
1787                 return;
1788         }
1789
1790         pos.pendingBuffer = ReadBuffer(scan->indexRelation, blkno);
1791         LockBuffer(pos.pendingBuffer, GIN_SHARE);
1792         pos.firstOffset = FirstOffsetNumber;
1793         UnlockReleaseBuffer(metabuffer);
1794         pos.hasMatchKey = palloc(sizeof(bool) * so->nkeys);
1795
1796         /*
1797          * loop for each heap row. scanGetCandidate returns full row or row's
1798          * tuples from first page.
1799          */
1800         while (scanGetCandidate(scan, &pos))
1801         {
1802                 /*
1803                  * Check entries in tuple and set up entryRes array.
1804                  *
1805                  * If pending tuples belonging to the current heap row are spread
1806                  * across several pages, collectMatchesForHeapRow will read all of
1807                  * those pages.
1808                  */
1809                 if (!collectMatchesForHeapRow(scan, &pos))
1810                         continue;
1811
1812                 /*
1813                  * Matching of entries of one row is finished, so check row using
1814                  * consistent functions.
1815                  */
1816                 oldCtx = MemoryContextSwitchTo(so->tempCtx);
1817                 recheck = false;
1818                 match = true;
1819
1820                 for (i = 0; i < so->nkeys; i++)
1821                 {
1822                         GinScanKey      key = so->keys + i;
1823
1824                         if (!key->boolConsistentFn(key))
1825                         {
1826                                 match = false;
1827                                 break;
1828                         }
1829                         recheck |= key->recheckCurItem;
1830                 }
1831
1832                 MemoryContextSwitchTo(oldCtx);
1833                 MemoryContextReset(so->tempCtx);
1834
1835                 if (match)
1836                 {
1837                         tbm_add_tuples(tbm, &pos.item, 1, recheck);
1838                         (*ntids)++;
1839                 }
1840         }
1841
1842         pfree(pos.hasMatchKey);
1843 }
1844
1845
1846 #define GinIsVoidRes(s)         ( ((GinScanOpaque) scan->opaque)->isVoidRes )
1847
1848 int64
1849 gingetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
1850 {
1851         GinScanOpaque so = (GinScanOpaque) scan->opaque;
1852         int64           ntids;
1853         ItemPointerData iptr;
1854         bool            recheck;
1855
1856         /*
1857          * Set up the scan keys, and check for unsatisfiable query.
1858          */
1859         ginFreeScanKeys(so);            /* there should be no keys yet, but just to be
1860                                                                  * sure */
1861         ginNewScanKey(scan);
1862
1863         if (GinIsVoidRes(scan))
1864                 return 0;
1865
1866         ntids = 0;
1867
1868         /*
1869          * First, scan the pending list and collect any matching entries into the
1870          * bitmap.  After we scan a pending item, some other backend could post it
1871          * into the main index, and so we might visit it a second time during the
1872          * main scan.  This is okay because we'll just re-set the same bit in the
1873          * bitmap.  (The possibility of duplicate visits is a major reason why GIN
1874          * can't support the amgettuple API, however.) Note that it would not do
1875          * to scan the main index before the pending list, since concurrent
1876          * cleanup could then make us miss entries entirely.
1877          */
1878         scanPendingInsert(scan, tbm, &ntids);
1879
1880         /*
1881          * Now scan the main index.
1882          */
1883         startScan(scan);
1884
1885         ItemPointerSetMin(&iptr);
1886
1887         for (;;)
1888         {
1889                 CHECK_FOR_INTERRUPTS();
1890
1891                 if (!scanGetItem(scan, iptr, &iptr, &recheck))
1892                         break;
1893
1894                 if (ItemPointerIsLossyPage(&iptr))
1895                         tbm_add_page(tbm, ItemPointerGetBlockNumber(&iptr));
1896                 else
1897                         tbm_add_tuples(tbm, &iptr, 1, recheck);
1898                 ntids++;
1899         }
1900
1901         return ntids;
1902 }