]> granicus.if.org Git - postgresql/blob - src/backend/access/gin/ginget.c
Use a separate memory context for GIN scan keys.
[postgresql] / src / backend / access / gin / ginget.c
1 /*-------------------------------------------------------------------------
2  *
3  * ginget.c
4  *        fetch tuples from a GIN scan.
5  *
6  *
7  * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
8  * Portions Copyright (c) 1994, Regents of the University of California
9  *
10  * IDENTIFICATION
11  *                      src/backend/access/gin/ginget.c
12  *-------------------------------------------------------------------------
13  */
14
15 #include "postgres.h"
16
17 #include "access/gin_private.h"
18 #include "access/relscan.h"
19 #include "miscadmin.h"
20 #include "utils/datum.h"
21 #include "utils/memutils.h"
22
23 /* GUC parameter */
24 int                     GinFuzzySearchLimit = 0;
25
26 typedef struct pendingPosition
27 {
28         Buffer          pendingBuffer;
29         OffsetNumber firstOffset;
30         OffsetNumber lastOffset;
31         ItemPointerData item;
32         bool       *hasMatchKey;
33 } pendingPosition;
34
35
36 /*
37  * Goes to the next page if current offset is outside of bounds
38  */
39 static bool
40 moveRightIfItNeeded(GinBtreeData *btree, GinBtreeStack *stack)
41 {
42         Page            page = BufferGetPage(stack->buffer);
43
44         if (stack->off > PageGetMaxOffsetNumber(page))
45         {
46                 /*
47                  * We scanned the whole page, so we should take right page
48                  */
49                 if (GinPageRightMost(page))
50                         return false;           /* no more pages */
51
52                 stack->buffer = ginStepRight(stack->buffer, btree->index, GIN_SHARE);
53                 stack->blkno = BufferGetBlockNumber(stack->buffer);
54                 stack->off = FirstOffsetNumber;
55         }
56
57         return true;
58 }
59
60 /*
61  * Scan all pages of a posting tree and save all its heap ItemPointers
62  * in scanEntry->matchBitmap
63  */
64 static void
65 scanPostingTree(Relation index, GinScanEntry scanEntry,
66                                 BlockNumber rootPostingTree)
67 {
68         GinBtreeData btree;
69         GinBtreeStack *stack;
70         Buffer          buffer;
71         Page            page;
72
73         /* Descend to the leftmost leaf page */
74         stack = ginScanBeginPostingTree(&btree, index, rootPostingTree);
75         buffer = stack->buffer;
76         IncrBufferRefCount(buffer); /* prevent unpin in freeGinBtreeStack */
77
78         freeGinBtreeStack(stack);
79
80         /*
81          * Loop iterates through all leaf pages of posting tree
82          */
83         for (;;)
84         {
85                 page = BufferGetPage(buffer);
86                 if ((GinPageGetOpaque(page)->flags & GIN_DELETED) == 0)
87                 {
88                         int                     n = GinDataLeafPageGetItemsToTbm(page, scanEntry->matchBitmap);
89
90                         scanEntry->predictNumberResult += n;
91                 }
92
93                 if (GinPageRightMost(page))
94                         break;                          /* no more pages */
95
96                 buffer = ginStepRight(buffer, index, GIN_SHARE);
97         }
98
99         UnlockReleaseBuffer(buffer);
100 }
101
102 /*
103  * Collects TIDs into scanEntry->matchBitmap for all heap tuples that
104  * match the search entry.  This supports three different match modes:
105  *
106  * 1. Partial-match support: scan from current point until the
107  *        comparePartialFn says we're done.
108  * 2. SEARCH_MODE_ALL: scan from current point (which should be first
109  *        key for the current attnum) until we hit null items or end of attnum
110  * 3. SEARCH_MODE_EVERYTHING: scan from current point (which should be first
111  *        key for the current attnum) until we hit end of attnum
112  *
113  * Returns true if done, false if it's necessary to restart scan from scratch
114  */
115 static bool
116 collectMatchBitmap(GinBtreeData *btree, GinBtreeStack *stack,
117                                    GinScanEntry scanEntry)
118 {
119         OffsetNumber attnum;
120         Form_pg_attribute attr;
121
122         /* Initialize empty bitmap result */
123         scanEntry->matchBitmap = tbm_create(work_mem * 1024L);
124
125         /* Null query cannot partial-match anything */
126         if (scanEntry->isPartialMatch &&
127                 scanEntry->queryCategory != GIN_CAT_NORM_KEY)
128                 return true;
129
130         /* Locate tupdesc entry for key column (for attbyval/attlen data) */
131         attnum = scanEntry->attnum;
132         attr = btree->ginstate->origTupdesc->attrs[attnum - 1];
133
134         for (;;)
135         {
136                 Page            page;
137                 IndexTuple      itup;
138                 Datum           idatum;
139                 GinNullCategory icategory;
140
141                 /*
142                  * stack->off points to the interested entry, buffer is already locked
143                  */
144                 if (moveRightIfItNeeded(btree, stack) == false)
145                         return true;
146
147                 page = BufferGetPage(stack->buffer);
148                 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
149
150                 /*
151                  * If tuple stores another attribute then stop scan
152                  */
153                 if (gintuple_get_attrnum(btree->ginstate, itup) != attnum)
154                         return true;
155
156                 /* Safe to fetch attribute value */
157                 idatum = gintuple_get_key(btree->ginstate, itup, &icategory);
158
159                 /*
160                  * Check for appropriate scan stop conditions
161                  */
162                 if (scanEntry->isPartialMatch)
163                 {
164                         int32           cmp;
165
166                         /*
167                          * In partial match, stop scan at any null (including
168                          * placeholders); partial matches never match nulls
169                          */
170                         if (icategory != GIN_CAT_NORM_KEY)
171                                 return true;
172
173                         /*----------
174                          * Check of partial match.
175                          * case cmp == 0 => match
176                          * case cmp > 0 => not match and finish scan
177                          * case cmp < 0 => not match and continue scan
178                          *----------
179                          */
180                         cmp = DatumGetInt32(FunctionCall4Coll(&btree->ginstate->comparePartialFn[attnum - 1],
181                                                            btree->ginstate->supportCollation[attnum - 1],
182                                                                                                   scanEntry->queryKey,
183                                                                                                   idatum,
184                                                                                  UInt16GetDatum(scanEntry->strategy),
185                                                                         PointerGetDatum(scanEntry->extra_data)));
186
187                         if (cmp > 0)
188                                 return true;
189                         else if (cmp < 0)
190                         {
191                                 stack->off++;
192                                 continue;
193                         }
194                 }
195                 else if (scanEntry->searchMode == GIN_SEARCH_MODE_ALL)
196                 {
197                         /*
198                          * In ALL mode, we are not interested in null items, so we can
199                          * stop if we get to a null-item placeholder (which will be the
200                          * last entry for a given attnum).  We do want to include NULL_KEY
201                          * and EMPTY_ITEM entries, though.
202                          */
203                         if (icategory == GIN_CAT_NULL_ITEM)
204                                 return true;
205                 }
206
207                 /*
208                  * OK, we want to return the TIDs listed in this entry.
209                  */
210                 if (GinIsPostingTree(itup))
211                 {
212                         BlockNumber rootPostingTree = GinGetPostingTree(itup);
213
214                         /*
215                          * We should unlock current page (but not unpin) during tree scan
216                          * to prevent deadlock with vacuum processes.
217                          *
218                          * We save current entry value (idatum) to be able to re-find our
219                          * tuple after re-locking
220                          */
221                         if (icategory == GIN_CAT_NORM_KEY)
222                                 idatum = datumCopy(idatum, attr->attbyval, attr->attlen);
223
224                         LockBuffer(stack->buffer, GIN_UNLOCK);
225
226                         /* Collect all the TIDs in this entry's posting tree */
227                         scanPostingTree(btree->index, scanEntry, rootPostingTree);
228
229                         /*
230                          * We lock again the entry page and while it was unlocked insert
231                          * might have occurred, so we need to re-find our position.
232                          */
233                         LockBuffer(stack->buffer, GIN_SHARE);
234                         page = BufferGetPage(stack->buffer);
235                         if (!GinPageIsLeaf(page))
236                         {
237                                 /*
238                                  * Root page becomes non-leaf while we unlock it. We will
239                                  * start again, this situation doesn't occur often - root can
240                                  * became a non-leaf only once per life of index.
241                                  */
242                                 return false;
243                         }
244
245                         /* Search forward to re-find idatum */
246                         for (;;)
247                         {
248                                 Datum           newDatum;
249                                 GinNullCategory newCategory;
250
251                                 if (moveRightIfItNeeded(btree, stack) == false)
252                                         elog(ERROR, "lost saved point in index");       /* must not happen !!! */
253
254                                 page = BufferGetPage(stack->buffer);
255                                 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stack->off));
256
257                                 if (gintuple_get_attrnum(btree->ginstate, itup) != attnum)
258                                         elog(ERROR, "lost saved point in index");       /* must not happen !!! */
259                                 newDatum = gintuple_get_key(btree->ginstate, itup,
260                                                                                         &newCategory);
261
262                                 if (ginCompareEntries(btree->ginstate, attnum,
263                                                                           newDatum, newCategory,
264                                                                           idatum, icategory) == 0)
265                                         break;          /* Found! */
266
267                                 stack->off++;
268                         }
269
270                         if (icategory == GIN_CAT_NORM_KEY && !attr->attbyval)
271                                 pfree(DatumGetPointer(idatum));
272                 }
273                 else
274                 {
275                         ItemPointer ipd;
276                         int                     nipd;
277
278                         ipd = ginReadTuple(btree->ginstate, scanEntry->attnum, itup, &nipd);
279                         tbm_add_tuples(scanEntry->matchBitmap, ipd, nipd, false);
280                         scanEntry->predictNumberResult += GinGetNPosting(itup);
281                 }
282
283                 /*
284                  * Done with this entry, go to the next
285                  */
286                 stack->off++;
287         }
288 }
289
290 /*
291  * Start* functions setup beginning state of searches: finds correct buffer and pins it.
292  */
293 static void
294 startScanEntry(GinState *ginstate, GinScanEntry entry)
295 {
296         GinBtreeData btreeEntry;
297         GinBtreeStack *stackEntry;
298         Page            page;
299         bool            needUnlock;
300
301 restartScanEntry:
302         entry->buffer = InvalidBuffer;
303         ItemPointerSetMin(&entry->curItem);
304         entry->offset = InvalidOffsetNumber;
305         entry->list = NULL;
306         entry->nlist = 0;
307         entry->matchBitmap = NULL;
308         entry->matchResult = NULL;
309         entry->reduceResult = FALSE;
310         entry->predictNumberResult = 0;
311
312         /*
313          * we should find entry, and begin scan of posting tree or just store
314          * posting list in memory
315          */
316         ginPrepareEntryScan(&btreeEntry, entry->attnum,
317                                                 entry->queryKey, entry->queryCategory,
318                                                 ginstate);
319         stackEntry = ginFindLeafPage(&btreeEntry, true);
320         page = BufferGetPage(stackEntry->buffer);
321         needUnlock = TRUE;
322
323         entry->isFinished = TRUE;
324
325         if (entry->isPartialMatch ||
326                 entry->queryCategory == GIN_CAT_EMPTY_QUERY)
327         {
328                 /*
329                  * btreeEntry.findItem locates the first item >= given search key.
330                  * (For GIN_CAT_EMPTY_QUERY, it will find the leftmost index item
331                  * because of the way the GIN_CAT_EMPTY_QUERY category code is
332                  * assigned.)  We scan forward from there and collect all TIDs needed
333                  * for the entry type.
334                  */
335                 btreeEntry.findItem(&btreeEntry, stackEntry);
336                 if (collectMatchBitmap(&btreeEntry, stackEntry, entry) == false)
337                 {
338                         /*
339                          * GIN tree was seriously restructured, so we will cleanup all
340                          * found data and rescan. See comments near 'return false' in
341                          * collectMatchBitmap()
342                          */
343                         if (entry->matchBitmap)
344                         {
345                                 if (entry->matchIterator)
346                                         tbm_end_iterate(entry->matchIterator);
347                                 entry->matchIterator = NULL;
348                                 tbm_free(entry->matchBitmap);
349                                 entry->matchBitmap = NULL;
350                         }
351                         LockBuffer(stackEntry->buffer, GIN_UNLOCK);
352                         freeGinBtreeStack(stackEntry);
353                         goto restartScanEntry;
354                 }
355
356                 if (entry->matchBitmap && !tbm_is_empty(entry->matchBitmap))
357                 {
358                         entry->matchIterator = tbm_begin_iterate(entry->matchBitmap);
359                         entry->isFinished = FALSE;
360                 }
361         }
362         else if (btreeEntry.findItem(&btreeEntry, stackEntry))
363         {
364                 IndexTuple      itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off));
365
366                 if (GinIsPostingTree(itup))
367                 {
368                         BlockNumber rootPostingTree = GinGetPostingTree(itup);
369                         GinBtreeStack *stack;
370                         Page            page;
371                         ItemPointerData minItem;
372
373                         /*
374                          * We should unlock entry page before touching posting tree to
375                          * prevent deadlocks with vacuum processes. Because entry is never
376                          * deleted from page and posting tree is never reduced to the
377                          * posting list, we can unlock page after getting BlockNumber of
378                          * root of posting tree.
379                          */
380                         LockBuffer(stackEntry->buffer, GIN_UNLOCK);
381                         needUnlock = FALSE;
382
383                         stack = ginScanBeginPostingTree(&entry->btree, ginstate->index,
384                                                                                         rootPostingTree);
385                         entry->buffer = stack->buffer;
386
387                         /*
388                          * We keep buffer pinned because we need to prevent deletion of
389                          * page during scan. See GIN's vacuum implementation. RefCount is
390                          * increased to keep buffer pinned after freeGinBtreeStack() call.
391                          */
392                         IncrBufferRefCount(entry->buffer);
393
394                         page = BufferGetPage(entry->buffer);
395
396                         /*
397                          * Load the first page into memory.
398                          */
399                         ItemPointerSetMin(&minItem);
400                         entry->list = GinDataLeafPageGetItems(page, &entry->nlist, minItem);
401
402                         entry->predictNumberResult = stack->predictNumber * entry->nlist;
403
404                         LockBuffer(entry->buffer, GIN_UNLOCK);
405                         freeGinBtreeStack(stack);
406                         entry->isFinished = FALSE;
407                 }
408                 else if (GinGetNPosting(itup) > 0)
409                 {
410                         entry->list = ginReadTuple(ginstate, entry->attnum, itup,
411                                                                            &entry->nlist);
412                         entry->predictNumberResult = entry->nlist;
413
414                         entry->isFinished = FALSE;
415                 }
416         }
417
418         if (needUnlock)
419                 LockBuffer(stackEntry->buffer, GIN_UNLOCK);
420         freeGinBtreeStack(stackEntry);
421 }
422
423 /*
424  * Comparison function for scan entry indexes. Sorts by predictNumberResult,
425  * least frequent items first.
426  */
427 static int
428 entryIndexByFrequencyCmp(const void *a1, const void *a2, void *arg)
429 {
430         const GinScanKey key = (const GinScanKey) arg;
431         int                     i1 = *(const int *) a1;
432         int                     i2 = *(const int *) a2;
433         uint32          n1 = key->scanEntry[i1]->predictNumberResult;
434         uint32          n2 = key->scanEntry[i2]->predictNumberResult;
435
436         if (n1 < n2)
437                 return -1;
438         else if (n1 == n2)
439                 return 0;
440         else
441                 return 1;
442 }
443
444 static void
445 startScanKey(GinState *ginstate, GinScanOpaque so, GinScanKey key)
446 {
447         MemoryContext oldCtx = CurrentMemoryContext;
448         int                     i;
449         int                     j;
450         int                *entryIndexes;
451
452         ItemPointerSetMin(&key->curItem);
453         key->curItemMatches = false;
454         key->recheckCurItem = false;
455         key->isFinished = false;
456
457         /*
458          * Divide the entries into two distinct sets: required and additional.
459          * Additional entries are not enough for a match alone, without any items
460          * from the required set, but are needed by the consistent function to
461          * decide if an item matches. When scanning, we can skip over items from
462          * additional entries that have no corresponding matches in any of the
463          * required entries. That speeds up queries like "frequent & rare"
464          * considerably, if the frequent term can be put in the additional set.
465          *
466          * There can be many legal ways to divide them entries into these two
467          * sets. A conservative division is to just put everything in the required
468          * set, but the more you can put in the additional set, the more you can
469          * skip during the scan. To maximize skipping, we try to put as many
470          * frequent items as possible into additional, and less frequent ones into
471          * required. To do that, sort the entries by frequency
472          * (predictNumberResult), and put entries into the required set in that
473          * order, until the consistent function says that none of the remaining
474          * entries can form a match, without any items from the required set. The
475          * rest go to the additional set.
476          */
477         if (key->nentries > 1)
478         {
479                 MemoryContextSwitchTo(so->tempCtx);
480
481                 entryIndexes = (int *) palloc(sizeof(int) * key->nentries);
482                 for (i = 0; i < key->nentries; i++)
483                         entryIndexes[i] = i;
484                 qsort_arg(entryIndexes, key->nentries, sizeof(int),
485                                   entryIndexByFrequencyCmp, key);
486
487                 for (i = 0; i < key->nentries - 1; i++)
488                 {
489                         /* Pass all entries <= i as FALSE, and the rest as MAYBE */
490                         for (j = 0; j <= i; j++)
491                                 key->entryRes[entryIndexes[j]] = GIN_FALSE;
492                         for (j = i + 1; j < key->nentries; j++)
493                                 key->entryRes[entryIndexes[j]] = GIN_MAYBE;
494
495                         if (key->triConsistentFn(key) == GIN_FALSE)
496                                 break;
497                 }
498                 /* i is now the last required entry. */
499
500                 MemoryContextSwitchTo(so->keyCtx);
501
502                 key->nrequired = i + 1;
503                 key->nadditional = key->nentries - key->nrequired;
504                 key->requiredEntries = palloc(key->nrequired * sizeof(GinScanEntry));
505                 key->additionalEntries = palloc(key->nadditional * sizeof(GinScanEntry));
506
507                 j = 0;
508                 for (i = 0; i < key->nrequired; i++)
509                         key->requiredEntries[i] = key->scanEntry[entryIndexes[j++]];
510                 for (i = 0; i < key->nadditional; i++)
511                         key->additionalEntries[i] = key->scanEntry[entryIndexes[j++]];
512
513                 /* clean up after consistentFn calls (also frees entryIndexes) */
514                 MemoryContextReset(so->tempCtx);
515         }
516         else
517         {
518                 MemoryContextSwitchTo(so->keyCtx);
519
520                 key->nrequired = 1;
521                 key->nadditional = 0;
522                 key->requiredEntries = palloc(1 * sizeof(GinScanEntry));
523                 key->requiredEntries[0] = key->scanEntry[0];
524         }
525         MemoryContextSwitchTo(oldCtx);
526 }
527
528 static void
529 startScan(IndexScanDesc scan)
530 {
531         GinScanOpaque so = (GinScanOpaque) scan->opaque;
532         GinState   *ginstate = &so->ginstate;
533         uint32          i;
534
535         for (i = 0; i < so->totalentries; i++)
536                 startScanEntry(ginstate, so->entries[i]);
537
538         if (GinFuzzySearchLimit > 0)
539         {
540                 /*
541                  * If all of keys more than threshold we will try to reduce result, we
542                  * hope (and only hope, for intersection operation of array our
543                  * supposition isn't true), that total result will not more than
544                  * minimal predictNumberResult.
545                  */
546                 bool            reduce = true;
547
548                 for (i = 0; i < so->totalentries; i++)
549                 {
550                         if (so->entries[i]->predictNumberResult <= so->totalentries * GinFuzzySearchLimit)
551                         {
552                                 reduce = false;
553                                 break;
554                         }
555                 }
556                 if (reduce)
557                 {
558                         for (i = 0; i < so->totalentries; i++)
559                         {
560                                 so->entries[i]->predictNumberResult /= so->totalentries;
561                                 so->entries[i]->reduceResult = TRUE;
562                         }
563                 }
564         }
565
566         /*
567          * Now that we have the estimates for the entry frequencies, finish
568          * initializing the scan keys.
569          */
570         for (i = 0; i < so->nkeys; i++)
571                 startScanKey(ginstate, so, so->keys + i);
572 }
573
574 /*
575  * Load the next batch of item pointers from a posting tree.
576  *
577  * Note that we copy the page into GinScanEntry->list array and unlock it, but
578  * keep it pinned to prevent interference with vacuum.
579  */
580 static void
581 entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, ItemPointerData advancePast)
582 {
583         Page            page;
584         int                     i;
585         bool            stepright;
586
587         if (!BufferIsValid(entry->buffer))
588         {
589                 entry->isFinished = true;
590                 return;
591         }
592
593         /*
594          * We have two strategies for finding the correct page: step right from
595          * the current page, or descend the tree again from the root. If
596          * advancePast equals the current item, the next matching item should be
597          * on the next page, so we step right. Otherwise, descend from root.
598          */
599         if (ginCompareItemPointers(&entry->curItem, &advancePast) == 0)
600         {
601                 stepright = true;
602                 LockBuffer(entry->buffer, GIN_SHARE);
603         }
604         else
605         {
606                 GinBtreeStack *stack;
607
608                 ReleaseBuffer(entry->buffer);
609
610                 /*
611                  * Set the search key, and find the correct leaf page.
612                  */
613                 if (ItemPointerIsLossyPage(&advancePast))
614                 {
615                         ItemPointerSet(&entry->btree.itemptr,
616                                                    GinItemPointerGetBlockNumber(&advancePast) + 1,
617                                                    FirstOffsetNumber);
618                 }
619                 else
620                 {
621                         entry->btree.itemptr = advancePast;
622                         entry->btree.itemptr.ip_posid++;
623                 }
624                 entry->btree.fullScan = false;
625                 stack = ginFindLeafPage(&entry->btree, true);
626
627                 /* we don't need the stack, just the buffer. */
628                 entry->buffer = stack->buffer;
629                 IncrBufferRefCount(entry->buffer);
630                 freeGinBtreeStack(stack);
631                 stepright = false;
632         }
633
634         elog(DEBUG2, "entryLoadMoreItems, %u/%u, skip: %d",
635                  GinItemPointerGetBlockNumber(&advancePast),
636                  GinItemPointerGetOffsetNumber(&advancePast),
637                  !stepright);
638
639         page = BufferGetPage(entry->buffer);
640         for (;;)
641         {
642                 entry->offset = InvalidOffsetNumber;
643                 if (entry->list)
644                 {
645                         pfree(entry->list);
646                         entry->list = NULL;
647                         entry->nlist = 0;
648                 }
649
650                 if (stepright)
651                 {
652                         /*
653                          * We've processed all the entries on this page. If it was the
654                          * last page in the tree, we're done.
655                          */
656                         if (GinPageRightMost(page))
657                         {
658                                 UnlockReleaseBuffer(entry->buffer);
659                                 entry->buffer = InvalidBuffer;
660                                 entry->isFinished = TRUE;
661                                 return;
662                         }
663
664                         /*
665                          * Step to next page, following the right link. then find the
666                          * first ItemPointer greater than advancePast.
667                          */
668                         entry->buffer = ginStepRight(entry->buffer,
669                                                                                  ginstate->index,
670                                                                                  GIN_SHARE);
671                         page = BufferGetPage(entry->buffer);
672                 }
673                 stepright = true;
674
675                 if (GinPageGetOpaque(page)->flags & GIN_DELETED)
676                         continue;                       /* page was deleted by concurrent vacuum */
677
678                 /*
679                  * The first item > advancePast might not be on this page, but
680                  * somewhere to the right, if the page was split, or a non-match from
681                  * another key in the query allowed us to skip some items from this
682                  * entry. Keep following the right-links until we re-find the correct
683                  * page.
684                  */
685                 if (!GinPageRightMost(page) &&
686                         ginCompareItemPointers(&advancePast, GinDataPageGetRightBound(page)) >= 0)
687                 {
688                         /*
689                          * the item we're looking is > the right bound of the page, so it
690                          * can't be on this page.
691                          */
692                         continue;
693                 }
694
695                 entry->list = GinDataLeafPageGetItems(page, &entry->nlist, advancePast);
696
697                 for (i = 0; i < entry->nlist; i++)
698                 {
699                         if (ginCompareItemPointers(&advancePast, &entry->list[i]) < 0)
700                         {
701                                 entry->offset = i;
702
703                                 if (GinPageRightMost(page))
704                                 {
705                                         /* after processing the copied items, we're done. */
706                                         UnlockReleaseBuffer(entry->buffer);
707                                         entry->buffer = InvalidBuffer;
708                                 }
709                                 else
710                                         LockBuffer(entry->buffer, GIN_UNLOCK);
711                                 return;
712                         }
713                 }
714         }
715 }
716
717 #define gin_rand() (((double) random()) / ((double) MAX_RANDOM_VALUE))
718 #define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) )
719
720 /*
721  * Sets entry->curItem to next heap item pointer > advancePast, for one entry
722  * of one scan key, or sets entry->isFinished to TRUE if there are no more.
723  *
724  * Item pointers are returned in ascending order.
725  *
726  * Note: this can return a "lossy page" item pointer, indicating that the
727  * entry potentially matches all items on that heap page.  However, it is
728  * not allowed to return both a lossy page pointer and exact (regular)
729  * item pointers for the same page.  (Doing so would break the key-combination
730  * logic in keyGetItem and scanGetItem; see comment in scanGetItem.)  In the
731  * current implementation this is guaranteed by the behavior of tidbitmaps.
732  */
733 static void
734 entryGetItem(GinState *ginstate, GinScanEntry entry,
735                          ItemPointerData advancePast)
736 {
737         Assert(!entry->isFinished);
738
739         Assert(!ItemPointerIsValid(&entry->curItem) ||
740                    ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
741
742         if (entry->matchBitmap)
743         {
744                 /* A bitmap result */
745                 BlockNumber advancePastBlk = GinItemPointerGetBlockNumber(&advancePast);
746                 OffsetNumber advancePastOff = GinItemPointerGetOffsetNumber(&advancePast);
747                 bool            gotitem = false;
748
749                 do
750                 {
751                         /*
752                          * If we've exhausted all items on this block, move to next block
753                          * in the bitmap.
754                          */
755                         while (entry->matchResult == NULL ||
756                                    (entry->matchResult->ntuples >= 0 &&
757                                         entry->offset >= entry->matchResult->ntuples) ||
758                                    entry->matchResult->blockno < advancePastBlk ||
759                                    (ItemPointerIsLossyPage(&advancePast) &&
760                                         entry->matchResult->blockno == advancePastBlk))
761                         {
762                                 entry->matchResult = tbm_iterate(entry->matchIterator);
763
764                                 if (entry->matchResult == NULL)
765                                 {
766                                         ItemPointerSetInvalid(&entry->curItem);
767                                         tbm_end_iterate(entry->matchIterator);
768                                         entry->matchIterator = NULL;
769                                         entry->isFinished = TRUE;
770                                         break;
771                                 }
772
773                                 /*
774                                  * Reset counter to the beginning of entry->matchResult. Note:
775                                  * entry->offset is still greater than matchResult->ntuples if
776                                  * matchResult is lossy.  So, on next call we will get next
777                                  * result from TIDBitmap.
778                                  */
779                                 entry->offset = 0;
780                         }
781                         if (entry->isFinished)
782                                 break;
783
784                         /*
785                          * We're now on the first page after advancePast which has any
786                          * items on it. If it's a lossy result, return that.
787                          */
788                         if (entry->matchResult->ntuples < 0)
789                         {
790                                 ItemPointerSetLossyPage(&entry->curItem,
791                                                                                 entry->matchResult->blockno);
792
793                                 /*
794                                  * We might as well fall out of the loop; we could not
795                                  * estimate number of results on this page to support correct
796                                  * reducing of result even if it's enabled.
797                                  */
798                                 gotitem = true;
799                                 break;
800                         }
801
802                         /*
803                          * Not a lossy page. Skip over any offsets <= advancePast, and
804                          * return that.
805                          */
806                         if (entry->matchResult->blockno == advancePastBlk)
807                         {
808                                 /*
809                                  * First, do a quick check against the last offset on the
810                                  * page. If that's > advancePast, so are all the other
811                                  * offsets.
812                                  */
813                                 if (entry->matchResult->offsets[entry->matchResult->ntuples - 1] <= advancePastOff)
814                                 {
815                                         entry->offset = entry->matchResult->ntuples;
816                                         continue;
817                                 }
818
819                                 /* Otherwise scan to find the first item > advancePast */
820                                 while (entry->matchResult->offsets[entry->offset] <= advancePastOff)
821                                         entry->offset++;
822                         }
823
824                         ItemPointerSet(&entry->curItem,
825                                                    entry->matchResult->blockno,
826                                                    entry->matchResult->offsets[entry->offset]);
827                         entry->offset++;
828                         gotitem = true;
829                 } while (!gotitem || (entry->reduceResult == TRUE && dropItem(entry)));
830         }
831         else if (!BufferIsValid(entry->buffer))
832         {
833                 /*
834                  * A posting list from an entry tuple, or the last page of a posting
835                  * tree.
836                  */
837                 do
838                 {
839                         if (entry->offset >= entry->nlist)
840                         {
841                                 ItemPointerSetInvalid(&entry->curItem);
842                                 entry->isFinished = TRUE;
843                                 break;
844                         }
845
846                         entry->curItem = entry->list[entry->offset++];
847                 } while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
848                 /* XXX: shouldn't we apply the fuzzy search limit here? */
849         }
850         else
851         {
852                 /* A posting tree */
853                 do
854                 {
855                         /* If we've processed the current batch, load more items */
856                         while (entry->offset >= entry->nlist)
857                         {
858                                 entryLoadMoreItems(ginstate, entry, advancePast);
859
860                                 if (entry->isFinished)
861                                 {
862                                         ItemPointerSetInvalid(&entry->curItem);
863                                         return;
864                                 }
865                         }
866
867                         entry->curItem = entry->list[entry->offset++];
868
869                 } while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0 ||
870                                  (entry->reduceResult == TRUE && dropItem(entry)));
871         }
872 }
873
874 /*
875  * Identify the "current" item among the input entry streams for this scan key
876  * that is greater than advancePast, and test whether it passes the scan key
877  * qual condition.
878  *
879  * The current item is the smallest curItem among the inputs.  key->curItem
880  * is set to that value.  key->curItemMatches is set to indicate whether that
881  * TID passes the consistentFn test.  If so, key->recheckCurItem is set true
882  * iff recheck is needed for this item pointer (including the case where the
883  * item pointer is a lossy page pointer).
884  *
885  * If all entry streams are exhausted, sets key->isFinished to TRUE.
886  *
887  * Item pointers must be returned in ascending order.
888  *
889  * Note: this can return a "lossy page" item pointer, indicating that the
890  * key potentially matches all items on that heap page.  However, it is
891  * not allowed to return both a lossy page pointer and exact (regular)
892  * item pointers for the same page.  (Doing so would break the key-combination
893  * logic in scanGetItem.)
894  */
895 static void
896 keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key,
897                    ItemPointerData advancePast)
898 {
899         ItemPointerData minItem;
900         ItemPointerData curPageLossy;
901         uint32          i;
902         bool            haveLossyEntry;
903         GinScanEntry entry;
904         GinTernaryValue res;
905         MemoryContext oldCtx;
906         bool            allFinished;
907
908         Assert(!key->isFinished);
909
910         /*
911          * We might have already tested this item; if so, no need to repeat work.
912          * (Note: the ">" case can happen, if advancePast is exact but we
913          * previously had to set curItem to a lossy-page pointer.)
914          */
915         if (ginCompareItemPointers(&key->curItem, &advancePast) > 0)
916                 return;
917
918         /*
919          * Find the minimum item > advancePast among the active entry streams.
920          *
921          * Note: a lossy-page entry is encoded by a ItemPointer with max value for
922          * offset (0xffff), so that it will sort after any exact entries for the
923          * same page.  So we'll prefer to return exact pointers not lossy
924          * pointers, which is good.
925          */
926         ItemPointerSetMax(&minItem);
927         allFinished = true;
928         for (i = 0; i < key->nrequired; i++)
929         {
930                 entry = key->requiredEntries[i];
931
932                 if (entry->isFinished)
933                         continue;
934
935                 /*
936                  * Advance this stream if necessary.
937                  *
938                  * In particular, since entry->curItem was initialized with
939                  * ItemPointerSetMin, this ensures we fetch the first item for each
940                  * entry on the first call.
941                  */
942                 if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
943                 {
944                         entryGetItem(ginstate, entry, advancePast);
945                         if (entry->isFinished)
946                                 continue;
947                 }
948
949                 allFinished = false;
950                 if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
951                         minItem = entry->curItem;
952         }
953
954         if (allFinished)
955         {
956                 /* all entries are finished */
957                 key->isFinished = TRUE;
958                 return;
959         }
960
961         /*
962          * Ok, we now know that there are no matches < minItem.
963          *
964          * If minItem is lossy, it means that there were no exact items on the
965          * page among requiredEntries, because lossy pointers sort after exact
966          * items. However, there might be exact items for the same page among
967          * additionalEntries, so we mustn't advance past them.
968          */
969         if (ItemPointerIsLossyPage(&minItem))
970         {
971                 if (GinItemPointerGetBlockNumber(&advancePast) <
972                         GinItemPointerGetBlockNumber(&minItem))
973                 {
974                         advancePast.ip_blkid = minItem.ip_blkid;
975                         advancePast.ip_posid = 0;
976                 }
977         }
978         else
979         {
980                 Assert(minItem.ip_posid > 0);
981                 advancePast = minItem;
982                 advancePast.ip_posid--;
983         }
984
985         /*
986          * We might not have loaded all the entry streams for this TID yet. We
987          * could call the consistent function, passing MAYBE for those entries, to
988          * see if it can decide if this TID matches based on the information we
989          * have. But if the consistent-function is expensive, and cannot in fact
990          * decide with partial information, that could be a big loss. So, load all
991          * the additional entries, before calling the consistent function.
992          */
993         for (i = 0; i < key->nadditional; i++)
994         {
995                 entry = key->additionalEntries[i];
996
997                 if (entry->isFinished)
998                         continue;
999
1000                 if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
1001                 {
1002                         entryGetItem(ginstate, entry, advancePast);
1003                         if (entry->isFinished)
1004                                 continue;
1005                 }
1006
1007                 /*
1008                  * Normally, none of the items in additionalEntries can have a curItem
1009                  * larger than minItem. But if minItem is a lossy page, then there
1010                  * might be exact items on the same page among additionalEntries.
1011                  */
1012                 if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
1013                 {
1014                         Assert(ItemPointerIsLossyPage(&minItem));
1015                         minItem = entry->curItem;
1016                 }
1017         }
1018
1019         /*
1020          * Ok, we've advanced all the entries up to minItem now. Set key->curItem,
1021          * and perform consistentFn test.
1022          *
1023          * Lossy-page entries pose a problem, since we don't know the correct
1024          * entryRes state to pass to the consistentFn, and we also don't know what
1025          * its combining logic will be (could be AND, OR, or even NOT). If the
1026          * logic is OR then the consistentFn might succeed for all items in the
1027          * lossy page even when none of the other entries match.
1028          *
1029          * Our strategy is to call the tri-state consistent function, with the
1030          * lossy-page entries set to MAYBE, and all the other entries FALSE. If it
1031          * returns FALSE, none of the lossy items alone are enough for a match, so
1032          * we don't need to return a lossy-page pointer. Otherwise, return a
1033          * lossy-page pointer to indicate that the whole heap page must be
1034          * checked.  (On subsequent calls, we'll do nothing until minItem is past
1035          * the page altogether, thus ensuring that we never return both regular
1036          * and lossy pointers for the same page.)
1037          *
1038          * An exception is that it doesn't matter what we pass for lossy pointers
1039          * in "hidden" entries, because the consistentFn's result can't depend on
1040          * them. We could pass them as MAYBE as well, but if we're using the
1041          * "shim" implementation of a tri-state consistent function (see
1042          * ginlogic.c), it's better to pass as few MAYBEs as possible. So pass
1043          * them as TRUE.
1044          *
1045          * Note that only lossy-page entries pointing to the current item's page
1046          * should trigger this processing; we might have future lossy pages in the
1047          * entry array, but they aren't relevant yet.
1048          */
1049         key->curItem = minItem;
1050         ItemPointerSetLossyPage(&curPageLossy,
1051                                                         GinItemPointerGetBlockNumber(&key->curItem));
1052         haveLossyEntry = false;
1053         for (i = 0; i < key->nentries; i++)
1054         {
1055                 entry = key->scanEntry[i];
1056                 if (entry->isFinished == FALSE &&
1057                         ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
1058                 {
1059                         if (i < key->nuserentries)
1060                                 key->entryRes[i] = GIN_MAYBE;
1061                         else
1062                                 key->entryRes[i] = GIN_TRUE;
1063                         haveLossyEntry = true;
1064                 }
1065                 else
1066                         key->entryRes[i] = GIN_FALSE;
1067         }
1068
1069         /* prepare for calling consistentFn in temp context */
1070         oldCtx = MemoryContextSwitchTo(tempCtx);
1071
1072         if (haveLossyEntry)
1073         {
1074                 /* Have lossy-page entries, so see if whole page matches */
1075                 res = key->triConsistentFn(key);
1076
1077                 if (res == GIN_TRUE || res == GIN_MAYBE)
1078                 {
1079                         /* Yes, so clean up ... */
1080                         MemoryContextSwitchTo(oldCtx);
1081                         MemoryContextReset(tempCtx);
1082
1083                         /* and return lossy pointer for whole page */
1084                         key->curItem = curPageLossy;
1085                         key->curItemMatches = true;
1086                         key->recheckCurItem = true;
1087                         return;
1088                 }
1089         }
1090
1091         /*
1092          * At this point we know that we don't need to return a lossy whole-page
1093          * pointer, but we might have matches for individual exact item pointers,
1094          * possibly in combination with a lossy pointer. Pass lossy pointers as
1095          * MAYBE to the ternary consistent function, to let it decide if this
1096          * tuple satisfies the overall key, even though we don't know if the lossy
1097          * entries match.
1098          *
1099          * Prepare entryRes array to be passed to consistentFn.
1100          */
1101         for (i = 0; i < key->nentries; i++)
1102         {
1103                 entry = key->scanEntry[i];
1104                 if (entry->isFinished)
1105                         key->entryRes[i] = GIN_FALSE;
1106 #if 0
1107
1108                 /*
1109                  * This case can't currently happen, because we loaded all the entries
1110                  * for this item earlier.
1111                  */
1112                 else if (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
1113                         key->entryRes[i] = GIN_MAYBE;
1114 #endif
1115                 else if (ginCompareItemPointers(&entry->curItem, &curPageLossy) == 0)
1116                         key->entryRes[i] = GIN_MAYBE;
1117                 else if (ginCompareItemPointers(&entry->curItem, &minItem) == 0)
1118                         key->entryRes[i] = GIN_TRUE;
1119                 else
1120                         key->entryRes[i] = GIN_FALSE;
1121         }
1122
1123         res = key->triConsistentFn(key);
1124
1125         switch (res)
1126         {
1127                 case GIN_TRUE:
1128                         key->curItemMatches = true;
1129                         /* triConsistentFn set recheckCurItem */
1130                         break;
1131
1132                 case GIN_FALSE:
1133                         key->curItemMatches = false;
1134                         break;
1135
1136                 case GIN_MAYBE:
1137                         key->curItemMatches = true;
1138                         key->recheckCurItem = true;
1139                         break;
1140
1141                 default:
1142
1143                         /*
1144                          * the 'default' case shouldn't happen, but if the consistent
1145                          * function returns something bogus, this is the safe result
1146                          */
1147                         key->curItemMatches = true;
1148                         key->recheckCurItem = true;
1149                         break;
1150         }
1151
1152         /*
1153          * We have a tuple, and we know if it matches or not. If it's a non-match,
1154          * we could continue to find the next matching tuple, but let's break out
1155          * and give scanGetItem a chance to advance the other keys. They might be
1156          * able to skip past to a much higher TID, allowing us to save work.
1157          */
1158
1159         /* clean up after consistentFn calls */
1160         MemoryContextSwitchTo(oldCtx);
1161         MemoryContextReset(tempCtx);
1162 }
1163
1164 /*
1165  * Get next heap item pointer (after advancePast) from scan.
1166  * Returns true if anything found.
1167  * On success, *item and *recheck are set.
1168  *
1169  * Note: this is very nearly the same logic as in keyGetItem(), except
1170  * that we know the keys are to be combined with AND logic, whereas in
1171  * keyGetItem() the combination logic is known only to the consistentFn.
1172  */
1173 static bool
1174 scanGetItem(IndexScanDesc scan, ItemPointerData advancePast,
1175                         ItemPointerData *item, bool *recheck)
1176 {
1177         GinScanOpaque so = (GinScanOpaque) scan->opaque;
1178         uint32          i;
1179         bool            match;
1180
1181         /*----------
1182          * Advance the scan keys in lock-step, until we find an item that matches
1183          * all the keys. If any key reports isFinished, meaning its subset of the
1184          * entries is exhausted, we can stop.  Otherwise, set *item to the next
1185          * matching item.
1186          *
1187          * This logic works only if a keyGetItem stream can never contain both
1188          * exact and lossy pointers for the same page.  Else we could have a
1189          * case like
1190          *
1191          *              stream 1                stream 2
1192          *              ...             ...
1193          *              42/6                    42/7
1194          *              50/1                    42/0xffff
1195          *              ...             ...
1196          *
1197          * We would conclude that 42/6 is not a match and advance stream 1,
1198          * thus never detecting the match to the lossy pointer in stream 2.
1199          * (keyGetItem has a similar problem versus entryGetItem.)
1200          *----------
1201          */
1202         do
1203         {
1204                 ItemPointerSetMin(item);
1205                 match = true;
1206                 for (i = 0; i < so->nkeys && match; i++)
1207                 {
1208                         GinScanKey      key = so->keys + i;
1209
1210                         /* Fetch the next item for this key that is > advancePast. */
1211                         keyGetItem(&so->ginstate, so->tempCtx, key, advancePast);
1212
1213                         if (key->isFinished)
1214                                 return false;
1215
1216                         /*
1217                          * If it's not a match, we can immediately conclude that nothing
1218                          * <= this item matches, without checking the rest of the keys.
1219                          */
1220                         if (!key->curItemMatches)
1221                         {
1222                                 advancePast = key->curItem;
1223                                 match = false;
1224                                 break;
1225                         }
1226
1227                         /*
1228                          * It's a match. We can conclude that nothing < matches, so the
1229                          * other key streams can skip to this item.
1230                          *
1231                          * Beware of lossy pointers, though; from a lossy pointer, we can
1232                          * only conclude that nothing smaller than this *block* matches.
1233                          */
1234                         if (ItemPointerIsLossyPage(&key->curItem))
1235                         {
1236                                 if (GinItemPointerGetBlockNumber(&advancePast) <
1237                                         GinItemPointerGetBlockNumber(&key->curItem))
1238                                 {
1239                                         advancePast.ip_blkid = key->curItem.ip_blkid;
1240                                         advancePast.ip_posid = 0;
1241                                 }
1242                         }
1243                         else
1244                         {
1245                                 Assert(key->curItem.ip_posid > 0);
1246                                 advancePast = key->curItem;
1247                                 advancePast.ip_posid--;
1248                         }
1249
1250                         /*
1251                          * If this is the first key, remember this location as a potential
1252                          * match, and proceed to check the rest of the keys.
1253                          *
1254                          * Otherwise, check if this is the same item that we checked the
1255                          * previous keys for (or a lossy pointer for the same page). If
1256                          * not, loop back to check the previous keys for this item (we
1257                          * will check this key again too, but keyGetItem returns quickly
1258                          * for that)
1259                          */
1260                         if (i == 0)
1261                         {
1262                                 *item = key->curItem;
1263                         }
1264                         else
1265                         {
1266                                 if (ItemPointerIsLossyPage(&key->curItem) ||
1267                                         ItemPointerIsLossyPage(item))
1268                                 {
1269                                         Assert(GinItemPointerGetBlockNumber(&key->curItem) >= GinItemPointerGetBlockNumber(item));
1270                                         match = (GinItemPointerGetBlockNumber(&key->curItem) ==
1271                                                          GinItemPointerGetBlockNumber(item));
1272                                 }
1273                                 else
1274                                 {
1275                                         Assert(ginCompareItemPointers(&key->curItem, item) >= 0);
1276                                         match = (ginCompareItemPointers(&key->curItem, item) == 0);
1277                                 }
1278                         }
1279                 }
1280         } while (!match);
1281
1282         Assert(!ItemPointerIsMin(item));
1283
1284         /*
1285          * Now *item contains the first ItemPointer after previous result that
1286          * satisfied all the keys for that exact TID, or a lossy reference to the
1287          * same page.
1288          *
1289          * We must return recheck = true if any of the keys are marked recheck.
1290          */
1291         *recheck = false;
1292         for (i = 0; i < so->nkeys; i++)
1293         {
1294                 GinScanKey      key = so->keys + i;
1295
1296                 if (key->recheckCurItem)
1297                 {
1298                         *recheck = true;
1299                         break;
1300                 }
1301         }
1302
1303         return TRUE;
1304 }
1305
1306
1307 /*
1308  * Functions for scanning the pending list
1309  */
1310
1311
1312 /*
1313  * Get ItemPointer of next heap row to be checked from pending list.
1314  * Returns false if there are no more. On pages with several heap rows
1315  * it returns each row separately, on page with part of heap row returns
1316  * per page data.  pos->firstOffset and pos->lastOffset are set to identify
1317  * the range of pending-list tuples belonging to this heap row.
1318  *
1319  * The pendingBuffer is presumed pinned and share-locked on entry, and is
1320  * pinned and share-locked on success exit.  On failure exit it's released.
1321  */
1322 static bool
1323 scanGetCandidate(IndexScanDesc scan, pendingPosition *pos)
1324 {
1325         OffsetNumber maxoff;
1326         Page            page;
1327         IndexTuple      itup;
1328
1329         ItemPointerSetInvalid(&pos->item);
1330         for (;;)
1331         {
1332                 page = BufferGetPage(pos->pendingBuffer);
1333
1334                 maxoff = PageGetMaxOffsetNumber(page);
1335                 if (pos->firstOffset > maxoff)
1336                 {
1337                         BlockNumber blkno = GinPageGetOpaque(page)->rightlink;
1338
1339                         if (blkno == InvalidBlockNumber)
1340                         {
1341                                 UnlockReleaseBuffer(pos->pendingBuffer);
1342                                 pos->pendingBuffer = InvalidBuffer;
1343
1344                                 return false;
1345                         }
1346                         else
1347                         {
1348                                 /*
1349                                  * Here we must prevent deletion of next page by insertcleanup
1350                                  * process, which may be trying to obtain exclusive lock on
1351                                  * current page.  So, we lock next page before releasing the
1352                                  * current one
1353                                  */
1354                                 Buffer          tmpbuf = ReadBuffer(scan->indexRelation, blkno);
1355
1356                                 LockBuffer(tmpbuf, GIN_SHARE);
1357                                 UnlockReleaseBuffer(pos->pendingBuffer);
1358
1359                                 pos->pendingBuffer = tmpbuf;
1360                                 pos->firstOffset = FirstOffsetNumber;
1361                         }
1362                 }
1363                 else
1364                 {
1365                         itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->firstOffset));
1366                         pos->item = itup->t_tid;
1367                         if (GinPageHasFullRow(page))
1368                         {
1369                                 /*
1370                                  * find itempointer to the next row
1371                                  */
1372                                 for (pos->lastOffset = pos->firstOffset + 1; pos->lastOffset <= maxoff; pos->lastOffset++)
1373                                 {
1374                                         itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, pos->lastOffset));
1375                                         if (!ItemPointerEquals(&pos->item, &itup->t_tid))
1376                                                 break;
1377                                 }
1378                         }
1379                         else
1380                         {
1381                                 /*
1382                                  * All itempointers are the same on this page
1383                                  */
1384                                 pos->lastOffset = maxoff + 1;
1385                         }
1386
1387                         /*
1388                          * Now pos->firstOffset points to the first tuple of current heap
1389                          * row, pos->lastOffset points to the first tuple of next heap row
1390                          * (or to the end of page)
1391                          */
1392                         break;
1393                 }
1394         }
1395
1396         return true;
1397 }
1398
1399 /*
1400  * Scan pending-list page from current tuple (off) up till the first of:
1401  * - match is found (then returns true)
1402  * - no later match is possible
1403  * - tuple's attribute number is not equal to entry's attrnum
1404  * - reach end of page
1405  *
1406  * datum[]/category[]/datumExtracted[] arrays are used to cache the results
1407  * of gintuple_get_key() on the current page.
1408  */
1409 static bool
1410 matchPartialInPendingList(GinState *ginstate, Page page,
1411                                                   OffsetNumber off, OffsetNumber maxoff,
1412                                                   GinScanEntry entry,
1413                                                   Datum *datum, GinNullCategory *category,
1414                                                   bool *datumExtracted)
1415 {
1416         IndexTuple      itup;
1417         int32           cmp;
1418
1419         /* Partial match to a null is not possible */
1420         if (entry->queryCategory != GIN_CAT_NORM_KEY)
1421                 return false;
1422
1423         while (off < maxoff)
1424         {
1425                 itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, off));
1426
1427                 if (gintuple_get_attrnum(ginstate, itup) != entry->attnum)
1428                         return false;
1429
1430                 if (datumExtracted[off - 1] == false)
1431                 {
1432                         datum[off - 1] = gintuple_get_key(ginstate, itup,
1433                                                                                           &category[off - 1]);
1434                         datumExtracted[off - 1] = true;
1435                 }
1436
1437                 /* Once we hit nulls, no further match is possible */
1438                 if (category[off - 1] != GIN_CAT_NORM_KEY)
1439                         return false;
1440
1441                 /*----------
1442                  * Check partial match.
1443                  * case cmp == 0 => match
1444                  * case cmp > 0 => not match and end scan (no later match possible)
1445                  * case cmp < 0 => not match and continue scan
1446                  *----------
1447                  */
1448                 cmp = DatumGetInt32(FunctionCall4Coll(&ginstate->comparePartialFn[entry->attnum - 1],
1449                                                            ginstate->supportCollation[entry->attnum - 1],
1450                                                                                           entry->queryKey,
1451                                                                                           datum[off - 1],
1452                                                                                           UInt16GetDatum(entry->strategy),
1453                                                                                 PointerGetDatum(entry->extra_data)));
1454                 if (cmp == 0)
1455                         return true;
1456                 else if (cmp > 0)
1457                         return false;
1458
1459                 off++;
1460         }
1461
1462         return false;
1463 }
1464
1465 /*
1466  * Set up the entryRes array for each key by looking at
1467  * every entry for current heap row in pending list.
1468  *
1469  * Returns true if each scan key has at least one entryRes match.
1470  * This corresponds to the situations where the normal index search will
1471  * try to apply the key's consistentFn.  (A tuple not meeting that requirement
1472  * cannot be returned by the normal search since no entry stream will
1473  * source its TID.)
1474  *
1475  * The pendingBuffer is presumed pinned and share-locked on entry.
1476  */
1477 static bool
1478 collectMatchesForHeapRow(IndexScanDesc scan, pendingPosition *pos)
1479 {
1480         GinScanOpaque so = (GinScanOpaque) scan->opaque;
1481         OffsetNumber attrnum;
1482         Page            page;
1483         IndexTuple      itup;
1484         int                     i,
1485                                 j;
1486
1487         /*
1488          * Reset all entryRes and hasMatchKey flags
1489          */
1490         for (i = 0; i < so->nkeys; i++)
1491         {
1492                 GinScanKey      key = so->keys + i;
1493
1494                 memset(key->entryRes, GIN_FALSE, key->nentries);
1495         }
1496         memset(pos->hasMatchKey, FALSE, so->nkeys);
1497
1498         /*
1499          * Outer loop iterates over multiple pending-list pages when a single heap
1500          * row has entries spanning those pages.
1501          */
1502         for (;;)
1503         {
1504                 Datum           datum[BLCKSZ / sizeof(IndexTupleData)];
1505                 GinNullCategory category[BLCKSZ / sizeof(IndexTupleData)];
1506                 bool            datumExtracted[BLCKSZ / sizeof(IndexTupleData)];
1507
1508                 Assert(pos->lastOffset > pos->firstOffset);
1509                 memset(datumExtracted + pos->firstOffset - 1, 0,
1510                            sizeof(bool) * (pos->lastOffset - pos->firstOffset));
1511
1512                 page = BufferGetPage(pos->pendingBuffer);
1513
1514                 for (i = 0; i < so->nkeys; i++)
1515                 {
1516                         GinScanKey      key = so->keys + i;
1517
1518                         for (j = 0; j < key->nentries; j++)
1519                         {
1520                                 GinScanEntry entry = key->scanEntry[j];
1521                                 OffsetNumber StopLow = pos->firstOffset,
1522                                                         StopHigh = pos->lastOffset,
1523                                                         StopMiddle;
1524
1525                                 /* If already matched on earlier page, do no extra work */
1526                                 if (key->entryRes[j])
1527                                         continue;
1528
1529                                 /*
1530                                  * Interesting tuples are from pos->firstOffset to
1531                                  * pos->lastOffset and they are ordered by (attnum, Datum) as
1532                                  * it's done in entry tree.  So we can use binary search to
1533                                  * avoid linear scanning.
1534                                  */
1535                                 while (StopLow < StopHigh)
1536                                 {
1537                                         int                     res;
1538
1539                                         StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
1540
1541                                         itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, StopMiddle));
1542
1543                                         attrnum = gintuple_get_attrnum(&so->ginstate, itup);
1544
1545                                         if (key->attnum < attrnum)
1546                                         {
1547                                                 StopHigh = StopMiddle;
1548                                                 continue;
1549                                         }
1550                                         if (key->attnum > attrnum)
1551                                         {
1552                                                 StopLow = StopMiddle + 1;
1553                                                 continue;
1554                                         }
1555
1556                                         if (datumExtracted[StopMiddle - 1] == false)
1557                                         {
1558                                                 datum[StopMiddle - 1] =
1559                                                         gintuple_get_key(&so->ginstate, itup,
1560                                                                                          &category[StopMiddle - 1]);
1561                                                 datumExtracted[StopMiddle - 1] = true;
1562                                         }
1563
1564                                         if (entry->queryCategory == GIN_CAT_EMPTY_QUERY)
1565                                         {
1566                                                 /* special behavior depending on searchMode */
1567                                                 if (entry->searchMode == GIN_SEARCH_MODE_ALL)
1568                                                 {
1569                                                         /* match anything except NULL_ITEM */
1570                                                         if (category[StopMiddle - 1] == GIN_CAT_NULL_ITEM)
1571                                                                 res = -1;
1572                                                         else
1573                                                                 res = 0;
1574                                                 }
1575                                                 else
1576                                                 {
1577                                                         /* match everything */
1578                                                         res = 0;
1579                                                 }
1580                                         }
1581                                         else
1582                                         {
1583                                                 res = ginCompareEntries(&so->ginstate,
1584                                                                                                 entry->attnum,
1585                                                                                                 entry->queryKey,
1586                                                                                                 entry->queryCategory,
1587                                                                                                 datum[StopMiddle - 1],
1588                                                                                                 category[StopMiddle - 1]);
1589                                         }
1590
1591                                         if (res == 0)
1592                                         {
1593                                                 /*
1594                                                  * Found exact match (there can be only one, except in
1595                                                  * EMPTY_QUERY mode).
1596                                                  *
1597                                                  * If doing partial match, scan forward from here to
1598                                                  * end of page to check for matches.
1599                                                  *
1600                                                  * See comment above about tuple's ordering.
1601                                                  */
1602                                                 if (entry->isPartialMatch)
1603                                                         key->entryRes[j] =
1604                                                                 matchPartialInPendingList(&so->ginstate,
1605                                                                                                                   page,
1606                                                                                                                   StopMiddle,
1607                                                                                                                   pos->lastOffset,
1608                                                                                                                   entry,
1609                                                                                                                   datum,
1610                                                                                                                   category,
1611                                                                                                                   datumExtracted);
1612                                                 else
1613                                                         key->entryRes[j] = true;
1614
1615                                                 /* done with binary search */
1616                                                 break;
1617                                         }
1618                                         else if (res < 0)
1619                                                 StopHigh = StopMiddle;
1620                                         else
1621                                                 StopLow = StopMiddle + 1;
1622                                 }
1623
1624                                 if (StopLow >= StopHigh && entry->isPartialMatch)
1625                                 {
1626                                         /*
1627                                          * No exact match on this page.  If doing partial match,
1628                                          * scan from the first tuple greater than target value to
1629                                          * end of page.  Note that since we don't remember whether
1630                                          * the comparePartialFn told us to stop early on a
1631                                          * previous page, we will uselessly apply comparePartialFn
1632                                          * to the first tuple on each subsequent page.
1633                                          */
1634                                         key->entryRes[j] =
1635                                                 matchPartialInPendingList(&so->ginstate,
1636                                                                                                   page,
1637                                                                                                   StopHigh,
1638                                                                                                   pos->lastOffset,
1639                                                                                                   entry,
1640                                                                                                   datum,
1641                                                                                                   category,
1642                                                                                                   datumExtracted);
1643                                 }
1644
1645                                 pos->hasMatchKey[i] |= key->entryRes[j];
1646                         }
1647                 }
1648
1649                 /* Advance firstOffset over the scanned tuples */
1650                 pos->firstOffset = pos->lastOffset;
1651
1652                 if (GinPageHasFullRow(page))
1653                 {
1654                         /*
1655                          * We have examined all pending entries for the current heap row.
1656                          * Break out of loop over pages.
1657                          */
1658                         break;
1659                 }
1660                 else
1661                 {
1662                         /*
1663                          * Advance to next page of pending entries for the current heap
1664                          * row.  Complain if there isn't one.
1665                          */
1666                         ItemPointerData item = pos->item;
1667
1668                         if (scanGetCandidate(scan, pos) == false ||
1669                                 !ItemPointerEquals(&pos->item, &item))
1670                                 elog(ERROR, "could not find additional pending pages for same heap tuple");
1671                 }
1672         }
1673
1674         /*
1675          * Now return "true" if all scan keys have at least one matching datum
1676          */
1677         for (i = 0; i < so->nkeys; i++)
1678         {
1679                 if (pos->hasMatchKey[i] == false)
1680                         return false;
1681         }
1682
1683         return true;
1684 }
1685
1686 /*
1687  * Collect all matched rows from pending list into bitmap
1688  */
1689 static void
1690 scanPendingInsert(IndexScanDesc scan, TIDBitmap *tbm, int64 *ntids)
1691 {
1692         GinScanOpaque so = (GinScanOpaque) scan->opaque;
1693         MemoryContext oldCtx;
1694         bool            recheck,
1695                                 match;
1696         int                     i;
1697         pendingPosition pos;
1698         Buffer          metabuffer = ReadBuffer(scan->indexRelation, GIN_METAPAGE_BLKNO);
1699         BlockNumber blkno;
1700
1701         *ntids = 0;
1702
1703         LockBuffer(metabuffer, GIN_SHARE);
1704         blkno = GinPageGetMeta(BufferGetPage(metabuffer))->head;
1705
1706         /*
1707          * fetch head of list before unlocking metapage. head page must be pinned
1708          * to prevent deletion by vacuum process
1709          */
1710         if (blkno == InvalidBlockNumber)
1711         {
1712                 /* No pending list, so proceed with normal scan */
1713                 UnlockReleaseBuffer(metabuffer);
1714                 return;
1715         }
1716
1717         pos.pendingBuffer = ReadBuffer(scan->indexRelation, blkno);
1718         LockBuffer(pos.pendingBuffer, GIN_SHARE);
1719         pos.firstOffset = FirstOffsetNumber;
1720         UnlockReleaseBuffer(metabuffer);
1721         pos.hasMatchKey = palloc(sizeof(bool) * so->nkeys);
1722
1723         /*
1724          * loop for each heap row. scanGetCandidate returns full row or row's
1725          * tuples from first page.
1726          */
1727         while (scanGetCandidate(scan, &pos))
1728         {
1729                 /*
1730                  * Check entries in tuple and set up entryRes array.
1731                  *
1732                  * If pending tuples belonging to the current heap row are spread
1733                  * across several pages, collectMatchesForHeapRow will read all of
1734                  * those pages.
1735                  */
1736                 if (!collectMatchesForHeapRow(scan, &pos))
1737                         continue;
1738
1739                 /*
1740                  * Matching of entries of one row is finished, so check row using
1741                  * consistent functions.
1742                  */
1743                 oldCtx = MemoryContextSwitchTo(so->tempCtx);
1744                 recheck = false;
1745                 match = true;
1746
1747                 for (i = 0; i < so->nkeys; i++)
1748                 {
1749                         GinScanKey      key = so->keys + i;
1750
1751                         if (!key->boolConsistentFn(key))
1752                         {
1753                                 match = false;
1754                                 break;
1755                         }
1756                         recheck |= key->recheckCurItem;
1757                 }
1758
1759                 MemoryContextSwitchTo(oldCtx);
1760                 MemoryContextReset(so->tempCtx);
1761
1762                 if (match)
1763                 {
1764                         tbm_add_tuples(tbm, &pos.item, 1, recheck);
1765                         (*ntids)++;
1766                 }
1767         }
1768
1769         pfree(pos.hasMatchKey);
1770 }
1771
1772
1773 #define GinIsVoidRes(s)         ( ((GinScanOpaque) scan->opaque)->isVoidRes )
1774
1775 Datum
1776 gingetbitmap(PG_FUNCTION_ARGS)
1777 {
1778         IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0);
1779         TIDBitmap  *tbm = (TIDBitmap *) PG_GETARG_POINTER(1);
1780         GinScanOpaque so = (GinScanOpaque) scan->opaque;
1781         int64           ntids;
1782         ItemPointerData iptr;
1783         bool            recheck;
1784
1785         /*
1786          * Set up the scan keys, and check for unsatisfiable query.
1787          */
1788         ginFreeScanKeys(so); /* there should be no keys yet, but just to be sure */
1789         ginNewScanKey(scan);
1790
1791         if (GinIsVoidRes(scan))
1792                 PG_RETURN_INT64(0);
1793
1794         ntids = 0;
1795
1796         /*
1797          * First, scan the pending list and collect any matching entries into the
1798          * bitmap.  After we scan a pending item, some other backend could post it
1799          * into the main index, and so we might visit it a second time during the
1800          * main scan.  This is okay because we'll just re-set the same bit in the
1801          * bitmap.  (The possibility of duplicate visits is a major reason why GIN
1802          * can't support the amgettuple API, however.) Note that it would not do
1803          * to scan the main index before the pending list, since concurrent
1804          * cleanup could then make us miss entries entirely.
1805          */
1806         scanPendingInsert(scan, tbm, &ntids);
1807
1808         /*
1809          * Now scan the main index.
1810          */
1811         startScan(scan);
1812
1813         ItemPointerSetMin(&iptr);
1814
1815         for (;;)
1816         {
1817                 CHECK_FOR_INTERRUPTS();
1818
1819                 if (!scanGetItem(scan, iptr, &iptr, &recheck))
1820                         break;
1821
1822                 if (ItemPointerIsLossyPage(&iptr))
1823                         tbm_add_page(tbm, ItemPointerGetBlockNumber(&iptr));
1824                 else
1825                         tbm_add_tuples(tbm, &iptr, 1, recheck);
1826                 ntids++;
1827         }
1828
1829         PG_RETURN_INT64(ntids);
1830 }