1 /*-------------------------------------------------------------------------
2  *
3  * heapam_handler.c
4  *        heap table access method code
5  *
6  * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *        src/backend/access/heap/heapam_handler.c
12  *
13  *
14  * NOTES
15  *        This file wires up the lower level heapam.c et al routines with the
16  *        tableam abstraction.
17  *
18  *-------------------------------------------------------------------------
19  */
20 #include "postgres.h"
21
22 #include <math.h>
23
24 #include "miscadmin.h"
25
26 #include "access/genam.h"
27 #include "access/heapam.h"
28 #include "access/multixact.h"
29 #include "access/rewriteheap.h"
30 #include "access/tableam.h"
31 #include "access/tsmapi.h"
32 #include "access/xact.h"
33 #include "catalog/catalog.h"
34 #include "catalog/index.h"
35 #include "catalog/storage.h"
36 #include "catalog/storage_xlog.h"
37 #include "commands/progress.h"
38 #include "executor/executor.h"
39 #include "optimizer/plancat.h"
40 #include "pgstat.h"
41 #include "storage/bufmgr.h"
42 #include "storage/bufpage.h"
44 #include "storage/lmgr.h"
45 #include "storage/predicate.h"
46 #include "storage/procarray.h"
47 #include "storage/smgr.h"
48 #include "utils/builtins.h"
49 #include "utils/rel.h"
50
51
52 static void reform_and_rewrite_tuple(HeapTuple tuple,
53                                                  Relation OldHeap, Relation NewHeap,
54                                                  Datum *values, bool *isnull, RewriteState rwstate);
55
56 static bool SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
57                                            HeapTuple tuple,
58                                            OffsetNumber tupoffset);
59
60 static BlockNumber heapam_scan_get_blocks_done(HeapScanDesc hscan);
61
62 static const TableAmRoutine heapam_methods;
63
64
65 /* ------------------------------------------------------------------------
66  * Slot related callbacks for heap AM
67  * ------------------------------------------------------------------------
68  */
69
70 static const TupleTableSlotOps *
71 heapam_slot_callbacks(Relation relation)
72 {
73         return &TTSOpsBufferHeapTuple;
74 }
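
/*
 * Usage sketch: callers never reference TTSOpsBufferHeapTuple directly;
 * they go through the tableam wrappers, which consult this callback.
 * Assuming "rel" is an already-opened and locked heap relation:
 *
 *     TupleTableSlot *slot = table_slot_create(rel, NULL);
 *
 *     ... scan or fetch tuples into the slot ...
 *
 *     ExecDropSingleTupleTableSlot(slot);
 */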
75
76
77 /* ------------------------------------------------------------------------
78  * Index Scan Callbacks for heap AM
79  * ------------------------------------------------------------------------
80  */
81
82 static IndexFetchTableData *
83 heapam_index_fetch_begin(Relation rel)
84 {
85         IndexFetchHeapData *hscan = palloc0(sizeof(IndexFetchHeapData));
86
87         hscan->xs_base.rel = rel;
88         hscan->xs_cbuf = InvalidBuffer;
89
90         return &hscan->xs_base;
91 }
92
93 static void
94 heapam_index_fetch_reset(IndexFetchTableData *scan)
95 {
96         IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
97
98         if (BufferIsValid(hscan->xs_cbuf))
99         {
100                 ReleaseBuffer(hscan->xs_cbuf);
101                 hscan->xs_cbuf = InvalidBuffer;
102         }
103 }
104
105 static void
106 heapam_index_fetch_end(IndexFetchTableData *scan)
107 {
108         IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
109
110         heapam_index_fetch_reset(scan);
111
112         pfree(hscan);
113 }
114
115 static bool
116 heapam_index_fetch_tuple(struct IndexFetchTableData *scan,
117                                                  ItemPointer tid,
118                                                  Snapshot snapshot,
119                                                  TupleTableSlot *slot,
120                                                  bool *call_again, bool *all_dead)
121 {
122         IndexFetchHeapData *hscan = (IndexFetchHeapData *) scan;
123         BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
124         bool            got_heap_tuple;
125
126         Assert(TTS_IS_BUFFERTUPLE(slot));
127
128         /* We can skip the buffer-switching logic if we're in mid-HOT chain. */
129         if (!*call_again)
130         {
131                 /* Switch to correct buffer if we don't have it already */
132                 Buffer          prev_buf = hscan->xs_cbuf;
133
134                 hscan->xs_cbuf = ReleaseAndReadBuffer(hscan->xs_cbuf,
135                                                                                           hscan->xs_base.rel,
136                                                                                           ItemPointerGetBlockNumber(tid));
137
138                 /*
139                  * Prune page, but only if we weren't already on this page
140                  */
141                 if (prev_buf != hscan->xs_cbuf)
142                         heap_page_prune_opt(hscan->xs_base.rel, hscan->xs_cbuf);
143         }
144
145         /* Obtain share-lock on the buffer so we can examine visibility */
146         LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_SHARE);
147         got_heap_tuple = heap_hot_search_buffer(tid,
148                                                                                         hscan->xs_base.rel,
149                                                                                         hscan->xs_cbuf,
150                                                                                         snapshot,
151                                                                                         &bslot->base.tupdata,
152                                                                                         all_dead,
153                                                                                         !*call_again);
154         bslot->base.tupdata.t_self = *tid;
155         LockBuffer(hscan->xs_cbuf, BUFFER_LOCK_UNLOCK);
156
157         if (got_heap_tuple)
158         {
159                 /*
160                  * Only in a non-MVCC snapshot can more than one member of the HOT
161                  * chain be visible.
162                  */
163                 *call_again = !IsMVCCSnapshot(snapshot);
164
165                 slot->tts_tableOid = RelationGetRelid(scan->rel);
166                 ExecStoreBufferHeapTuple(&bslot->base.tupdata, slot, hscan->xs_cbuf);
167         }
168         else
169         {
170                 /* We've reached the end of the HOT chain. */
171                 *call_again = false;
172         }
173
174         return got_heap_tuple;
175 }
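
/*
 * Usage sketch of the fetch protocol implemented above (simplified from
 * what index_fetch_heap() in indexam.c does): the caller passes
 * call_again = false for the first fetch of a TID and keeps calling while
 * call_again stays true, so every visible member of a HOT chain is
 * returned (only possible with non-MVCC snapshots).  "rel", "tid",
 * "snapshot" and "slot" are assumed to be set up by the caller:
 *
 *     IndexFetchTableData *fetch = table_index_fetch_begin(rel);
 *     bool        call_again = false;
 *     bool        all_dead = false;
 *
 *     while (table_index_fetch_tuple(fetch, &tid, snapshot, slot,
 *                                    &call_again, &all_dead))
 *     {
 *         ... process the tuple stored in slot ...
 *         if (!call_again)
 *             break;
 *     }
 *     table_index_fetch_end(fetch);
 *
 * all_dead lets the caller mark the index entry as killed when no chain
 * member is visible to anyone.
 */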
176
177
178 /* ------------------------------------------------------------------------
179  * Callbacks for non-modifying operations on individual tuples for heap AM
180  * ------------------------------------------------------------------------
181  */
182
183 static bool
184 heapam_fetch_row_version(Relation relation,
185                                                  ItemPointer tid,
186                                                  Snapshot snapshot,
187                                                  TupleTableSlot *slot)
188 {
189         BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
190         Buffer          buffer;
191
192         Assert(TTS_IS_BUFFERTUPLE(slot));
193
194         bslot->base.tupdata.t_self = *tid;
195         if (heap_fetch(relation, snapshot, &bslot->base.tupdata, &buffer))
196         {
197                 /* store in slot, transferring existing pin */
198                 ExecStorePinnedBufferHeapTuple(&bslot->base.tupdata, slot, buffer);
199                 slot->tts_tableOid = RelationGetRelid(relation);
200
201                 return true;
202         }
203
204         return false;
205 }
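
/*
 * Usage sketch: this callback is reached through
 * table_tuple_fetch_row_version(), used e.g. by triggers and ON CONFLICT
 * processing to re-fetch a specific row version by TID.  Assuming "tid"
 * was obtained earlier and "slot" came from table_slot_create():
 *
 *     if (table_tuple_fetch_row_version(rel, &tid, SnapshotAny, slot))
 *         ... slot now holds that row version, pinned in a buffer ...
 *     else
 *         ... no such row version exists (or none visible to the snapshot) ...
 */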
206
207 static bool
208 heapam_tuple_satisfies_snapshot(Relation rel, TupleTableSlot *slot,
209                                                                 Snapshot snapshot)
210 {
211         BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
212         bool            res;
213
214         Assert(TTS_IS_BUFFERTUPLE(slot));
215         Assert(BufferIsValid(bslot->buffer));
216
217         /*
218          * We need buffer pin and lock to call HeapTupleSatisfiesVisibility.
219          * Caller should be holding pin, but not lock.
220          */
221         LockBuffer(bslot->buffer, BUFFER_LOCK_SHARE);
222         res = HeapTupleSatisfiesVisibility(bslot->base.tuple, snapshot,
223                                                                            bslot->buffer);
224         LockBuffer(bslot->buffer, BUFFER_LOCK_UNLOCK);
225
226         return res;
227 }
228
229
230 /* ----------------------------------------------------------------------------
231  *  Functions for manipulations of physical tuples for heap AM.
232  * ----------------------------------------------------------------------------
233  */
234
235 static void
236 heapam_tuple_insert(Relation relation, TupleTableSlot *slot, CommandId cid,
237                                         int options, BulkInsertState bistate)
238 {
239         bool            shouldFree = true;
240         HeapTuple       tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
241
242         /* Update the tuple with table oid */
243         slot->tts_tableOid = RelationGetRelid(relation);
244         tuple->t_tableOid = slot->tts_tableOid;
245
246         /* Perform the insertion, and copy the resulting ItemPointer */
247         heap_insert(relation, tuple, cid, options, bistate);
248         ItemPointerCopy(&tuple->t_self, &slot->tts_tid);
249
250         if (shouldFree)
251                 pfree(tuple);
252 }
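
/*
 * Usage sketch: the executor reaches this callback via the
 * table_tuple_insert() wrapper in tableam.h.  Assuming "rel" is open and
 * the slot holds the new row:
 *
 *     table_tuple_insert(rel, slot, GetCurrentCommandId(true), 0, NULL);
 *
 * where 0 means no TABLE_INSERT_* options and NULL means no bulk-insert
 * state.  On return slot->tts_tid holds the TID assigned to the new row;
 * inserting index entries remains the caller's responsibility.
 */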
253
254 static void
255 heapam_tuple_insert_speculative(Relation relation, TupleTableSlot *slot,
256                                                                 CommandId cid, int options,
257                                                                 BulkInsertState bistate, uint32 specToken)
258 {
259         bool            shouldFree = true;
260         HeapTuple       tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
261
262         /* Update the tuple with table oid */
263         slot->tts_tableOid = RelationGetRelid(relation);
264         tuple->t_tableOid = slot->tts_tableOid;
265
266         HeapTupleHeaderSetSpeculativeToken(tuple->t_data, specToken);
267         options |= HEAP_INSERT_SPECULATIVE;
268
269         /* Perform the insertion, and copy the resulting ItemPointer */
270         heap_insert(relation, tuple, cid, options, bistate);
271         ItemPointerCopy(&tuple->t_self, &slot->tts_tid);
272
273         if (shouldFree)
274                 pfree(tuple);
275 }
276
277 static void
278 heapam_tuple_complete_speculative(Relation relation, TupleTableSlot *slot,
279                                                                   uint32 spekToken, bool succeeded)
280 {
281         bool            shouldFree = true;
282         HeapTuple       tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
283
284         /* adjust the tuple's state accordingly */
285         if (succeeded)
286                 heap_finish_speculative(relation, &slot->tts_tid);
287         else
288                 heap_abort_speculative(relation, &slot->tts_tid);
289
290         if (shouldFree)
291                 pfree(tuple);
292 }
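
/*
 * Usage sketch of the speculative-insertion protocol driven by INSERT ...
 * ON CONFLICT in nodeModifyTable.c (conflict checking elided;
 * "conflict_detected" stands for the outcome of the index insertions):
 *
 *     uint32      specToken;
 *
 *     specToken = SpeculativeInsertionLockAcquire(GetCurrentTransactionId());
 *     table_tuple_insert_speculative(rel, slot, GetCurrentCommandId(true),
 *                                    0, NULL, specToken);
 *     ... insert index entries, noting any conflict ...
 *     table_tuple_complete_speculative(rel, slot, specToken,
 *                                      !conflict_detected);
 *     SpeculativeInsertionLockRelease(GetCurrentTransactionId());
 *
 * succeeded = true confirms the insertion (heap_finish_speculative);
 * succeeded = false super-deletes it (heap_abort_speculative).
 */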
293
294 static TM_Result
295 heapam_tuple_delete(Relation relation, ItemPointer tid, CommandId cid,
296                                         Snapshot snapshot, Snapshot crosscheck, bool wait,
297                                         TM_FailureData *tmfd, bool changingPart)
298 {
299         /*
300          * Currently, index tuple deletion is handled at VACUUM time.  If the
301          * table AM were ever to clean up dead tuples on its own, that would
302          * also be the point at which to remove the corresponding index entries.
303          */
304         return heap_delete(relation, tid, cid, crosscheck, wait, tmfd, changingPart);
305 }
306
307
308 static TM_Result
309 heapam_tuple_update(Relation relation, ItemPointer otid, TupleTableSlot *slot,
310                                         CommandId cid, Snapshot snapshot, Snapshot crosscheck,
311                                         bool wait, TM_FailureData *tmfd,
312                                         LockTupleMode *lockmode, bool *update_indexes)
313 {
314         bool            shouldFree = true;
315         HeapTuple       tuple = ExecFetchSlotHeapTuple(slot, true, &shouldFree);
316         TM_Result       result;
317
318         /* Update the tuple with table oid */
319         slot->tts_tableOid = RelationGetRelid(relation);
320         tuple->t_tableOid = slot->tts_tableOid;
321
322         result = heap_update(relation, otid, tuple, cid, crosscheck, wait,
323                                                  tmfd, lockmode);
324         ItemPointerCopy(&tuple->t_self, &slot->tts_tid);
325
326         /*
327          * Decide whether new index entries are needed for the tuple
328          *
329          * Note: heap_update returns the tid (location) of the new tuple in the
330          * t_self field.
331          *
332          * If it's a HOT update, we mustn't insert new index entries.
333          */
334         *update_indexes = result == TM_Ok && !HeapTupleIsHeapOnly(tuple);
335
336         if (shouldFree)
337                 pfree(tuple);
338
339         return result;
340 }
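
/*
 * Usage sketch: callers go through table_tuple_update() and consult the
 * update_indexes output to decide whether new index entries are needed
 * (they are not for HOT updates).  Roughly, in executor context:
 *
 *     bool        update_indexes;
 *     TM_Result   result;
 *
 *     result = table_tuple_update(rel, otid, slot,
 *                                 GetCurrentCommandId(true),
 *                                 snapshot, InvalidSnapshot,
 *                                 true, &tmfd, &lockmode, &update_indexes);
 *     if (result == TM_Ok && update_indexes)
 *         ... insert index entries for the new tuple version ...
 */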
341
342 static TM_Result
343 heapam_tuple_lock(Relation relation, ItemPointer tid, Snapshot snapshot,
344                                   TupleTableSlot *slot, CommandId cid, LockTupleMode mode,
345                                   LockWaitPolicy wait_policy, uint8 flags,
346                                   TM_FailureData *tmfd)
347 {
348         BufferHeapTupleTableSlot *bslot = (BufferHeapTupleTableSlot *) slot;
349         TM_Result       result;
350         Buffer          buffer;
351         HeapTuple       tuple = &bslot->base.tupdata;
352         bool            follow_updates;
353
354         follow_updates = (flags & TUPLE_LOCK_FLAG_LOCK_UPDATE_IN_PROGRESS) != 0;
355         tmfd->traversed = false;
356
357         Assert(TTS_IS_BUFFERTUPLE(slot));
358
359 tuple_lock_retry:
360         tuple->t_self = *tid;
361         result = heap_lock_tuple(relation, tuple, cid, mode, wait_policy,
362                                                          follow_updates, &buffer, tmfd);
363
364         if (result == TM_Updated &&
365                 (flags & TUPLE_LOCK_FLAG_FIND_LAST_VERSION))
366         {
367                 ReleaseBuffer(buffer);
368                 /* Should not encounter speculative tuple on recheck */
369                 Assert(!HeapTupleHeaderIsSpeculative(tuple->t_data));
370
371                 if (!ItemPointerEquals(&tmfd->ctid, &tuple->t_self))
372                 {
373                         SnapshotData SnapshotDirty;
374                         TransactionId priorXmax;
375
376                         /* it was updated, so look at the updated version */
377                         *tid = tmfd->ctid;
378                         /* updated row should have xmin matching this xmax */
379                         priorXmax = tmfd->xmax;
380
381                         /* signal that a tuple later in the chain is getting locked */
382                         tmfd->traversed = true;
383
384                         /*
385                          * fetch target tuple
386                          *
387                          * Loop here to deal with updated or busy tuples
388                          */
389                         InitDirtySnapshot(SnapshotDirty);
390                         for (;;)
391                         {
392                                 if (ItemPointerIndicatesMovedPartitions(tid))
393                                         ereport(ERROR,
394                                                         (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE),
395                                                          errmsg("tuple to be locked was already moved to another partition due to concurrent update")));
396
397                                 tuple->t_self = *tid;
398                                 if (heap_fetch(relation, &SnapshotDirty, tuple, &buffer))
399                                 {
400                                         /*
401                                          * If xmin isn't what we're expecting, the slot must have
402                                          * been recycled and reused for an unrelated tuple.  This
403                                          * implies that the latest version of the row was deleted,
404                                          * so we need do nothing.  (Should be safe to examine xmin
405                                          * without getting buffer's content lock.  We assume
406                                          * reading a TransactionId to be atomic, and Xmin never
407                                          * changes in an existing tuple, except to invalid or
408                                          * frozen, and neither of those can match priorXmax.)
409                                          */
410                                         if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data),
411                                                                                          priorXmax))
412                                         {
413                                                 ReleaseBuffer(buffer);
414                                                 return TM_Deleted;
415                                         }
416
417                                         /* otherwise xmin should not be dirty... */
418                                         if (TransactionIdIsValid(SnapshotDirty.xmin))
419                                                 elog(ERROR, "t_xmin is uncommitted in tuple to be updated");
420
421                                         /*
422                                          * If tuple is being updated by other transaction then we
423                                          * have to wait for its commit/abort, or die trying.
424                                          */
425                                         if (TransactionIdIsValid(SnapshotDirty.xmax))
426                                         {
427                                                 ReleaseBuffer(buffer);
428                                                 switch (wait_policy)
429                                                 {
430                                                         case LockWaitBlock:
431                                                                 XactLockTableWait(SnapshotDirty.xmax,
432                                                                                                   relation, &tuple->t_self,
433                                                                                                   XLTW_FetchUpdated);
434                                                                 break;
435                                                         case LockWaitSkip:
436                                                                 if (!ConditionalXactLockTableWait(SnapshotDirty.xmax))
437                                                                         /* skip instead of waiting */
438                                                                         return TM_WouldBlock;
439                                                                 break;
440                                                         case LockWaitError:
441                                                                 if (!ConditionalXactLockTableWait(SnapshotDirty.xmax))
442                                                                         ereport(ERROR,
443                                                                                         (errcode(ERRCODE_LOCK_NOT_AVAILABLE),
444                                                                                          errmsg("could not obtain lock on row in relation \"%s\"",
445                                                                                                         RelationGetRelationName(relation))));
446                                                                 break;
447                                                 }
448                                                 continue;       /* loop back to repeat heap_fetch */
449                                         }
450
451                                         /*
452                                          * If tuple was inserted by our own transaction, we have
453                                          * to check cmin against cid: cmin >= current CID means
454                                          * our command cannot see the tuple, so we should ignore
455                                          * it. Otherwise heap_lock_tuple() will throw an error,
456                                          * and so would any later attempt to update or delete the
457                                          * tuple.  (We need not check cmax because
458                                          * HeapTupleSatisfiesDirty will consider a tuple deleted
459                                          * by our transaction dead, regardless of cmax.)  We just
460                                          * checked that priorXmax == xmin, so we can test that
461                                          * variable instead of doing HeapTupleHeaderGetXmin again.
462                                          */
463                                         if (TransactionIdIsCurrentTransactionId(priorXmax) &&
464                                                 HeapTupleHeaderGetCmin(tuple->t_data) >= cid)
465                                         {
466                                                 ReleaseBuffer(buffer);
467                                                 return TM_Invisible;
468                                         }
469
470                                         /*
471                                          * This is a live tuple, so try to lock it again.
472                                          */
473                                         ReleaseBuffer(buffer);
474                                         goto tuple_lock_retry;
475                                 }
476
477                                 /*
478                                  * If the referenced slot was actually empty, the latest
479                                  * version of the row must have been deleted, so we need do
480                                  * nothing.
481                                  */
482                                 if (tuple->t_data == NULL)
483                                 {
484                                         return TM_Deleted;
485                                 }
486
487                                 /*
488                                  * As above, if xmin isn't what we're expecting, do nothing.
489                                  */
490                                 if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple->t_data),
491                                                                                  priorXmax))
492                                 {
493                                         if (BufferIsValid(buffer))
494                                                 ReleaseBuffer(buffer);
495                                         return TM_Deleted;
496                                 }
497
498                                 /*
499                                  * If we get here, the tuple was found but failed
500                                  * SnapshotDirty. Assuming the xmin is either a committed xact
501                                  * or our own xact (as it certainly should be if we're trying
502                                  * to modify the tuple), this must mean that the row was
503                                  * updated or deleted by either a committed xact or our own
504                                  * xact.  If it was deleted, we can ignore it; if it was
505                                  * updated then chain up to the next version and repeat the
506                                  * whole process.
507                                  *
508                                  * As above, it should be safe to examine xmax and t_ctid
509                                  * without the buffer content lock, because they can't be
510                                  * changing.
511                                  */
512                                 if (ItemPointerEquals(&tuple->t_self, &tuple->t_data->t_ctid))
513                                 {
514                                         /* deleted, so forget about it */
515                                         if (BufferIsValid(buffer))
516                                                 ReleaseBuffer(buffer);
517                                         return TM_Deleted;
518                                 }
519
520                                 /* updated, so look at the updated row */
521                                 *tid = tuple->t_data->t_ctid;
522                                 /* updated row should have xmin matching this xmax */
523                                 priorXmax = HeapTupleHeaderGetUpdateXid(tuple->t_data);
524                                 if (BufferIsValid(buffer))
525                                         ReleaseBuffer(buffer);
526                                 /* loop back to fetch next in chain */
527                         }
528                 }
529                 else
530                 {
531                         /* tuple was deleted, so give up */
532                         return TM_Deleted;
533                 }
534         }
535
536         slot->tts_tableOid = RelationGetRelid(relation);
537         tuple->t_tableOid = slot->tts_tableOid;
538
539         /* store in slot, transferring existing pin */
540         ExecStorePinnedBufferHeapTuple(tuple, slot, buffer);
541
542         return result;
543 }
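
/*
 * Usage sketch: ExecUpdate/ExecDelete and EvalPlanQual reach the function
 * above through table_tuple_lock().  Passing
 * TUPLE_LOCK_FLAG_FIND_LAST_VERSION makes it chase the update chain to the
 * latest row version, roughly:
 *
 *     result = table_tuple_lock(rel, &tid, estate->es_snapshot, slot,
 *                               estate->es_output_cid, LockTupleExclusive,
 *                               LockWaitBlock,
 *                               TUPLE_LOCK_FLAG_FIND_LAST_VERSION,
 *                               &tmfd);
 *
 * TM_Ok means the (possibly newer) row version is locked and stored in
 * slot; TM_Deleted, TM_Invisible and TM_WouldBlock are handled according
 * to the caller's semantics.
 */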
544
545 static void
546 heapam_finish_bulk_insert(Relation relation, int options)
547 {
548         /*
549          * If we skipped writing WAL, then we need to sync the heap (but not
550          * indexes since those use WAL anyway / don't go through tableam)
551          */
552         if (options & HEAP_INSERT_SKIP_WAL)
553                 heap_sync(relation);
554 }
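
/*
 * Usage sketch: bulk loaders such as COPY pass a BulkInsertState and, when
 * safe, TABLE_INSERT_SKIP_WAL, then call table_finish_bulk_insert() so
 * this callback can fsync data that was never WAL-logged:
 *
 *     BulkInsertState bistate = GetBulkInsertState();
 *     int         options = TABLE_INSERT_SKIP_FSM;
 *
 *     if (... rel was created in this transaction and WAL is not needed ...)
 *         options |= TABLE_INSERT_SKIP_WAL;
 *
 *     ... for each row: table_tuple_insert(rel, slot, mycid, options, bistate) ...
 *
 *     FreeBulkInsertState(bistate);
 *     table_finish_bulk_insert(rel, options);
 */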
555
556
557 /* ------------------------------------------------------------------------
558  * DDL related callbacks for heap AM.
559  * ------------------------------------------------------------------------
560  */
561
562 static void
563 heapam_relation_set_new_filenode(Relation rel, char persistence,
564                                                                  TransactionId *freezeXid,
565                                                                  MultiXactId *minmulti)
566 {
567         /*
568          * Initialize to the minimum XID that could put tuples in the table. We
569          * know that no xacts older than RecentXmin are still running, so that
570          * will do.
571          */
572         *freezeXid = RecentXmin;
573
574         /*
575          * Similarly, initialize the minimum Multixact to the first value that
576          * could possibly be stored in tuples in the table.  Running transactions
577          * could reuse values from their local cache, so we are careful to
578          * consider all currently running multis.
579          *
580          * XXX this could be refined further, but is it worth the hassle?
581          */
582         *minmulti = GetOldestMultiXactId();
583
584         RelationCreateStorage(rel->rd_node, persistence);
585
586         /*
587          * If required, set up an init fork for an unlogged table so that it can
588          * be correctly reinitialized on restart.  An immediate sync is required
589          * even if the page has been logged, because the write did not go through
590          * shared_buffers and therefore a concurrent checkpoint may have moved the
591          * redo pointer past our xlog record.  Recovery may as well remove it
592          * while replaying, for example, XLOG_DBASE_CREATE or XLOG_TBLSPC_CREATE
593          * record. Therefore, logging is necessary even if wal_level=minimal.
594          */
595         if (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED)
596         {
597                 Assert(rel->rd_rel->relkind == RELKIND_RELATION ||
598                            rel->rd_rel->relkind == RELKIND_MATVIEW ||
599                            rel->rd_rel->relkind == RELKIND_TOASTVALUE);
600                 RelationOpenSmgr(rel);
601                 smgrcreate(rel->rd_smgr, INIT_FORKNUM, false);
602                 log_smgrcreate(&rel->rd_smgr->smgr_rnode.node, INIT_FORKNUM);
603                 smgrimmedsync(rel->rd_smgr, INIT_FORKNUM);
604         }
605 }
606
607 static void
608 heapam_relation_nontransactional_truncate(Relation rel)
609 {
610         RelationTruncate(rel, 0);
611 }
612
613 static void
614 heapam_relation_copy_data(Relation rel, RelFileNode newrnode)
615 {
616         SMgrRelation dstrel;
617
618         dstrel = smgropen(newrnode, rel->rd_backend);
619         RelationOpenSmgr(rel);
620
621         /*
622          * Create and copy all forks of the relation, and schedule unlinking of
623          * old physical files.
624          *
625          * NOTE: any conflict in relfilenode value will be caught in
626          * RelationCreateStorage().
627          */
628         RelationCreateStorage(newrnode, rel->rd_rel->relpersistence);
629
630         /* copy main fork */
631         RelationCopyStorage(rel->rd_smgr, dstrel, MAIN_FORKNUM,
632                                                 rel->rd_rel->relpersistence);
633
634         /* copy those extra forks that exist */
635         for (ForkNumber forkNum = MAIN_FORKNUM + 1;
636                  forkNum <= MAX_FORKNUM; forkNum++)
637         {
638                 if (smgrexists(rel->rd_smgr, forkNum))
639                 {
640                         smgrcreate(dstrel, forkNum, false);
641
642                         /*
643                          * WAL log creation if the relation is persistent, or this is the
644                          * init fork of an unlogged relation.
645                          */
646                         if (rel->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT ||
647                                 (rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED &&
648                                  forkNum == INIT_FORKNUM))
649                                 log_smgrcreate(&newrnode, forkNum);
650                         RelationCopyStorage(rel->rd_smgr, dstrel, forkNum,
651                                                                 rel->rd_rel->relpersistence);
652                 }
653         }
654
655
656         /* drop old relation, and close new one */
657         RelationDropStorage(rel);
658         smgrclose(dstrel);
659 }
660
661 static void
662 heapam_relation_copy_for_cluster(Relation OldHeap, Relation NewHeap,
663                                                                  Relation OldIndex, bool use_sort,
664                                                                  TransactionId OldestXmin,
665                                                                  TransactionId FreezeXid,
666                                                                  MultiXactId MultiXactCutoff,
667                                                                  double *num_tuples,
668                                                                  double *tups_vacuumed,
669                                                                  double *tups_recently_dead)
670 {
671         RewriteState rwstate;
672         IndexScanDesc indexScan;
673         TableScanDesc tableScan;
674         HeapScanDesc heapScan;
675         bool            use_wal;
676         bool            is_system_catalog;
677         Tuplesortstate *tuplesort;
678         TupleDesc       oldTupDesc = RelationGetDescr(OldHeap);
679         TupleDesc       newTupDesc = RelationGetDescr(NewHeap);
680         TupleTableSlot *slot;
681         int                     natts;
682         Datum      *values;
683         bool       *isnull;
684         BufferHeapTupleTableSlot *hslot;
685
686         /* Remember if it's a system catalog */
687         is_system_catalog = IsSystemRelation(OldHeap);
688
689         /*
690          * We need to log the copied data in WAL iff WAL archiving/streaming is
691          * enabled AND it's a WAL-logged rel.
692          */
693         use_wal = XLogIsNeeded() && RelationNeedsWAL(NewHeap);
694
695         /* use_wal off requires smgr_targblock be initially invalid */
696         Assert(RelationGetTargetBlock(NewHeap) == InvalidBlockNumber);
697
698         /* Preallocate values/isnull arrays */
699         natts = newTupDesc->natts;
700         values = (Datum *) palloc(natts * sizeof(Datum));
701         isnull = (bool *) palloc(natts * sizeof(bool));
702
703         /* Initialize the rewrite operation */
704         rwstate = begin_heap_rewrite(OldHeap, NewHeap, OldestXmin, FreezeXid,
705                                                                  MultiXactCutoff, use_wal);
706
707
708         /* Set up sorting if wanted */
709         if (use_sort)
710                 tuplesort = tuplesort_begin_cluster(oldTupDesc, OldIndex,
711                                                                                         maintenance_work_mem,
712                                                                                         NULL, false);
713         else
714                 tuplesort = NULL;
715
716         /*
717          * Prepare to scan the OldHeap.  To ensure we see recently-dead tuples
718          * that still need to be copied, we scan with SnapshotAny and use
719          * HeapTupleSatisfiesVacuum for the visibility test.
720          */
721         if (OldIndex != NULL && !use_sort)
722         {
723                 const int       ci_index[] = {
724                         PROGRESS_CLUSTER_PHASE,
725                         PROGRESS_CLUSTER_INDEX_RELID
726                 };
727                 int64           ci_val[2];
728
729                 /* Set progress phase and the OID of the index being scanned */
730                 ci_val[0] = PROGRESS_CLUSTER_PHASE_INDEX_SCAN_HEAP;
731                 ci_val[1] = RelationGetRelid(OldIndex);
732                 pgstat_progress_update_multi_param(2, ci_index, ci_val);
733
734                 tableScan = NULL;
735                 heapScan = NULL;
736                 indexScan = index_beginscan(OldHeap, OldIndex, SnapshotAny, 0, 0);
737                 index_rescan(indexScan, NULL, 0, NULL, 0);
738         }
739         else
740         {
741                 /* In scan-and-sort mode and also VACUUM FULL, set phase */
742                 pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
743                                                                          PROGRESS_CLUSTER_PHASE_SEQ_SCAN_HEAP);
744
745                 tableScan = table_beginscan(OldHeap, SnapshotAny, 0, (ScanKey) NULL);
746                 heapScan = (HeapScanDesc) tableScan;
747                 indexScan = NULL;
748
749                 /* Set total heap blocks */
750                 pgstat_progress_update_param(PROGRESS_CLUSTER_TOTAL_HEAP_BLKS,
751                                                                          heapScan->rs_nblocks);
752         }
753
754         slot = table_slot_create(OldHeap, NULL);
755         hslot = (BufferHeapTupleTableSlot *) slot;
756
757         /*
758          * Scan through the OldHeap, either in OldIndex order or sequentially;
759          * copy each tuple into the NewHeap, or transiently to the tuplesort
760          * module.  Note that we don't bother sorting dead tuples (they won't get
761          * to the new table anyway).
762          */
763         for (;;)
764         {
765                 HeapTuple       tuple;
766                 Buffer          buf;
767                 bool            isdead;
768
769                 CHECK_FOR_INTERRUPTS();
770
771                 if (indexScan != NULL)
772                 {
773                         if (!index_getnext_slot(indexScan, ForwardScanDirection, slot))
774                                 break;
775
776                         /* Since we used no scan keys, should never need to recheck */
777                         if (indexScan->xs_recheck)
778                                 elog(ERROR, "CLUSTER does not support lossy index conditions");
779                 }
780                 else
781                 {
782                         if (!table_scan_getnextslot(tableScan, ForwardScanDirection, slot))
783                                 break;
784
785                         /*
786                          * In scan-and-sort mode and also VACUUM FULL, set heap blocks
787                          * scanned
788                          */
789                         pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_BLKS_SCANNED,
790                                                                                  heapScan->rs_cblock + 1);
791                 }
792
793                 tuple = ExecFetchSlotHeapTuple(slot, false, NULL);
794                 buf = hslot->buffer;
795
796                 LockBuffer(buf, BUFFER_LOCK_SHARE);
797
798                 switch (HeapTupleSatisfiesVacuum(tuple, OldestXmin, buf))
799                 {
800                         case HEAPTUPLE_DEAD:
801                                 /* Definitely dead */
802                                 isdead = true;
803                                 break;
804                         case HEAPTUPLE_RECENTLY_DEAD:
805                                 *tups_recently_dead += 1;
806                                 /* fall through */
807                         case HEAPTUPLE_LIVE:
808                                 /* Live or recently dead, must copy it */
809                                 isdead = false;
810                                 break;
811                         case HEAPTUPLE_INSERT_IN_PROGRESS:
812
813                                 /*
814                                  * Since we hold exclusive lock on the relation, normally the
815                                  * only way to see this is if it was inserted earlier in our
816                                  * own transaction.  However, it can happen in system
817                                  * catalogs, since we tend to release write lock before commit
818                                  * there.  Give a warning if neither case applies; but in any
819                                  * case we had better copy it.
820                                  */
821                                 if (!is_system_catalog &&
822                                         !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(tuple->t_data)))
823                                         elog(WARNING, "concurrent insert in progress within table \"%s\"",
824                                                  RelationGetRelationName(OldHeap));
825                                 /* treat as live */
826                                 isdead = false;
827                                 break;
828                         case HEAPTUPLE_DELETE_IN_PROGRESS:
829
830                                 /*
831                                  * Similar situation to INSERT_IN_PROGRESS case.
832                                  */
833                                 if (!is_system_catalog &&
834                                         !TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(tuple->t_data)))
835                                         elog(WARNING, "concurrent delete in progress within table \"%s\"",
836                                                  RelationGetRelationName(OldHeap));
837                                 /* treat as recently dead */
838                                 *tups_recently_dead += 1;
839                                 isdead = false;
840                                 break;
841                         default:
842                                 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
843                                 isdead = false; /* keep compiler quiet */
844                                 break;
845                 }
846
847                 LockBuffer(buf, BUFFER_LOCK_UNLOCK);
848
849                 if (isdead)
850                 {
851                         *tups_vacuumed += 1;
852                         /* heap rewrite module still needs to see it... */
853                         if (rewrite_heap_dead_tuple(rwstate, tuple))
854                         {
855                                 /* A previous recently-dead tuple is now known dead */
856                                 *tups_vacuumed += 1;
857                                 *tups_recently_dead -= 1;
858                         }
859                         continue;
860                 }
861
862                 *num_tuples += 1;
863                 if (tuplesort != NULL)
864                 {
865                         tuplesort_putheaptuple(tuplesort, tuple);
866
867                         /*
868                          * In scan-and-sort mode, report increase in number of tuples
869                          * scanned
870                          */
871                         pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED,
872                                                                                  *num_tuples);
873                 }
874                 else
875                 {
876                         const int       ct_index[] = {
877                                 PROGRESS_CLUSTER_HEAP_TUPLES_SCANNED,
878                                 PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN
879                         };
880                         int64           ct_val[2];
881
882                         reform_and_rewrite_tuple(tuple, OldHeap, NewHeap,
883                                                                          values, isnull, rwstate);
884
885                         /*
886                          * In indexscan mode and also VACUUM FULL, report increase in
887                          * number of tuples scanned and written
888                          */
889                         ct_val[0] = *num_tuples;
890                         ct_val[1] = *num_tuples;
891                         pgstat_progress_update_multi_param(2, ct_index, ct_val);
892                 }
893         }
894
895         if (indexScan != NULL)
896                 index_endscan(indexScan);
897         if (tableScan != NULL)
898                 table_endscan(tableScan);
899         if (slot)
900                 ExecDropSingleTupleTableSlot(slot);
901
902         /*
903          * In scan-and-sort mode, complete the sort, then read out all live tuples
904          * from the tuplesort and write them to the new relation.
905          */
906         if (tuplesort != NULL)
907         {
908                 double          n_tuples = 0;
909
910                 /* Report that we are now sorting tuples */
911                 pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
912                                                                          PROGRESS_CLUSTER_PHASE_SORT_TUPLES);
913
914                 tuplesort_performsort(tuplesort);
915
916                 /* Report that we are now writing new heap */
917                 pgstat_progress_update_param(PROGRESS_CLUSTER_PHASE,
918                                                                          PROGRESS_CLUSTER_PHASE_WRITE_NEW_HEAP);
919
920                 for (;;)
921                 {
922                         HeapTuple       tuple;
923
924                         CHECK_FOR_INTERRUPTS();
925
926                         tuple = tuplesort_getheaptuple(tuplesort, true);
927                         if (tuple == NULL)
928                                 break;
929
930                         n_tuples += 1;
931                         reform_and_rewrite_tuple(tuple,
932                                                                          OldHeap, NewHeap,
933                                                                          values, isnull,
934                                                                          rwstate);
935                         /* Report n_tuples */
936                         pgstat_progress_update_param(PROGRESS_CLUSTER_HEAP_TUPLES_WRITTEN,
937                                                                                  n_tuples);
938                 }
939
940                 tuplesort_end(tuplesort);
941         }
942
943         /* Write out any remaining tuples, and fsync if needed */
944         end_heap_rewrite(rwstate);
945
946         /* Clean up */
947         pfree(values);
948         pfree(isnull);
949 }
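
/*
 * Usage sketch: CLUSTER and VACUUM FULL reach the function above through
 * table_relation_copy_for_cluster() from cluster.c, roughly:
 *
 *     table_relation_copy_for_cluster(OldHeap, NewHeap, OldIndex, use_sort,
 *                                     OldestXmin, FreezeXid, MultiXactCutoff,
 *                                     &num_tuples, &tups_vacuumed,
 *                                     &tups_recently_dead);
 *
 * use_sort is decided by plan_cluster_use_sort(), and the three output
 * counters feed the progress and statistics reporting done by the caller.
 */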
950
951 static bool
952 heapam_scan_analyze_next_block(TableScanDesc scan, BlockNumber blockno,
953                                                            BufferAccessStrategy bstrategy)
954 {
955         HeapScanDesc hscan = (HeapScanDesc) scan;
956
957         /*
958          * We must maintain a pin on the target page's buffer to ensure that
959          * concurrent activity - e.g. HOT pruning - doesn't delete tuples out from
960          * under us.  Hence, pin the page until we are done looking at it.  We
961          * also choose to hold sharelock on the buffer throughout --- we could
962          * release and re-acquire sharelock for each tuple, but since we aren't
963          * doing much work per tuple, the extra lock traffic is probably better
964          * avoided.
965          */
966         hscan->rs_cblock = blockno;
967         hscan->rs_cindex = FirstOffsetNumber;
968         hscan->rs_cbuf = ReadBufferExtended(scan->rs_rd, MAIN_FORKNUM,
969                                                                                 blockno, RBM_NORMAL, bstrategy);
970         LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
971
972         /* in heap all blocks can contain tuples, so always return true */
973         return true;
974 }
975
976 static bool
977 heapam_scan_analyze_next_tuple(TableScanDesc scan, TransactionId OldestXmin,
978                                                            double *liverows, double *deadrows,
979                                                            TupleTableSlot *slot)
980 {
981         HeapScanDesc hscan = (HeapScanDesc) scan;
982         Page            targpage;
983         OffsetNumber maxoffset;
984         BufferHeapTupleTableSlot *hslot;
985
986         Assert(TTS_IS_BUFFERTUPLE(slot));
987
988         hslot = (BufferHeapTupleTableSlot *) slot;
989         targpage = BufferGetPage(hscan->rs_cbuf);
990         maxoffset = PageGetMaxOffsetNumber(targpage);
991
992         /* Inner loop over all tuples on the selected page */
993         for (; hscan->rs_cindex <= maxoffset; hscan->rs_cindex++)
994         {
995                 ItemId          itemid;
996                 HeapTuple       targtuple = &hslot->base.tupdata;
997                 bool            sample_it = false;
998
999                 itemid = PageGetItemId(targpage, hscan->rs_cindex);
1000
1001                 /*
1002                  * We ignore unused and redirect line pointers.  DEAD line pointers
1003                  * should be counted as dead, because we need vacuum to run to get rid
1004                  * of them.  Note that this rule agrees with the way that
1005                  * heap_page_prune() counts things.
1006                  */
1007                 if (!ItemIdIsNormal(itemid))
1008                 {
1009                         if (ItemIdIsDead(itemid))
1010                                 *deadrows += 1;
1011                         continue;
1012                 }
1013
1014                 ItemPointerSet(&targtuple->t_self, hscan->rs_cblock, hscan->rs_cindex);
1015
1016                 targtuple->t_tableOid = RelationGetRelid(scan->rs_rd);
1017                 targtuple->t_data = (HeapTupleHeader) PageGetItem(targpage, itemid);
1018                 targtuple->t_len = ItemIdGetLength(itemid);
1019
1020                 switch (HeapTupleSatisfiesVacuum(targtuple, OldestXmin,
1021                                                                                  hscan->rs_cbuf))
1022                 {
1023                         case HEAPTUPLE_LIVE:
1024                                 sample_it = true;
1025                                 *liverows += 1;
1026                                 break;
1027
1028                         case HEAPTUPLE_DEAD:
1029                         case HEAPTUPLE_RECENTLY_DEAD:
1030                                 /* Count dead and recently-dead rows */
1031                                 *deadrows += 1;
1032                                 break;
1033
1034                         case HEAPTUPLE_INSERT_IN_PROGRESS:
1035
1036                                 /*
1037                                  * Insert-in-progress rows are not counted.  We assume that
1038                                  * when the inserting transaction commits or aborts, it will
1039                                  * send a stats message to increment the proper count.  This
1040                                  * works right only if that transaction ends after we finish
1041                                  * analyzing the table; if things happen in the other order,
1042                                  * its stats update will be overwritten by ours.  However, the
1043                                  * error will be large only if the other transaction runs long
1044                                  * enough to insert many tuples, so assuming it will finish
1045                                  * after us is the safer option.
1046                                  *
1047                                  * A special case is that the inserting transaction might be
1048                                  * our own.  In this case we should count and sample the row,
1049                                  * to accommodate users who load a table and analyze it in one
1050                                  * transaction.  (pgstat_report_analyze has to adjust the
1051                                  * numbers we send to the stats collector to make this come
1052                                  * out right.)
1053                                  */
1054                                 if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmin(targtuple->t_data)))
1055                                 {
1056                                         sample_it = true;
1057                                         *liverows += 1;
1058                                 }
1059                                 break;
1060
1061                         case HEAPTUPLE_DELETE_IN_PROGRESS:
1062
1063                                 /*
1064                                  * We count and sample delete-in-progress rows the same as
1065                                  * live ones, so that the stats counters come out right if the
1066                                  * deleting transaction commits after us, per the same
1067                                  * reasoning given above.
1068                                  *
1069                                  * If the delete was done by our own transaction, however, we
1070                                  * must count the row as dead to make pgstat_report_analyze's
1071                                  * stats adjustments come out right.  (Note: this works out
1072                                  * properly when the row was both inserted and deleted in our
1073                                  * xact.)
1074                                  *
1075                                  * The net effect of these choices is that we act as though an
1076                                  * IN_PROGRESS transaction hasn't happened yet, except if it
1077                                  * is our own transaction, which we assume has happened.
1078                                  *
1079                                  * This approach ensures that we behave sanely if we see both
1080                                  * the pre-image and post-image rows for a row being updated
1081                                  * by a concurrent transaction: we will sample the pre-image
1082                                  * but not the post-image.  We also get sane results if the
1083                                  * concurrent transaction never commits.
1084                                  */
1085                                 if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetUpdateXid(targtuple->t_data)))
1086                                         *deadrows += 1;
1087                                 else
1088                                 {
1089                                         sample_it = true;
1090                                         *liverows += 1;
1091                                 }
1092                                 break;
1093
1094                         default:
1095                                 elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1096                                 break;
1097                 }
1098
1099                 if (sample_it)
1100                 {
1101                         ExecStoreBufferHeapTuple(targtuple, slot, hscan->rs_cbuf);
1102                         hscan->rs_cindex++;
1103
1104                         /* note that we leave the buffer locked here! */
1105                         return true;
1106                 }
1107         }
1108
1109         /* Now release the lock and pin on the page */
1110         UnlockReleaseBuffer(hscan->rs_cbuf);
1111         hscan->rs_cbuf = InvalidBuffer;
1112
1113         /* also prevent old slot contents from having pin on page */
1114         ExecClearTuple(slot);
1115
1116         return false;
1117 }
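
/*
 * Usage sketch: ANALYZE's acquire_sample_rows() drives the two callbacks
 * above through the tableam wrappers, one sampled block at a time (block
 * selection and row bookkeeping elided):
 *
 *     TableScanDesc scan = table_beginscan_analyze(rel);
 *     TupleTableSlot *slot = table_slot_create(rel, NULL);
 *
 *     while (... more blocks to sample ...)
 *     {
 *         BlockNumber targblock = ...;
 *
 *         table_scan_analyze_next_block(scan, targblock, vac_strategy);
 *         while (table_scan_analyze_next_tuple(scan, OldestXmin,
 *                                              &liverows, &deadrows, slot))
 *             ... copy the sampled row out of slot ...
 *     }
 *
 *     ExecDropSingleTupleTableSlot(slot);
 *     table_endscan(scan);
 */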
1118
1119 static double
1120 heapam_index_build_range_scan(Relation heapRelation,
1121                                                           Relation indexRelation,
1122                                                           IndexInfo *indexInfo,
1123                                                           bool allow_sync,
1124                                                           bool anyvisible,
1125                                                           bool progress,
1126                                                           BlockNumber start_blockno,
1127                                                           BlockNumber numblocks,
1128                                                           IndexBuildCallback callback,
1129                                                           void *callback_state,
1130                                                           TableScanDesc scan)
1131 {
1132         HeapScanDesc hscan;
1133         bool            is_system_catalog;
1134         bool            checking_uniqueness;
1135         HeapTuple       heapTuple;
1136         Datum           values[INDEX_MAX_KEYS];
1137         bool            isnull[INDEX_MAX_KEYS];
1138         double          reltuples;
1139         ExprState  *predicate;
1140         TupleTableSlot *slot;
1141         EState     *estate;
1142         ExprContext *econtext;
1143         Snapshot        snapshot;
1144         bool            need_unregister_snapshot = false;
1145         TransactionId OldestXmin;
1146         BlockNumber     previous_blkno = InvalidBlockNumber;
1147         BlockNumber root_blkno = InvalidBlockNumber;
1148         OffsetNumber root_offsets[MaxHeapTuplesPerPage];
1149
1150         /*
1151          * sanity checks
1152          */
1153         Assert(OidIsValid(indexRelation->rd_rel->relam));
1154
1155         /* Remember if it's a system catalog */
1156         is_system_catalog = IsSystemRelation(heapRelation);
1157
1158         /* See whether we're verifying uniqueness/exclusion properties */
1159         checking_uniqueness = (indexInfo->ii_Unique ||
1160                                                    indexInfo->ii_ExclusionOps != NULL);
1161
1162         /*
1163          * "Any visible" mode is not compatible with uniqueness checks; make sure
1164          * only one of those is requested.
1165          */
1166         Assert(!(anyvisible && checking_uniqueness));
1167
1168         /*
1169          * Need an EState for evaluation of index expressions and partial-index
1170          * predicates.  Also a slot to hold the current tuple.
1171          */
1172         estate = CreateExecutorState();
1173         econtext = GetPerTupleExprContext(estate);
1174         slot = table_slot_create(heapRelation, NULL);
1175
1176         /* Arrange for econtext's scan tuple to be the tuple under test */
1177         econtext->ecxt_scantuple = slot;
1178
1179         /* Set up execution state for predicate, if any. */
1180         predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate);
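        /*
         * Note that ExecPrepareQual() returns NULL for an empty (NIL) predicate
         * list; the "predicate != NULL" tests below rely on that to skip
         * predicate evaluation for non-partial indexes.
         */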
1181
1182         /*
1183          * Prepare for scan of the base relation.  In a normal index build, we use
1184          * SnapshotAny because we must retrieve all tuples and do our own time
1185          * qual checks (because we have to index RECENTLY_DEAD tuples). In a
1186          * concurrent build, or during bootstrap, we take a regular MVCC snapshot
1187          * and index whatever's live according to that.
1188          */
1189         OldestXmin = InvalidTransactionId;
1190
1191         /* okay to ignore lazy VACUUMs here */
1192         if (!IsBootstrapProcessingMode() && !indexInfo->ii_Concurrent)
1193                 OldestXmin = GetOldestXmin(heapRelation, PROCARRAY_FLAGS_VACUUM);
1194
1195         if (!scan)
1196         {
1197                 /*
1198                  * Serial index build.
1199                  *
1200                  * Must begin our own heap scan in this case.  We may also need to
1201                  * register a snapshot whose lifetime is under our direct control.
1202                  */
1203                 if (!TransactionIdIsValid(OldestXmin))
1204                 {
1205                         snapshot = RegisterSnapshot(GetTransactionSnapshot());
1206                         need_unregister_snapshot = true;
1207                 }
1208                 else
1209                         snapshot = SnapshotAny;
1210
1211                 scan = table_beginscan_strat(heapRelation,      /* relation */
1212                                                                          snapshot,      /* snapshot */
1213                                                                          0, /* number of keys */
1214                                                                          NULL,  /* scan key */
1215                                                                          true,  /* buffer access strategy OK */
1216                                                                          allow_sync);   /* syncscan OK? */
1217         }
1218         else
1219         {
1220                 /*
1221                  * Parallel index build.
1222                  *
1223                  * Parallel case never registers/unregisters own snapshot.  Snapshot
1224                  * is taken from parallel heap scan, and is SnapshotAny or an MVCC
1225                  * snapshot, based on same criteria as serial case.
1226                  */
1227                 Assert(!IsBootstrapProcessingMode());
1228                 Assert(allow_sync);
1229                 snapshot = scan->rs_snapshot;
1230         }
1231
1232         hscan = (HeapScanDesc) scan;
1233
1234         /* Publish number of blocks to scan */
1235         if (progress)
1236         {
1237                 BlockNumber             nblocks;
1238
1239                 if (hscan->rs_base.rs_parallel != NULL)
1240                 {
1241                         ParallelBlockTableScanDesc pbscan;
1242
1243                         pbscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel;
1244                         nblocks = pbscan->phs_nblocks;
1245                 }
1246                 else
1247                         nblocks = hscan->rs_nblocks;
1248
1249                 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1250                                                                          nblocks);
1251         }
1252
1253         /*
1254          * Must have called GetOldestXmin() if using SnapshotAny, and must not
1255          * have done so for an MVCC snapshot.  (It's especially worth checking
1256          * this for parallel builds, since ambuild routines that support parallel
1257          * builds must work these details out for themselves.)
1258          */
1259         Assert(snapshot == SnapshotAny || IsMVCCSnapshot(snapshot));
1260         Assert(snapshot == SnapshotAny ? TransactionIdIsValid(OldestXmin) :
1261                    !TransactionIdIsValid(OldestXmin));
1262         Assert(snapshot == SnapshotAny || !anyvisible);
1263
1264         /* set our scan endpoints */
1265         if (!allow_sync)
1266                 heap_setscanlimits(scan, start_blockno, numblocks);
1267         else
1268         {
1269                 /* syncscan can only be requested on the whole relation */
1270                 Assert(start_blockno == 0);
1271                 Assert(numblocks == InvalidBlockNumber);
1272         }
1273
1274         reltuples = 0;
1275
1276         /*
1277          * Scan all tuples in the base relation.
1278          */
1279         while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1280         {
1281                 bool            tupleIsAlive;
1282
1283                 CHECK_FOR_INTERRUPTS();
1284
1285                 /* Report scan progress, if asked to. */
1286                 if (progress)
1287                 {
1288                         BlockNumber     blocks_done = heapam_scan_get_blocks_done(hscan);
1289
1290                         if (blocks_done != previous_blkno)
1291                         {
1292                                 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1293                                                                                          blocks_done);
1294                                 previous_blkno = blocks_done;
1295                         }
1296                 }
1297
1298                 /*
1299                  * When dealing with a HOT-chain of updated tuples, we want to index
1300                  * the values of the live tuple (if any), but index it under the TID
1301                  * of the chain's root tuple.  This approach is necessary to preserve
1302                  * the HOT-chain structure in the heap. So we need to be able to find
1303                  * the root item offset for every tuple that's in a HOT-chain.  When
1304                  * first reaching a new page of the relation, call
1305                  * heap_get_root_tuples() to build a map of root item offsets on the
1306                  * page.
1307                  *
1308                  * It might look unsafe to use this information across buffer
1309                  * lock/unlock.  However, we hold ShareLock on the table so no
1310                  * ordinary insert/update/delete should occur; and we hold pin on the
1311                  * buffer continuously while visiting the page, so no pruning
1312                  * operation can occur either.
1313                  *
1314                  * Also, although our opinions about tuple liveness could change while
1315                  * we scan the page (due to concurrent transaction commits/aborts),
1316                  * the chain root locations won't, so this info doesn't need to be
1317                  * rebuilt after waiting for another transaction.
1318                  *
1319                  * Note the implied assumption that there is no more than one live
1320                  * tuple per HOT-chain --- else we could create more than one index
1321                  * entry pointing to the same root tuple.
1322                  */
1323                 if (hscan->rs_cblock != root_blkno)
1324                 {
1325                         Page            page = BufferGetPage(hscan->rs_cbuf);
1326
1327                         LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1328                         heap_get_root_tuples(page, root_offsets);
1329                         LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1330
1331                         root_blkno = hscan->rs_cblock;
1332                 }
1333
1334                 if (snapshot == SnapshotAny)
1335                 {
1336                         /* do our own time qual check */
1337                         bool            indexIt;
1338                         TransactionId xwait;
1339
1340         recheck:
1341
1342                         /*
1343                          * We could possibly get away with not locking the buffer here,
1344                          * since caller should hold ShareLock on the relation, but let's
1345                          * be conservative about it.  (This remark is still correct even
1346                          * with HOT-pruning: our pin on the buffer prevents pruning.)
1347                          */
1348                         LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1349
1350                         /*
1351                          * The criteria for counting a tuple as live here need to match what
1352                          * heapam_scan_analyze_next_tuple() (the ANALYZE code path in this
1353                          * file) does; otherwise CREATE INDEX and ANALYZE may produce wildly
1354                          * different reltuples values, e.g. when there are many
1355                          * recently-dead tuples.
1356                          */
1357                         switch (HeapTupleSatisfiesVacuum(heapTuple, OldestXmin,
1358                                                                                          hscan->rs_cbuf))
1359                         {
1360                                 case HEAPTUPLE_DEAD:
1361                                         /* Definitely dead, we can ignore it */
1362                                         indexIt = false;
1363                                         tupleIsAlive = false;
1364                                         break;
1365                                 case HEAPTUPLE_LIVE:
1366                                         /* Normal case, index and unique-check it */
1367                                         indexIt = true;
1368                                         tupleIsAlive = true;
1369                                         /* Count it as live, too */
1370                                         reltuples += 1;
1371                                         break;
1372                                 case HEAPTUPLE_RECENTLY_DEAD:
1373
1374                                         /*
1375                                          * If tuple is recently deleted then we must index it
1376                                          * anyway to preserve MVCC semantics.  (Pre-existing
1377                                          * transactions could try to use the index after we finish
1378                                          * building it, and may need to see such tuples.)
1379                                          *
1380                                          * However, if it was HOT-updated then we must only index
1381                                          * the live tuple at the end of the HOT-chain.  Since this
1382                                          * breaks semantics for pre-existing snapshots, mark the
1383                                          * index as unusable for them.
1384                                          *
1385                                          * We don't count recently-dead tuples in reltuples, even
1386                                          * if we index them; see heapam_scan_analyze_next_tuple().
1387                                          */
1388                                         if (HeapTupleIsHotUpdated(heapTuple))
1389                                         {
1390                                                 indexIt = false;
1391                                                 /* mark the index as unsafe for old snapshots */
1392                                                 indexInfo->ii_BrokenHotChain = true;
1393                                         }
1394                                         else
1395                                                 indexIt = true;
1396                                         /* In any case, exclude the tuple from unique-checking */
1397                                         tupleIsAlive = false;
1398                                         break;
1399                                 case HEAPTUPLE_INSERT_IN_PROGRESS:
1400
1401                                         /*
1402                                          * In "anyvisible" mode, this tuple is visible and we
1403                                          * don't need any further checks.
1404                                          */
1405                                         if (anyvisible)
1406                                         {
1407                                                 indexIt = true;
1408                                                 tupleIsAlive = true;
1409                                                 reltuples += 1;
1410                                                 break;
1411                                         }
1412
1413                                         /*
1414                                          * Since caller should hold ShareLock or better, normally
1415                                          * the only way to see this is if it was inserted earlier
1416                                          * in our own transaction.  However, it can happen in
1417                                          * system catalogs, since we tend to release write lock
1418                                          * before commit there.  Give a warning if neither case
1419                                          * applies.
1420                                          */
1421                                         xwait = HeapTupleHeaderGetXmin(heapTuple->t_data);
1422                                         if (!TransactionIdIsCurrentTransactionId(xwait))
1423                                         {
1424                                                 if (!is_system_catalog)
1425                                                         elog(WARNING, "concurrent insert in progress within table \"%s\"",
1426                                                                  RelationGetRelationName(heapRelation));
1427
1428                                                 /*
1429                                                  * If we are performing uniqueness checks, indexing
1430                                                  * such a tuple could lead to a bogus uniqueness
1431                                                  * failure.  In that case we wait for the inserting
1432                                                  * transaction to finish and check again.
1433                                                  */
1434                                                 if (checking_uniqueness)
1435                                                 {
1436                                                         /*
1437                                                          * Must drop the lock on the buffer before we wait
1438                                                          */
1439                                                         LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1440                                                         XactLockTableWait(xwait, heapRelation,
1441                                                                                           &heapTuple->t_self,
1442                                                                                           XLTW_InsertIndexUnique);
1443                                                         CHECK_FOR_INTERRUPTS();
1444                                                         goto recheck;
1445                                                 }
1446                                         }
1447                                         else
1448                                         {
1449                                                 /*
1450                                                  * For consistency with
1451                                                  * heapam_scan_analyze_next_tuple(), count
1452                                                  * HEAPTUPLE_INSERT_IN_PROGRESS tuples as live only
1453                                                  * when inserted by our own transaction.
1454                                                  */
1455                                                 reltuples += 1;
1456                                         }
1457
1458                                         /*
1459                                          * We must index such tuples, since if the index build
1460                                          * commits then they're good.
1461                                          */
1462                                         indexIt = true;
1463                                         tupleIsAlive = true;
1464                                         break;
1465                                 case HEAPTUPLE_DELETE_IN_PROGRESS:
1466
1467                                         /*
1468                                          * As with INSERT_IN_PROGRESS case, this is unexpected
1469                                          * unless it's our own deletion or a system catalog; but
1470                                          * in anyvisible mode, this tuple is visible.
1471                                          */
1472                                         if (anyvisible)
1473                                         {
1474                                                 indexIt = true;
1475                                                 tupleIsAlive = false;
1476                                                 reltuples += 1;
1477                                                 break;
1478                                         }
1479
1480                                         xwait = HeapTupleHeaderGetUpdateXid(heapTuple->t_data);
1481                                         if (!TransactionIdIsCurrentTransactionId(xwait))
1482                                         {
1483                                                 if (!is_system_catalog)
1484                                                         elog(WARNING, "concurrent delete in progress within table \"%s\"",
1485                                                                  RelationGetRelationName(heapRelation));
1486
1487                                                 /*
1488                                                  * If we are performing uniqueness checks, assuming
1489                                                  * the tuple is dead could lead to missing a
1490                                                  * uniqueness violation.  In that case we wait for the
1491                                                  * deleting transaction to finish and check again.
1492                                                  *
1493                                                  * Also, if it's a HOT-updated tuple, we should not
1494                                                  * index it but rather the live tuple at the end of
1495                                                  * the HOT-chain.  However, the deleting transaction
1496                                                  * could abort, possibly leaving this tuple as live
1497                                                  * after all, in which case it has to be indexed. The
1498                                                  * only way to know what to do is to wait for the
1499                                                  * deleting transaction to finish and check again.
1500                                                  */
1501                                                 if (checking_uniqueness ||
1502                                                         HeapTupleIsHotUpdated(heapTuple))
1503                                                 {
1504                                                         /*
1505                                                          * Must drop the lock on the buffer before we wait
1506                                                          */
1507                                                         LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1508                                                         XactLockTableWait(xwait, heapRelation,
1509                                                                                           &heapTuple->t_self,
1510                                                                                           XLTW_InsertIndexUnique);
1511                                                         CHECK_FOR_INTERRUPTS();
1512                                                         goto recheck;
1513                                                 }
1514
1515                                                 /*
1516                                                  * Otherwise index it but don't check for uniqueness,
1517                                                  * the same as a RECENTLY_DEAD tuple.
1518                                                  */
1519                                                 indexIt = true;
1520
1521                                                 /*
1522                                                  * Count HEAPTUPLE_DELETE_IN_PROGRESS tuples as live,
1523                                                  * if they were not deleted by the current
1524                                                  * transaction.  That's what
1525                                                  * heapam_scan_analyze_next_tuple() does, and we want
1526                                                  * the behavior to be consistent.
1527                                                  */
1528                                                 reltuples += 1;
1529                                         }
1530                                         else if (HeapTupleIsHotUpdated(heapTuple))
1531                                         {
1532                                                 /*
1533                                                  * It's a HOT-updated tuple deleted by our own xact.
1534                                                  * We can assume the deletion will commit (else the
1535                                                  * index contents don't matter), so treat the same as
1536                                                  * RECENTLY_DEAD HOT-updated tuples.
1537                                                  */
1538                                                 indexIt = false;
1539                                                 /* mark the index as unsafe for old snapshots */
1540                                                 indexInfo->ii_BrokenHotChain = true;
1541                                         }
1542                                         else
1543                                         {
1544                                                 /*
1545                                                  * It's a regular tuple deleted by our own xact. Index
1546                                                  * it, but don't check for uniqueness nor count in
1547                                                  * reltuples, the same as a RECENTLY_DEAD tuple.
1548                                                  */
1549                                                 indexIt = true;
1550                                         }
1551                                         /* In any case, exclude the tuple from unique-checking */
1552                                         tupleIsAlive = false;
1553                                         break;
1554                                 default:
1555                                         elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result");
1556                                         indexIt = tupleIsAlive = false; /* keep compiler quiet */
1557                                         break;
1558                         }
1559
1560                         LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1561
1562                         if (!indexIt)
1563                                 continue;
1564                 }
1565                 else
1566                 {
1567                         /* heap_getnext did the time qual check */
1568                         tupleIsAlive = true;
1569                         reltuples += 1;
1570                 }
1571
1572                 MemoryContextReset(econtext->ecxt_per_tuple_memory);
1573
1574                 /* Set up for predicate or expression evaluation */
1575                 ExecStoreBufferHeapTuple(heapTuple, slot, hscan->rs_cbuf);
1576
1577                 /*
1578                  * In a partial index, discard tuples that don't satisfy the
1579                  * predicate.
1580                  */
1581                 if (predicate != NULL)
1582                 {
1583                         if (!ExecQual(predicate, econtext))
1584                                 continue;
1585                 }
1586
1587                 /*
1588                  * For the current heap tuple, extract all the attributes we use in
1589                  * this index, and note which are null.  This also performs evaluation
1590                  * of any expressions needed.
1591                  */
1592                 FormIndexDatum(indexInfo,
1593                                            slot,
1594                                            estate,
1595                                            values,
1596                                            isnull);
1597
1598                 /*
1599                  * You'd think we should go ahead and build the index tuple here, but
1600                  * some index AMs want to do further processing on the data first.  So
1601                  * pass the values[] and isnull[] arrays, instead.
1602                  */
1603
1604                 if (HeapTupleIsHeapOnly(heapTuple))
1605                 {
1606                         /*
1607                          * For a heap-only tuple, pretend its TID is that of the root. See
1608                          * src/backend/access/heap/README.HOT for discussion.
1609                          */
1610                         HeapTupleData rootTuple;
1611                         OffsetNumber offnum;
1612
1613                         rootTuple = *heapTuple;
1614                         offnum = ItemPointerGetOffsetNumber(&heapTuple->t_self);
1615
1616                         if (!OffsetNumberIsValid(root_offsets[offnum - 1]))
1617                                 ereport(ERROR,
1618                                                 (errcode(ERRCODE_DATA_CORRUPTED),
1619                                                  errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"",
1620                                                                                  ItemPointerGetBlockNumber(&heapTuple->t_self),
1621                                                                                  offnum,
1622                                                                                  RelationGetRelationName(heapRelation))));
1623
1624                         ItemPointerSetOffsetNumber(&rootTuple.t_self,
1625                                                                            root_offsets[offnum - 1]);
1626
1627                         /* Call the AM's callback routine to process the tuple */
1628                         callback(indexRelation, &rootTuple, values, isnull, tupleIsAlive,
1629                                          callback_state);
1630                 }
1631                 else
1632                 {
1633                         /* Call the AM's callback routine to process the tuple */
1634                         callback(indexRelation, heapTuple, values, isnull, tupleIsAlive,
1635                                          callback_state);
1636                 }
1637         }
1638
1639         /* Report scan progress one last time. */
1640         if (progress)
1641         {
1642                 BlockNumber             blks_done;
1643
1644                 if (hscan->rs_base.rs_parallel != NULL)
1645                 {
1646                         ParallelBlockTableScanDesc pbscan;
1647
1648                         pbscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel;
1649                         blks_done = pbscan->phs_nblocks;
1650                 }
1651                 else
1652                         blks_done = hscan->rs_nblocks;
1653
1654                 pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1655                                                                          blks_done);
1656         }
1657
1658         table_endscan(scan);
1659
1660         /* we can now forget our snapshot, if set and registered by us */
1661         if (need_unregister_snapshot)
1662                 UnregisterSnapshot(snapshot);
1663
1664         ExecDropSingleTupleTableSlot(slot);
1665
1666         FreeExecutorState(estate);
1667
1668         /* These may have been pointing to the now-gone estate */
1669         indexInfo->ii_ExpressionsState = NIL;
1670         indexInfo->ii_PredicateState = NULL;
1671
1672         return reltuples;
1673 }
1674
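/*
 * Second heap pass of a concurrent index build (driven by validate_index()):
 * scan the heap with the caller's reference snapshot, merge against the
 * sorted index TIDs collected in state->tuplesort, and insert index entries
 * for any heap tuples the index is missing.
 */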
1675 static void
1676 heapam_index_validate_scan(Relation heapRelation,
1677                                                    Relation indexRelation,
1678                                                    IndexInfo *indexInfo,
1679                                                    Snapshot snapshot,
1680                                                    ValidateIndexState *state)
1681 {
1682         TableScanDesc scan;
1683         HeapScanDesc hscan;
1684         HeapTuple       heapTuple;
1685         Datum           values[INDEX_MAX_KEYS];
1686         bool            isnull[INDEX_MAX_KEYS];
1687         ExprState  *predicate;
1688         TupleTableSlot *slot;
1689         EState     *estate;
1690         ExprContext *econtext;
1691         BlockNumber root_blkno = InvalidBlockNumber;
1692         OffsetNumber root_offsets[MaxHeapTuplesPerPage];
1693         bool            in_index[MaxHeapTuplesPerPage];
1694         BlockNumber     previous_blkno = InvalidBlockNumber;
1695
1696         /* state variables for the merge */
1697         ItemPointer indexcursor = NULL;
1698         ItemPointerData decoded;
1699         bool            tuplesort_empty = false;
1700
1701         /*
1702          * sanity checks
1703          */
1704         Assert(OidIsValid(indexRelation->rd_rel->relam));
1705
1706         /*
1707          * Need an EState for evaluation of index expressions and partial-index
1708          * predicates.  Also a slot to hold the current tuple.
1709          */
1710         estate = CreateExecutorState();
1711         econtext = GetPerTupleExprContext(estate);
1712         slot = MakeSingleTupleTableSlot(RelationGetDescr(heapRelation),
1713                                                                         &TTSOpsHeapTuple);
1714
1715         /* Arrange for econtext's scan tuple to be the tuple under test */
1716         econtext->ecxt_scantuple = slot;
1717
1718         /* Set up execution state for predicate, if any. */
1719         predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate);
1720
1721         /*
1722          * Prepare for scan of the base relation.  We need just those tuples
1723          * satisfying the passed-in reference snapshot.  We must disable syncscan
1724          * here, because it's critical that we read from block zero forward to
1725          * match the sorted TIDs.
1726          */
1727         scan = table_beginscan_strat(heapRelation,      /* relation */
1728                                                                  snapshot,      /* snapshot */
1729                                                                  0, /* number of keys */
1730                                                                  NULL,  /* scan key */
1731                                                                  true,  /* buffer access strategy OK */
1732                                                                  false);        /* syncscan not OK */
1733         hscan = (HeapScanDesc) scan;
1734
1735         pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_TOTAL,
1736                                                                  hscan->rs_nblocks);
1737
1738         /*
1739          * Scan all tuples matching the snapshot.
1740          */
1741         while ((heapTuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
1742         {
1743                 ItemPointer heapcursor = &heapTuple->t_self;
1744                 ItemPointerData rootTuple;
1745                 OffsetNumber root_offnum;
1746
1747                 CHECK_FOR_INTERRUPTS();
1748
1749                 state->htups += 1;
1750
1751                 if ((previous_blkno == InvalidBlockNumber) ||
1752                         (hscan->rs_cblock != previous_blkno))
1753                 {
1754                         pgstat_progress_update_param(PROGRESS_SCAN_BLOCKS_DONE,
1755                                                                                  hscan->rs_cblock);
1756                         previous_blkno = hscan->rs_cblock;
1757                 }
1758
1759                 /*
1760                  * As commented in table_index_build_scan, we should index heap-only
1761                  * tuples under the TIDs of their root tuples; so when we advance onto
1762                  * a new heap page, build a map of root item offsets on the page.
1763                  *
1764                  * This complicates merging against the tuplesort output: we will
1765                  * visit the live tuples in order by their offsets, but the root
1766                  * offsets that we need to compare against the index contents might be
1767                  * ordered differently.  So we might have to "look back" within the
1768                  * tuplesort output, but only within the current page.  We handle that
1769                  * by keeping a bool array in_index[] showing all the
1770                  * already-passed-over tuplesort output TIDs of the current page. We
1771                  * clear that array here, when advancing onto a new heap page.
1772                  */
1773                 if (hscan->rs_cblock != root_blkno)
1774                 {
1775                         Page            page = BufferGetPage(hscan->rs_cbuf);
1776
1777                         LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
1778                         heap_get_root_tuples(page, root_offsets);
1779                         LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
1780
1781                         memset(in_index, 0, sizeof(in_index));
1782
1783                         root_blkno = hscan->rs_cblock;
1784                 }
1785
1786                 /* Convert actual tuple TID to root TID */
1787                 rootTuple = *heapcursor;
1788                 root_offnum = ItemPointerGetOffsetNumber(heapcursor);
1789
1790                 if (HeapTupleIsHeapOnly(heapTuple))
1791                 {
1792                         root_offnum = root_offsets[root_offnum - 1];
1793                         if (!OffsetNumberIsValid(root_offnum))
1794                                 ereport(ERROR,
1795                                                 (errcode(ERRCODE_DATA_CORRUPTED),
1796                                                  errmsg_internal("failed to find parent tuple for heap-only tuple at (%u,%u) in table \"%s\"",
1797                                                                                  ItemPointerGetBlockNumber(heapcursor),
1798                                                                                  ItemPointerGetOffsetNumber(heapcursor),
1799                                                                                  RelationGetRelationName(heapRelation))));
1800                         ItemPointerSetOffsetNumber(&rootTuple, root_offnum);
1801                 }
1802
1803                 /*
1804                  * "merge" by skipping through the index tuples until we find or pass
1805                  * the current root tuple.
1806                  */
1807                 while (!tuplesort_empty &&
1808                            (!indexcursor ||
1809                                 ItemPointerCompare(indexcursor, &rootTuple) < 0))
1810                 {
1811                         Datum           ts_val;
1812                         bool            ts_isnull;
1813
1814                         if (indexcursor)
1815                         {
1816                                 /*
1817                                  * Remember index items seen earlier on the current heap page
1818                                  */
1819                                 if (ItemPointerGetBlockNumber(indexcursor) == root_blkno)
1820                                         in_index[ItemPointerGetOffsetNumber(indexcursor) - 1] = true;
1821                         }
1822
1823                         tuplesort_empty = !tuplesort_getdatum(state->tuplesort, true,
1824                                                                                                   &ts_val, &ts_isnull, NULL);
1825                         Assert(tuplesort_empty || !ts_isnull);
1826                         if (!tuplesort_empty)
1827                         {
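                                /*
                                 * The sorted datums are heap TIDs encoded as int64 by
                                 * itemptr_encode() (roughly: block number in the high bits,
                                 * offset in the low 16 bits), so int64 ordering matches TID
                                 * ordering; itemptr_decode() reverses the packing.
                                 */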
1828                                 itemptr_decode(&decoded, DatumGetInt64(ts_val));
1829                                 indexcursor = &decoded;
1830
1831                                 /* If int8 is pass-by-ref, free (encoded) TID Datum memory */
1832 #ifndef USE_FLOAT8_BYVAL
1833                                 pfree(DatumGetPointer(ts_val));
1834 #endif
1835                         }
1836                         else
1837                         {
1838                                 /* Be tidy */
1839                                 indexcursor = NULL;
1840                         }
1841                 }
1842
1843                 /*
1844                  * If the tuplesort has overshot *and* we didn't see a match earlier,
1845                  * then this tuple is missing from the index, so insert it.
1846                  */
1847                 if ((tuplesort_empty ||
1848                          ItemPointerCompare(indexcursor, &rootTuple) > 0) &&
1849                         !in_index[root_offnum - 1])
1850                 {
1851                         MemoryContextReset(econtext->ecxt_per_tuple_memory);
1852
1853                         /* Set up for predicate or expression evaluation */
1854                         ExecStoreHeapTuple(heapTuple, slot, false);
1855
1856                         /*
1857                          * In a partial index, discard tuples that don't satisfy the
1858                          * predicate.
1859                          */
1860                         if (predicate != NULL)
1861                         {
1862                                 if (!ExecQual(predicate, econtext))
1863                                         continue;
1864                         }
1865
1866                         /*
1867                          * For the current heap tuple, extract all the attributes we use
1868                          * in this index, and note which are null.  This also performs
1869                          * evaluation of any expressions needed.
1870                          */
1871                         FormIndexDatum(indexInfo,
1872                                                    slot,
1873                                                    estate,
1874                                                    values,
1875                                                    isnull);
1876
1877                         /*
1878                          * You'd think we should go ahead and build the index tuple here,
1879                          * but some index AMs want to do further processing on the data
1880                          * first. So pass the values[] and isnull[] arrays, instead.
1881                          */
1882
1883                         /*
1884                          * If the tuple is already committed dead, you might think we
1885                          * could suppress uniqueness checking, but this is no longer true
1886                          * in the presence of HOT, because the insert is actually a proxy
1887                          * for a uniqueness check on the whole HOT-chain.  That is, the
1888                          * tuple we have here could be dead because it was already
1889                          * HOT-updated, and if so the updating transaction will not have
1890                          * thought it should insert index entries.  The index AM will
1891                          * check the whole HOT-chain and correctly detect a conflict if
1892                          * there is one.
1893                          */
1894
1895                         index_insert(indexRelation,
1896                                                  values,
1897                                                  isnull,
1898                                                  &rootTuple,
1899                                                  heapRelation,
1900                                                  indexInfo->ii_Unique ?
1901                                                  UNIQUE_CHECK_YES : UNIQUE_CHECK_NO,
1902                                                  indexInfo);
1903
1904                         state->tups_inserted += 1;
1905                 }
1906         }
1907
1908         table_endscan(scan);
1909
1910         ExecDropSingleTupleTableSlot(slot);
1911
1912         FreeExecutorState(estate);
1913
1914         /* These may have been pointing to the now-gone estate */
1915         indexInfo->ii_ExpressionsState = NIL;
1916         indexInfo->ii_PredicateState = NULL;
1917 }
1918
1919 /*
1920  * Return the number of blocks that have been read by this scan since
1921  * starting.  This is meant for progress reporting rather than being fully
1922  * accurate: in a parallel scan, workers can be concurrently reading blocks
1923  * further ahead than what we report.
1924  */
1925 static BlockNumber
1926 heapam_scan_get_blocks_done(HeapScanDesc hscan)
1927 {
1928         ParallelBlockTableScanDesc bpscan = NULL;
1929         BlockNumber             startblock;
1930         BlockNumber             blocks_done;
1931
1932         if (hscan->rs_base.rs_parallel != NULL)
1933         {
1934                 bpscan = (ParallelBlockTableScanDesc) hscan->rs_base.rs_parallel;
1935                 startblock = bpscan->phs_startblock;
1936         }
1937         else
1938                 startblock = hscan->rs_startblock;
1939
1940         /*
1941          * Might have wrapped around the end of the relation, if startblock was
1942          * not zero.
1943          */
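        /*
         * Illustrative example: with nblocks = 100 and startblock = 90, a scan
         * that has wrapped and is currently on block 5 has covered
         * 100 - 90 + 5 = 15 blocks.
         */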
1944         if (hscan->rs_cblock > startblock)
1945                 blocks_done = hscan->rs_cblock - startblock;
1946         else
1947         {
1948                 BlockNumber     nblocks;
1949
1950                 nblocks = bpscan != NULL ? bpscan->phs_nblocks : hscan->rs_nblocks;
1951                 blocks_done = nblocks - startblock +
1952                         hscan->rs_cblock;
1953         }
1954
1955         return blocks_done;
1956 }
1957
1958
1959
1960 /* ------------------------------------------------------------------------
1961  * Planner related callbacks for the heap AM
1962  * ------------------------------------------------------------------------
1963  */
1964
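/*
 * Estimate the relation's current number of pages, tuples, and all-visible
 * fraction for the planner, extrapolating from the pg_class counters and the
 * relation's current physical size.
 */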
1965 static void
1966 heapam_estimate_rel_size(Relation rel, int32 *attr_widths,
1967                                                  BlockNumber *pages, double *tuples,
1968                                                  double *allvisfrac)
1969 {
1970         BlockNumber curpages;
1971         BlockNumber relpages;
1972         double          reltuples;
1973         BlockNumber relallvisible;
1974         double          density;
1975
1976         /* it has storage, ok to call the smgr */
1977         curpages = RelationGetNumberOfBlocks(rel);
1978
1979         /* coerce values in pg_class to more desirable types */
1980         relpages = (BlockNumber) rel->rd_rel->relpages;
1981         reltuples = (double) rel->rd_rel->reltuples;
1982         relallvisible = (BlockNumber) rel->rd_rel->relallvisible;
1983
1984         /*
1985          * HACK: if the relation has never yet been vacuumed, use a minimum size
1986          * estimate of 10 pages.  The idea here is to avoid assuming a
1987          * newly-created table is really small, even if it currently is, because
1988          * that may not be true once some data gets loaded into it.  Once a vacuum
1989          * or analyze cycle has been done on it, it's more reasonable to believe
1990          * the size is somewhat stable.
1991          *
1992          * (Note that this is only an issue if the plan gets cached and used again
1993          * after the table has been filled.  What we're trying to avoid is using a
1994          * nestloop-type plan on a table that has grown substantially since the
1995          * plan was made.  Normally, autovacuum/autoanalyze will occur once enough
1996          * inserts have happened and cause cached-plan invalidation; but that
1997          * doesn't happen instantaneously, and it won't happen at all for cases
1998          * such as temporary tables.)
1999          *
2000          * We approximate "never vacuumed" by "has relpages = 0", which means this
2001          * will also fire on genuinely empty relations.  Not great, but
2002          * fortunately that's a seldom-seen case in the real world, and it
2003          * shouldn't degrade the quality of the plan too much anyway to err in
2004          * this direction.
2005          *
2006          * If the table has inheritance children, we don't apply this heuristic.
2007          * Totally empty parent tables are quite common, so we should be willing
2008          * to believe that they are empty.
2009          */
2010         if (curpages < 10 &&
2011                 relpages == 0 &&
2012                 !rel->rd_rel->relhassubclass)
2013                 curpages = 10;
2014
2015         /* report estimated # pages */
2016         *pages = curpages;
2017         /* quick exit if rel is clearly empty */
2018         if (curpages == 0)
2019         {
2020                 *tuples = 0;
2021                 *allvisfrac = 0;
2022                 return;
2023         }
2024
2025         /* estimate number of tuples from previous tuple density */
2026         if (relpages > 0)
2027                 density = reltuples / (double) relpages;
2028         else
2029         {
2030                 /*
2031                  * When we have no data because the relation was truncated, estimate
2032                  * tuple width from attribute datatypes.  We assume here that the
2033                  * pages are completely full, which is OK for tables (since they've
2034                  * presumably not been VACUUMed yet) but is probably an overestimate
2035                  * for indexes.  Fortunately get_relation_info() can clamp the
2036                  * overestimate to the parent table's size.
2037                  *
2038                  * Note: this code intentionally disregards alignment considerations,
2039                  * because (a) that would be gilding the lily considering how crude
2040                  * the estimate is, and (b) it creates platform dependencies in the
2041                  * default plans which are kind of a headache for regression testing.
2042                  */
2043                 int32           tuple_width;
2044
2045                 tuple_width = get_rel_data_width(rel, attr_widths);
2046                 tuple_width += MAXALIGN(SizeofHeapTupleHeader);
2047                 tuple_width += sizeof(ItemIdData);
2048                 /* note: integer division is intentional here */
2049                 density = (BLCKSZ - SizeOfPageHeaderData) / tuple_width;
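                /*
                 * Illustrative numbers only: with the usual 8 kB BLCKSZ, a 24-byte
                 * page header, and an estimated 64 bytes per tuple (data plus tuple
                 * header plus line pointer), this gives (8192 - 24) / 64 = 127
                 * tuples per page.
                 */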
2050         }
2051         *tuples = rint(density * (double) curpages);
2052
2053         /*
2054          * We use relallvisible as-is, rather than scaling it up like we do for
2055          * the pages and tuples counts, on the theory that any pages added since
2056          * the last VACUUM are most likely not marked all-visible.  But costsize.c
2057          * wants it converted to a fraction.
2058          */
2059         if (relallvisible == 0 || curpages <= 0)
2060                 *allvisfrac = 0;
2061         else if ((double) relallvisible >= curpages)
2062                 *allvisfrac = 1;
2063         else
2064                 *allvisfrac = (double) relallvisible / curpages;
2065 }
2066
2067
2068 /* ------------------------------------------------------------------------
2069  * Executor related callbacks for the heap AM
2070  * ------------------------------------------------------------------------
2071  */
2072
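/*
 * Prepare the next page of a bitmap heap scan: pin the block named by
 * tbmres, opportunistically prune it, and collect the offsets of all visible
 * tuples on it into rs_vistuples[].  Returns false if the block is past the
 * current end of the relation or holds no visible tuples.
 */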
2073 static bool
2074 heapam_scan_bitmap_next_block(TableScanDesc scan,
2075                                                           TBMIterateResult *tbmres)
2076 {
2077         HeapScanDesc hscan = (HeapScanDesc) scan;
2078         BlockNumber page = tbmres->blockno;
2079         Buffer          buffer;
2080         Snapshot        snapshot;
2081         int                     ntup;
2082
2083         hscan->rs_cindex = 0;
2084         hscan->rs_ntuples = 0;
2085
2086         /*
2087          * Ignore any claimed entries past what we think is the end of the
2088          * relation. It may have been extended after the start of our scan (we
2089          * only hold an AccessShareLock, and it could be inserts from this
2090          * backend).
2091          */
2092         if (page >= hscan->rs_nblocks)
2093                 return false;
2094
2095         /*
2096          * Acquire pin on the target heap page, trading in any pin we held before.
2097          */
2098         hscan->rs_cbuf = ReleaseAndReadBuffer(hscan->rs_cbuf,
2099                                                                                   scan->rs_rd,
2100                                                                                   page);
2101         hscan->rs_cblock = page;
2102         buffer = hscan->rs_cbuf;
2103         snapshot = scan->rs_snapshot;
2104
2105         ntup = 0;
2106
2107         /*
2108          * Prune and repair fragmentation for the whole page, if possible.
2109          */
2110         heap_page_prune_opt(scan->rs_rd, buffer);
2111
2112         /*
2113          * We must hold share lock on the buffer content while examining tuple
2114          * visibility.  Afterwards, however, the tuples we have found to be
2115          * visible are guaranteed good as long as we hold the buffer pin.
2116          */
2117         LockBuffer(buffer, BUFFER_LOCK_SHARE);
2118
2119         /*
2120          * We need two separate strategies for lossy and non-lossy cases.
2121          */
2122         if (tbmres->ntuples >= 0)
2123         {
2124                 /*
2125                  * Bitmap is non-lossy, so we just look through the offsets listed in
2126                  * tbmres; but we have to follow any HOT chain starting at each such
2127                  * offset.
2128                  */
2129                 int                     curslot;
2130
2131                 for (curslot = 0; curslot < tbmres->ntuples; curslot++)
2132                 {
2133                         OffsetNumber offnum = tbmres->offsets[curslot];
2134                         ItemPointerData tid;
2135                         HeapTupleData heapTuple;
2136
2137                         ItemPointerSet(&tid, page, offnum);
2138                         if (heap_hot_search_buffer(&tid, scan->rs_rd, buffer, snapshot,
2139                                                                            &heapTuple, NULL, true))
2140                                 hscan->rs_vistuples[ntup++] = ItemPointerGetOffsetNumber(&tid);
2141                 }
2142         }
2143         else
2144         {
2145                 /*
2146                  * Bitmap is lossy, so we must examine each item pointer on the page.
2147                  * But we can ignore HOT chains, since we'll check each tuple anyway.
2148                  */
2149                 Page            dp = (Page) BufferGetPage(buffer);
2150                 OffsetNumber maxoff = PageGetMaxOffsetNumber(dp);
2151                 OffsetNumber offnum;
2152
2153                 for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum))
2154                 {
2155                         ItemId          lp;
2156                         HeapTupleData loctup;
2157                         bool            valid;
2158
2159                         lp = PageGetItemId(dp, offnum);
2160                         if (!ItemIdIsNormal(lp))
2161                                 continue;
2162                         loctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
2163                         loctup.t_len = ItemIdGetLength(lp);
2164                         loctup.t_tableOid = scan->rs_rd->rd_id;
2165                         ItemPointerSet(&loctup.t_self, page, offnum);
2166                         valid = HeapTupleSatisfiesVisibility(&loctup, snapshot, buffer);
2167                         if (valid)
2168                         {
2169                                 hscan->rs_vistuples[ntup++] = offnum;
2170                                 PredicateLockTuple(scan->rs_rd, &loctup, snapshot);
2171                         }
2172                         CheckForSerializableConflictOut(valid, scan->rs_rd, &loctup,
2173                                                                                         buffer, snapshot);
2174                 }
2175         }
2176
2177         LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
2178
2179         Assert(ntup <= MaxHeapTuplesPerPage);
2180         hscan->rs_ntuples = ntup;
2181
2182         return ntup > 0;
2183 }
2184
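/*
 * Return, in *slot, the next tuple that heapam_scan_bitmap_next_block()
 * found to be visible on the current page; returns false once the page's
 * visible tuples are exhausted.
 */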
2185 static bool
2186 heapam_scan_bitmap_next_tuple(TableScanDesc scan,
2187                                                           TBMIterateResult *tbmres,
2188                                                           TupleTableSlot *slot)
2189 {
2190         HeapScanDesc hscan = (HeapScanDesc) scan;
2191         OffsetNumber targoffset;
2192         Page            dp;
2193         ItemId          lp;
2194
2195         /*
2196          * Out of range?  If so, nothing more to look at on this page
2197          */
2198         if (hscan->rs_cindex < 0 || hscan->rs_cindex >= hscan->rs_ntuples)
2199                 return false;
2200
2201         targoffset = hscan->rs_vistuples[hscan->rs_cindex];
2202         dp = (Page) BufferGetPage(hscan->rs_cbuf);
2203         lp = PageGetItemId(dp, targoffset);
2204         Assert(ItemIdIsNormal(lp));
2205
2206         hscan->rs_ctup.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp);
2207         hscan->rs_ctup.t_len = ItemIdGetLength(lp);
2208         hscan->rs_ctup.t_tableOid = scan->rs_rd->rd_id;
2209         ItemPointerSet(&hscan->rs_ctup.t_self, hscan->rs_cblock, targoffset);
2210
2211         pgstat_count_heap_fetch(scan->rs_rd);
2212
2213         /*
2214          * Set up the result slot to point to this tuple.  Note that the slot
2215          * acquires a pin on the buffer.
2216          */
2217         ExecStoreBufferHeapTuple(&hscan->rs_ctup,
2218                                                          slot,
2219                                                          hscan->rs_cbuf);
2220
2221         hscan->rs_cindex++;
2222
2223         return true;
2224 }
2225
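/*
 * TABLESAMPLE support: pick the next block to sample, either by asking the
 * tablesample method (NextSampleBlock) or by walking the relation
 * sequentially with wraparound, and read it with heapgetpage().
 */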
2226 static bool
2227 heapam_scan_sample_next_block(TableScanDesc scan, SampleScanState *scanstate)
2228 {
2229         HeapScanDesc hscan = (HeapScanDesc) scan;
2230         TsmRoutine *tsm = scanstate->tsmroutine;
2231         BlockNumber blockno;
2232
2233         /* return false immediately if relation is empty */
2234         if (hscan->rs_nblocks == 0)
2235                 return false;
2236
2237         if (tsm->NextSampleBlock)
2238         {
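                /*
                 * Let the tablesample method pick; it returns InvalidBlockNumber
                 * when there are no more blocks to sample.
                 */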
2239                 blockno = tsm->NextSampleBlock(scanstate, hscan->rs_nblocks);
2240                 hscan->rs_cblock = blockno;
2241         }
2242         else
2243         {
2244                 /* scanning table sequentially */
2245
2246                 if (hscan->rs_cblock == InvalidBlockNumber)
2247                 {
2248                         Assert(!hscan->rs_inited);
2249                         blockno = hscan->rs_startblock;
2250                 }
2251                 else
2252                 {
2253                         Assert(hscan->rs_inited);
2254
2255                         blockno = hscan->rs_cblock + 1;
2256
2257                         if (blockno >= hscan->rs_nblocks)
2258                         {
2259                                 /* wrap to beginning of rel, might not have started at 0 */
2260                                 blockno = 0;
2261                         }
2262
2263                         /*
2264                          * Report our new scan position for synchronization purposes.
2265                          *
2266                          * Note: we do this before checking for end of scan so that the
2267                          * final state of the position hint is back at the start of the
2268                          * rel.  That's not strictly necessary, but otherwise when you run
2269                          * the same query multiple times the starting position would shift
2270                          * a little bit backwards on every invocation, which is confusing.
2271                          * We don't guarantee any specific ordering in general, though.
2272                          */
2273                         if (scan->rs_syncscan)
2274                                 ss_report_location(scan->rs_rd, blockno);
2275
2276                         if (blockno == hscan->rs_startblock)
2277                         {
2278                                 blockno = InvalidBlockNumber;
2279                         }
2280                 }
2281         }
2282
2283         if (!BlockNumberIsValid(blockno))
2284         {
2285                 if (BufferIsValid(hscan->rs_cbuf))
2286                         ReleaseBuffer(hscan->rs_cbuf);
2287                 hscan->rs_cbuf = InvalidBuffer;
2288                 hscan->rs_cblock = InvalidBlockNumber;
2289                 hscan->rs_inited = false;
2290
2291                 return false;
2292         }
2293
2294         heapgetpage(scan, blockno);
2295         hscan->rs_inited = true;
2296
2297         return true;
2298 }
2299
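/*
 * Ask the tablesample method which offsets to examine on the current block
 * and return the first visible tuple, or false when the block is exhausted.
 */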
2300 static bool
2301 heapam_scan_sample_next_tuple(TableScanDesc scan, SampleScanState *scanstate,
2302                                                           TupleTableSlot *slot)
2303 {
2304         HeapScanDesc hscan = (HeapScanDesc) scan;
2305         TsmRoutine *tsm = scanstate->tsmroutine;
2306         BlockNumber blockno = hscan->rs_cblock;
2307         bool            pagemode = scan->rs_pageatatime;
2308
2309         Page            page;
2310         bool            all_visible;
2311         OffsetNumber maxoffset;
2312
2313         /*
2314          * When not using pagemode, we must lock the buffer during tuple
2315          * visibility checks.
2316          */
2317         if (!pagemode)
2318                 LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_SHARE);
2319
2320         page = (Page) BufferGetPage(hscan->rs_cbuf);
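        /*
         * If the page is marked all-visible we can skip per-tuple visibility
         * checks, but not when the snapshot was taken during recovery: tuples
         * all-visible on the primary might not yet be visible to a standby
         * snapshot.
         */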
2321         all_visible = PageIsAllVisible(page) &&
2322                 !scan->rs_snapshot->takenDuringRecovery;
2323         maxoffset = PageGetMaxOffsetNumber(page);
2324
2325         for (;;)
2326         {
2327                 OffsetNumber tupoffset;
2328
2329                 CHECK_FOR_INTERRUPTS();
2330
2331                 /* Ask the tablesample method which tuples to check on this page. */
2332                 tupoffset = tsm->NextSampleTuple(scanstate,
2333                                                                                  blockno,
2334                                                                                  maxoffset);
2335
2336                 if (OffsetNumberIsValid(tupoffset))
2337                 {
2338                         ItemId          itemid;
2339                         bool            visible;
2340                         HeapTuple       tuple = &(hscan->rs_ctup);
2341
2342                         /* Skip invalid tuple pointers. */
2343                         itemid = PageGetItemId(page, tupoffset);
2344                         if (!ItemIdIsNormal(itemid))
2345                                 continue;
2346
2347                         tuple->t_data = (HeapTupleHeader) PageGetItem(page, itemid);
2348                         tuple->t_len = ItemIdGetLength(itemid);
2349                         ItemPointerSet(&(tuple->t_self), blockno, tupoffset);
2350
2351
2352                         if (all_visible)
2353                                 visible = true;
2354                         else
2355                                 visible = SampleHeapTupleVisible(scan, hscan->rs_cbuf,
2356                                                                                                  tuple, tupoffset);
2357
2358                         /* in pagemode, heapgetpage did this for us */
2359                         if (!pagemode)
2360                                 CheckForSerializableConflictOut(visible, scan->rs_rd, tuple,
2361                                                                                                 hscan->rs_cbuf, scan->rs_snapshot);
2362
2363                         /* Try next tuple from same page. */
2364                         if (!visible)
2365                                 continue;
2366
2367                         /* Found visible tuple, return it. */
2368                         if (!pagemode)
2369                                 LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
2370
2371                         ExecStoreBufferHeapTuple(tuple, slot, hscan->rs_cbuf);
2372
2373                         /* Count successfully-fetched tuples as heap fetches */
2374                         pgstat_count_heap_getnext(scan->rs_rd);
2375
2376                         return true;
2377                 }
2378                 else
2379                 {
2380                         /*
2381                          * If we get here, it means we've exhausted the items on this page
2382                          * and it's time to move to the next.
2383                          */
2384                         if (!pagemode)
2385                                 LockBuffer(hscan->rs_cbuf, BUFFER_LOCK_UNLOCK);
2386
2387                         ExecClearTuple(slot);
2388                         return false;
2389                 }
2390         }
2391
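        /* The loop above can only exit via return, so this is unreachable. */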
2392         Assert(0);
2393 }
2394
2395
2396 /* ----------------------------------------------------------------------------
2397  *  Helper functions for the above.
2398  * ----------------------------------------------------------------------------
2399  */
2400
2401 /*
2402  * Reconstruct and rewrite the given tuple
2403  *
2404  * We cannot simply copy the tuple as-is, for several reasons:
2405  *
2406  * 1. We'd like to squeeze out the values of any dropped columns, both
2407  * to save space and to ensure we have no corner-case failures. (It's
2408  * possible for example that the new table hasn't got a TOAST table
2409  * and so is unable to store any large values of dropped cols.)
2410  *
2411  * 2. The tuple might not even be legal for the new table; this is
2412  * currently only known to happen as an after-effect of ALTER TABLE
2413  * SET WITHOUT OIDS.
2414  *
2415  * So, we must reconstruct the tuple from component Datums.
2416  */
2417 static void
2418 reform_and_rewrite_tuple(HeapTuple tuple,
2419                                                  Relation OldHeap, Relation NewHeap,
2420                                                  Datum *values, bool *isnull, RewriteState rwstate)
2421 {
2422         TupleDesc       oldTupDesc = RelationGetDescr(OldHeap);
2423         TupleDesc       newTupDesc = RelationGetDescr(NewHeap);
2424         HeapTuple       copiedTuple;
2425         int                     i;
2426
2427         heap_deform_tuple(tuple, oldTupDesc, values, isnull);
2428
2429         /* Be sure to null out any dropped columns */
2430         for (i = 0; i < newTupDesc->natts; i++)
2431         {
2432                 if (TupleDescAttr(newTupDesc, i)->attisdropped)
2433                         isnull[i] = true;
2434         }
2435
2436         copiedTuple = heap_form_tuple(newTupDesc, values, isnull);
2437
2438         /* The heap rewrite module does the rest */
2439         rewrite_heap_tuple(rwstate, tuple, copiedTuple);
2440
2441         heap_freetuple(copiedTuple);
2442 }
2443
2444 /*
2445  * Check visibility of the tuple.
2446  */
2447 static bool
2448 SampleHeapTupleVisible(TableScanDesc scan, Buffer buffer,
2449                                            HeapTuple tuple,
2450                                            OffsetNumber tupoffset)
2451 {
2452         HeapScanDesc hscan = (HeapScanDesc) scan;
2453
2454         if (scan->rs_pageatatime)
2455         {
2456                 /*
2457                  * In pageatatime mode, heapgetpage() already did visibility checks,
2458                  * so just look at the info it left in rs_vistuples[].
2459                  *
2460                  * We use a binary search over the known-sorted array.  Note: we could
2461                  * save some effort if we insisted that NextSampleTuple select tuples
2462                  * in increasing order, but it's not clear that there would be enough
2463                  * gain to justify the restriction.
2464                  */
2465                 int                     start = 0,
2466                                         end = hscan->rs_ntuples - 1;
2467
2468                 while (start <= end)
2469                 {
2470                         int                     mid = (start + end) / 2;
2471                         OffsetNumber curoffset = hscan->rs_vistuples[mid];
2472
2473                         if (tupoffset == curoffset)
2474                                 return true;
2475                         else if (tupoffset < curoffset)
2476                                 end = mid - 1;
2477                         else
2478                                 start = mid + 1;
2479                 }
2480
2481                 return false;
2482         }
2483         else
2484         {
2485                 /* Otherwise, we have to check the tuple individually. */
2486                 return HeapTupleSatisfiesVisibility(tuple, scan->rs_snapshot,
2487                                                                                         buffer);
2488         }
2489 }
2490
2491
2492 /* ------------------------------------------------------------------------
2493  * Definition of the heap table access method.
2494  * ------------------------------------------------------------------------
2495  */
2496
2497 static const TableAmRoutine heapam_methods = {
2498         .type = T_TableAmRoutine,
2499
2500         .slot_callbacks = heapam_slot_callbacks,
2501
2502         .scan_begin = heap_beginscan,
2503         .scan_end = heap_endscan,
2504         .scan_rescan = heap_rescan,
2505         .scan_getnextslot = heap_getnextslot,
2506
2507         .parallelscan_estimate = table_block_parallelscan_estimate,
2508         .parallelscan_initialize = table_block_parallelscan_initialize,
2509         .parallelscan_reinitialize = table_block_parallelscan_reinitialize,
2510
2511         .index_fetch_begin = heapam_index_fetch_begin,
2512         .index_fetch_reset = heapam_index_fetch_reset,
2513         .index_fetch_end = heapam_index_fetch_end,
2514         .index_fetch_tuple = heapam_index_fetch_tuple,
2515
2516         .tuple_insert = heapam_tuple_insert,
2517         .tuple_insert_speculative = heapam_tuple_insert_speculative,
2518         .tuple_complete_speculative = heapam_tuple_complete_speculative,
2519         .tuple_delete = heapam_tuple_delete,
2520         .tuple_update = heapam_tuple_update,
2521         .tuple_lock = heapam_tuple_lock,
2522         .finish_bulk_insert = heapam_finish_bulk_insert,
2523
2524         .tuple_fetch_row_version = heapam_fetch_row_version,
2525         .tuple_get_latest_tid = heap_get_latest_tid,
2526         .tuple_satisfies_snapshot = heapam_tuple_satisfies_snapshot,
2527         .compute_xid_horizon_for_tuples = heap_compute_xid_horizon_for_tuples,
2528
2529         .relation_set_new_filenode = heapam_relation_set_new_filenode,
2530         .relation_nontransactional_truncate = heapam_relation_nontransactional_truncate,
2531         .relation_copy_data = heapam_relation_copy_data,
2532         .relation_copy_for_cluster = heapam_relation_copy_for_cluster,
2533         .relation_vacuum = heap_vacuum_rel,
2534         .scan_analyze_next_block = heapam_scan_analyze_next_block,
2535         .scan_analyze_next_tuple = heapam_scan_analyze_next_tuple,
2536         .index_build_range_scan = heapam_index_build_range_scan,
2537         .index_validate_scan = heapam_index_validate_scan,
2538
2539         .relation_estimate_size = heapam_estimate_rel_size,
2540
2541         .scan_bitmap_next_block = heapam_scan_bitmap_next_block,
2542         .scan_bitmap_next_tuple = heapam_scan_bitmap_next_tuple,
2543         .scan_sample_next_block = heapam_scan_sample_next_block,
2544         .scan_sample_next_tuple = heapam_scan_sample_next_tuple
2545 };
2546
2547
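/*
 * Return the heap AM's routine table directly, for internal callers that need
 * it without going through the SQL-level handler function.
 */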
2548 const TableAmRoutine *
2549 GetHeapamTableAmRoutine(void)
2550 {
2551         return &heapam_methods;
2552 }
2553
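/*
 * SQL-callable handler function: pg_am's amhandler entry for the heap AM
 * points here, and it simply hands back heapam_methods.
 */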
2554 Datum
2555 heap_tableam_handler(PG_FUNCTION_ARGS)
2556 {
2557         PG_RETURN_POINTER(&heapam_methods);
2558 }