]> granicus.if.org Git - postgresql/blob - src/backend/storage/smgr/md.c
Send only one FORGET_RELATION_FSYNC request when dropping a relation.
[postgresql] / src / backend / storage / smgr / md.c
1 /*-------------------------------------------------------------------------
2  *
3  * md.c
4  *        This code manages relations that reside on magnetic disk.
5  *
6  * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *        src/backend/storage/smgr/md.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16
17 #include <unistd.h>
18 #include <fcntl.h>
19 #include <sys/file.h>
20
21 #include "miscadmin.h"
22 #include "access/xlog.h"
23 #include "catalog/catalog.h"
24 #include "portability/instr_time.h"
25 #include "postmaster/bgwriter.h"
26 #include "storage/fd.h"
27 #include "storage/bufmgr.h"
28 #include "storage/relfilenode.h"
29 #include "storage/smgr.h"
30 #include "utils/hsearch.h"
31 #include "utils/memutils.h"
32 #include "pg_trace.h"
33
34
35 /* interval for calling AbsorbFsyncRequests in mdsync */
36 #define FSYNCS_PER_ABSORB               10
37
38 /*
39  * Special values for the segno arg to RememberFsyncRequest.
40  *
41  * Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an
42  * fsync request from the queue if an identical, subsequent request is found.
43  * See comments there before making changes here.
44  */
45 #define FORGET_RELATION_FSYNC   (InvalidBlockNumber)
46 #define FORGET_DATABASE_FSYNC   (InvalidBlockNumber-1)
47 #define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
48
49 /*
50  * On Windows, we have to interpret EACCES as possibly meaning the same as
51  * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
52  * that's what you get.  Ugh.  This code is designed so that we don't
53  * actually believe these cases are okay without further evidence (namely,
54  * a pending fsync request getting revoked ... see mdsync).
55  */
56 #ifndef WIN32
57 #define FILE_POSSIBLY_DELETED(err)      ((err) == ENOENT)
58 #else
59 #define FILE_POSSIBLY_DELETED(err)      ((err) == ENOENT || (err) == EACCES)
60 #endif
61
62 /*
63  *      The magnetic disk storage manager keeps track of open file
64  *      descriptors in its own descriptor pool.  This is done to make it
65  *      easier to support relations that are larger than the operating
66  *      system's file size limit (often 2GBytes).  In order to do that,
67  *      we break relations up into "segment" files that are each shorter than
68  *      the OS file size limit.  The segment size is set by the RELSEG_SIZE
69  *      configuration constant in pg_config.h.
70  *
71  *      On disk, a relation must consist of consecutively numbered segment
72  *      files in the pattern
73  *              -- Zero or more full segments of exactly RELSEG_SIZE blocks each
74  *              -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
75  *              -- Optionally, any number of inactive segments of size 0 blocks.
76  *      The full and partial segments are collectively the "active" segments.
77  *      Inactive segments are those that once contained data but are currently
78  *      not needed because of an mdtruncate() operation.  The reason for leaving
79  *      them present at size zero, rather than unlinking them, is that other
80  *      backends and/or the checkpointer might be holding open file references to
81  *      such segments.  If the relation expands again after mdtruncate(), such
82  *      that a deactivated segment becomes active again, it is important that
83  *      such file references still be valid --- else data might get written
84  *      out to an unlinked old copy of a segment file that will eventually
85  *      disappear.
86  *
87  *      The file descriptor pointer (md_fd field) stored in the SMgrRelation
88  *      cache is, therefore, just the head of a list of MdfdVec objects, one
89  *      per segment.  But note the md_fd pointer can be NULL, indicating
90  *      relation not open.
91  *
92  *      Also note that mdfd_chain == NULL does not necessarily mean the relation
93  *      doesn't have another segment after this one; we may just not have
94  *      opened the next segment yet.  (We could not have "all segments are
95  *      in the chain" as an invariant anyway, since another backend could
96  *      extend the relation when we weren't looking.)  We do not make chain
97  *      entries for inactive segments, however; as soon as we find a partial
98  *      segment, we assume that any subsequent segments are inactive.
99  *
100  *      All MdfdVec objects are palloc'd in the MdCxt memory context.
101  */
102
103 typedef struct _MdfdVec
104 {
105         File            mdfd_vfd;               /* fd number in fd.c's pool */
106         BlockNumber mdfd_segno;         /* segment number, from 0 */
107         struct _MdfdVec *mdfd_chain;    /* next segment, or NULL */
108 } MdfdVec;
109
110 static MemoryContext MdCxt;             /* context for all md.c allocations */
111
112
113 /*
114  * In some contexts (currently, standalone backends and the checkpointer process)
115  * we keep track of pending fsync operations: we need to remember all relation
116  * segments that have been written since the last checkpoint, so that we can
117  * fsync them down to disk before completing the next checkpoint.  This hash
118  * table remembers the pending operations.      We use a hash table mostly as
119  * a convenient way of eliminating duplicate requests.
120  *
121  * We use a similar mechanism to remember no-longer-needed files that can
122  * be deleted after the next checkpoint, but we use a linked list instead of
123  * a hash table, because we don't expect there to be any duplicate requests.
124  *
125  * These mechanisms are only used for non-temp relations; we never fsync
126  * temp rels, nor do we need to postpone their deletion (see comments in
127  * mdunlink).
128  *
129  * (Regular backends do not track pending operations locally, but forward
130  * them to the checkpointer.)
131  */
132 typedef struct
133 {
134         RelFileNode     rnode;                  /* the targeted relation */
135         ForkNumber      forknum;                /* which fork */
136         BlockNumber segno;                      /* which segment */
137 } PendingOperationTag;
138
139 typedef uint16 CycleCtr;                /* can be any convenient integer size */
140
141 typedef struct
142 {
143         PendingOperationTag tag;        /* hash table key (must be first!) */
144         bool            canceled;               /* T => request canceled, not yet removed */
145         CycleCtr        cycle_ctr;              /* mdsync_cycle_ctr when request was made */
146 } PendingOperationEntry;
147
148 typedef struct
149 {
150         RelFileNode     rnode;                  /* the dead relation to delete */
151         CycleCtr        cycle_ctr;              /* mdckpt_cycle_ctr when request was made */
152 } PendingUnlinkEntry;
153
154 static HTAB *pendingOpsTable = NULL;
155 static List *pendingUnlinks = NIL;
156
157 static CycleCtr mdsync_cycle_ctr = 0;
158 static CycleCtr mdckpt_cycle_ctr = 0;
159
160
161 typedef enum                                    /* behavior for mdopen & _mdfd_getseg */
162 {
163         EXTENSION_FAIL,                         /* ereport if segment not present */
164         EXTENSION_RETURN_NULL,          /* return NULL if not present */
165         EXTENSION_CREATE                        /* create new segments as needed */
166 } ExtensionBehavior;
167
168 /* local routines */
169 static void mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum,
170                                                  bool isRedo);
171 static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum,
172            ExtensionBehavior behavior);
173 static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
174                                            MdfdVec *seg);
175 static void register_unlink(RelFileNodeBackend rnode);
176 static MdfdVec *_fdvec_alloc(void);
177 static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
178                           BlockNumber segno);
179 static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno,
180                           BlockNumber segno, int oflags);
181 static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno,
182                          BlockNumber blkno, bool skipFsync, ExtensionBehavior behavior);
183 static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
184                    MdfdVec *seg);
185
186
187 /*
188  *      mdinit() -- Initialize private state for magnetic disk storage manager.
189  */
190 void
191 mdinit(void)
192 {
193         MdCxt = AllocSetContextCreate(TopMemoryContext,
194                                                                   "MdSmgr",
195                                                                   ALLOCSET_DEFAULT_MINSIZE,
196                                                                   ALLOCSET_DEFAULT_INITSIZE,
197                                                                   ALLOCSET_DEFAULT_MAXSIZE);
198
199         /*
200          * Create pending-operations hashtable if we need it.  Currently, we need
201          * it if we are standalone (not under a postmaster) or if we are a startup
202          * or checkpointer auxiliary process.
203          */
204         if (!IsUnderPostmaster || AmStartupProcess() || AmCheckpointerProcess())
205         {
206                 HASHCTL         hash_ctl;
207
208                 MemSet(&hash_ctl, 0, sizeof(hash_ctl));
209                 hash_ctl.keysize = sizeof(PendingOperationTag);
210                 hash_ctl.entrysize = sizeof(PendingOperationEntry);
211                 hash_ctl.hash = tag_hash;
212                 hash_ctl.hcxt = MdCxt;
213                 pendingOpsTable = hash_create("Pending Ops Table",
214                                                                           100L,
215                                                                           &hash_ctl,
216                                                                    HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
217                 pendingUnlinks = NIL;
218         }
219 }
220
221 /*
222  * In archive recovery, we rely on checkpointer to do fsyncs, but we will have
223  * already created the pendingOpsTable during initialization of the startup
224  * process.  Calling this function drops the local pendingOpsTable so that
225  * subsequent requests will be forwarded to checkpointer.
226  */
227 void
228 SetForwardFsyncRequests(void)
229 {
230         /* Perform any pending ops we may have queued up */
231         if (pendingOpsTable)
232                 mdsync();
233         pendingOpsTable = NULL;
234 }
235
236 /*
237  *      mdexists() -- Does the physical file exist?
238  *
239  * Note: this will return true for lingering files, with pending deletions
240  */
241 bool
242 mdexists(SMgrRelation reln, ForkNumber forkNum)
243 {
244         /*
245          * Close it first, to ensure that we notice if the fork has been unlinked
246          * since we opened it.
247          */
248         mdclose(reln, forkNum);
249
250         return (mdopen(reln, forkNum, EXTENSION_RETURN_NULL) != NULL);
251 }
252
/*
 *	mdcreate() -- Create a new relation on magnetic disk.
 *
 * If isRedo is true, it's okay for the relation to exist already.
 *
 * On success, the fork's first segment is left open, with its MdfdVec
 * installed in reln->md_fd[forkNum].
 */
void
mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
{
	char	   *path;
	File		fd;

	/* During redo the fork may already have been created and opened */
	if (isRedo && reln->md_fd[forkNum] != NULL)
		return;					/* created and opened already... */

	Assert(reln->md_fd[forkNum] == NULL);

	path = relpath(reln->smgr_rnode, forkNum);

	/* O_EXCL: normally we insist on creating the file ourselves */
	fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);

	if (fd < 0)
	{
		/* Save errno now: the retry open below could overwrite it */
		int			save_errno = errno;

		/*
		 * During bootstrap, there are cases where a system relation will be
		 * accessed (by internal backend processes) before the bootstrap
		 * script nominally creates it.  Therefore, allow the file to exist
		 * already, even if isRedo is not set.  (See also mdopen)
		 */
		if (isRedo || IsBootstrapProcessingMode())
			fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
		if (fd < 0)
		{
			/* be sure to report the error reported by create, not open */
			errno = save_errno;
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not create file \"%s\": %m", path)));
		}
	}

	pfree(path);

	if (reln->smgr_transient)
		FileSetTransient(fd);

	/* Install the descriptor as segment 0 of this fork */
	reln->md_fd[forkNum] = _fdvec_alloc();

	reln->md_fd[forkNum]->mdfd_vfd = fd;
	reln->md_fd[forkNum]->mdfd_segno = 0;
	reln->md_fd[forkNum]->mdfd_chain = NULL;
}
306
307 /*
308  *      mdunlink() -- Unlink a relation.
309  *
310  * Note that we're passed a RelFileNodeBackend --- by the time this is called,
311  * there won't be an SMgrRelation hashtable entry anymore.
312  *
313  * forkNum can be a fork number to delete a specific fork, or InvalidForkNumber
314  * to delete all forks.
315  *
316  * For regular relations, we don't unlink the first segment file of the rel,
317  * but just truncate it to zero length, and record a request to unlink it after
318  * the next checkpoint.  Additional segments can be unlinked immediately,
319  * however.  Leaving the empty file in place prevents that relfilenode
320  * number from being reused.  The scenario this protects us from is:
321  * 1. We delete a relation (and commit, and actually remove its file).
322  * 2. We create a new relation, which by chance gets the same relfilenode as
323  *        the just-deleted one (OIDs must've wrapped around for that to happen).
324  * 3. We crash before another checkpoint occurs.
325  * During replay, we would delete the file and then recreate it, which is fine
326  * if the contents of the file were repopulated by subsequent WAL entries.
327  * But if we didn't WAL-log insertions, but instead relied on fsyncing the
328  * file after populating it (as for instance CLUSTER and CREATE INDEX do),
329  * the contents of the file would be lost forever.      By leaving the empty file
330  * until after the next checkpoint, we prevent reassignment of the relfilenode
331  * number until it's safe, because relfilenode assignment skips over any
332  * existing file.
333  *
334  * We do not need to go through this dance for temp relations, though, because
335  * we never make WAL entries for temp rels, and so a temp rel poses no threat
336  * to the health of a regular rel that has taken over its relfilenode number.
337  * The fact that temp rels and regular rels have different file naming
338  * patterns provides additional safety.
339  *
340  * All the above applies only to the relation's main fork; other forks can
341  * just be removed immediately, since they are not needed to prevent the
342  * relfilenode number from being recycled.      Also, we do not carefully
343  * track whether other forks have been created or not, but just attempt to
344  * unlink them unconditionally; so we should never complain about ENOENT.
345  *
346  * If isRedo is true, it's unsurprising for the relation to be already gone.
347  * Also, we should remove the file immediately instead of queuing a request
348  * for later, since during redo there's no possibility of creating a
349  * conflicting relation.
350  *
351  * Note: any failure should be reported as WARNING not ERROR, because
352  * we are usually not in a transaction anymore when this is called.
353  */
354 void
355 mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
356 {
357         /*
358          * We have to clean out any pending fsync requests for the doomed
359          * relation, else the next mdsync() will fail.  There can't be any such
360          * requests for a temp relation, though.  We can send just one request
361          * even when deleting multiple forks, since the fsync queuing code accepts
362          * the "InvalidForkNumber = all forks" convention.
363          */
364         if (!RelFileNodeBackendIsTemp(rnode))
365                 ForgetRelationFsyncRequests(rnode.node, forkNum);
366
367         /* Now do the per-fork work */
368         if (forkNum == InvalidForkNumber)
369         {
370                 for (forkNum = 0; forkNum <= MAX_FORKNUM; forkNum++)
371                         mdunlinkfork(rnode, forkNum, isRedo);
372         }
373         else
374                 mdunlinkfork(rnode, forkNum, isRedo);
375 }
376
/*
 * mdunlinkfork() -- Work horse for mdunlink: remove one fork of a relation.
 *
 * Per the rules described in mdunlink's header comment, the main fork's
 * first segment is normally truncated to zero length (and queued for unlink
 * after the next checkpoint) rather than removed immediately; redo and temp
 * relations, and non-main forks, are unlinked outright.  Failures are
 * reported as WARNING, not ERROR, since we may not be in a transaction.
 */
static void
mdunlinkfork(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
{
	char	   *path;
	int			ret;

	path = relpath(rnode, forkNum);

	/*
	 * Delete or truncate the first segment.
	 */
	if (isRedo || forkNum != MAIN_FORKNUM || RelFileNodeBackendIsTemp(rnode))
	{
		/* These cases are safe to unlink immediately */
		ret = unlink(path);
		if (ret < 0 && errno != ENOENT)
			ereport(WARNING,
					(errcode_for_file_access(),
					 errmsg("could not remove file \"%s\": %m", path)));
	}
	else
	{
		/* truncate(2) would be easier here, but Windows hasn't got it */
		int			fd;

		fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);
		if (fd >= 0)
		{
			int			save_errno;

			ret = ftruncate(fd, 0);
			save_errno = errno; /* close() must not clobber ftruncate's errno */
			close(fd);
			errno = save_errno;
		}
		else
			ret = -1;			/* report the open failure below */
		if (ret < 0 && errno != ENOENT)
			ereport(WARNING,
					(errcode_for_file_access(),
					 errmsg("could not truncate file \"%s\": %m", path)));

		/* Register request to unlink first segment later */
		register_unlink(rnode);
	}

	/*
	 * Delete any additional segments.
	 */
	if (ret >= 0)
	{
		/* +12 leaves room for ".", up to 10 segno digits, and the NUL */
		char	   *segpath = (char *) palloc(strlen(path) + 12);
		BlockNumber segno;

		/*
		 * Note that because we loop until getting ENOENT, we will correctly
		 * remove all inactive segments as well as active ones.
		 */
		for (segno = 1;; segno++)
		{
			sprintf(segpath, "%s.%u", path, segno);
			if (unlink(segpath) < 0)
			{
				/* ENOENT is expected after the last segment... */
				if (errno != ENOENT)
					ereport(WARNING,
							(errcode_for_file_access(),
						   errmsg("could not remove file \"%s\": %m", segpath)));
				break;
			}
		}
		pfree(segpath);
	}

	pfree(path);
}
452
/*
 *	mdextend() -- Add a block to the specified relation.
 *
 *		The semantics are nearly the same as mdwrite(): write at the
 *		specified position.  However, this is to be used for the case of
 *		extending a relation (i.e., blocknum is at or beyond the current
 *		EOF).  Note that we assume writing a block beyond current EOF
 *		causes intervening file space to become filled with zeroes.
 *
 *		If skipFsync is true (or the relation is temp), the dirty segment
 *		is not registered for fsync at the next checkpoint.
 */
void
mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
		 char *buffer, bool skipFsync)
{
	off_t		seekpos;
	int			nbytes;
	MdfdVec    *v;

	/* This assert is too expensive to have on normally ... */
#ifdef CHECK_WRITE_VS_EXTEND
	Assert(blocknum >= mdnblocks(reln, forknum));
#endif

	/*
	 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
	 * more --- we mustn't create a block whose number actually is
	 * InvalidBlockNumber.
	 */
	if (blocknum == InvalidBlockNumber)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("cannot extend file \"%s\" beyond %u blocks",
						relpath(reln->smgr_rnode, forknum),
						InvalidBlockNumber)));

	/* EXTENSION_CREATE: make new segment files as needed */
	v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);

	/* Byte offset of the block within its segment file */
	seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));

	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

	/*
	 * Note: because caller usually obtained blocknum by calling mdnblocks,
	 * which did a seek(SEEK_END), this seek is often redundant and will be
	 * optimized away by fd.c.	It's not redundant, however, if there is a
	 * partial page at the end of the file. In that case we want to try to
	 * overwrite the partial page with a full page.  It's also not redundant
	 * if bufmgr.c had to dump another buffer of the same file to make room
	 * for the new page's buffer.
	 */
	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek to block %u in file \"%s\": %m",
						blocknum, FilePathName(v->mdfd_vfd))));

	if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
	{
		/* nbytes < 0 means an outright I/O error */
		if (nbytes < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not extend file \"%s\": %m",
							FilePathName(v->mdfd_vfd)),
					 errhint("Check free disk space.")));
		/* short write: complain appropriately */
		ereport(ERROR,
				(errcode(ERRCODE_DISK_FULL),
				 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
						FilePathName(v->mdfd_vfd),
						nbytes, BLCKSZ, blocknum),
				 errhint("Check free disk space.")));
	}

	/* Remember to fsync this segment unless caller opted out or rel is temp */
	if (!skipFsync && !SmgrIsTemp(reln))
		register_dirty_segment(reln, forknum, v);

	Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
}
530
/*
 *	mdopen() -- Open the specified relation.
 *
 * Note we only open the first segment, when there are multiple segments.
 *
 * If first segment is not present, either ereport or return NULL according
 * to "behavior".  We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
 * EXTENSION_CREATE means it's OK to extend an existing relation, not to
 * invent one out of whole cloth.
 *
 * On success, the MdfdVec for segment 0 is cached in reln->md_fd[forknum]
 * and returned.
 */
static MdfdVec *
mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior)
{
	MdfdVec    *mdfd;
	char	   *path;
	File		fd;

	/* No work if already open */
	if (reln->md_fd[forknum])
		return reln->md_fd[forknum];

	path = relpath(reln->smgr_rnode, forknum);

	fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);

	if (fd < 0)
	{
		/*
		 * During bootstrap, there are cases where a system relation will be
		 * accessed (by internal backend processes) before the bootstrap
		 * script nominally creates it.  Therefore, accept mdopen() as a
		 * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
		 */
		if (IsBootstrapProcessingMode())
			fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
		if (fd < 0)
		{
			/* FILE_POSSIBLY_DELETED: on Windows, EACCES may mean ENOENT */
			if (behavior == EXTENSION_RETURN_NULL &&
				FILE_POSSIBLY_DELETED(errno))
			{
				pfree(path);
				return NULL;
			}
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", path)));
		}
	}

	pfree(path);

	if (reln->smgr_transient)
		FileSetTransient(fd);

	/* Cache the open segment-0 descriptor for future calls */
	reln->md_fd[forknum] = mdfd = _fdvec_alloc();

	mdfd->mdfd_vfd = fd;
	mdfd->mdfd_segno = 0;
	mdfd->mdfd_chain = NULL;
	Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));

	return mdfd;
}
594
595 /*
596  *      mdclose() -- Close the specified relation, if it isn't closed already.
597  */
598 void
599 mdclose(SMgrRelation reln, ForkNumber forknum)
600 {
601         MdfdVec    *v = reln->md_fd[forknum];
602
603         /* No work if already closed */
604         if (v == NULL)
605                 return;
606
607         reln->md_fd[forknum] = NULL;    /* prevent dangling pointer after error */
608
609         while (v != NULL)
610         {
611                 MdfdVec    *ov = v;
612
613                 /* if not closed already */
614                 if (v->mdfd_vfd >= 0)
615                         FileClose(v->mdfd_vfd);
616                 /* Now free vector */
617                 v = v->mdfd_chain;
618                 pfree(ov);
619         }
620 }
621
622 /*
623  *      mdprefetch() -- Initiate asynchronous read of the specified block of a relation
624  */
625 void
626 mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
627 {
628 #ifdef USE_PREFETCH
629         off_t           seekpos;
630         MdfdVec    *v;
631
632         v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);
633
634         seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
635
636         Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
637
638         (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ);
639 #endif   /* USE_PREFETCH */
640 }
641
642
/*
 *	mdread() -- Read the specified block from a relation.
 *
 * The block is read into "buffer", which must be at least BLCKSZ bytes.
 * A short read at EOF is tolerated (returning zeroes) only when
 * zero_damaged_pages is on or we are in recovery; otherwise it's an error.
 */
void
mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
	   char *buffer)
{
	off_t		seekpos;
	int			nbytes;
	MdfdVec    *v;

	/* DTrace/systemtap probe: read starting */
	TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
										reln->smgr_rnode.node.spcNode,
										reln->smgr_rnode.node.dbNode,
										reln->smgr_rnode.node.relNode,
										reln->smgr_rnode.backend);

	v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);

	/* Byte offset of the block within its segment file */
	seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));

	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek to block %u in file \"%s\": %m",
						blocknum, FilePathName(v->mdfd_vfd))));

	nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ);

	/* DTrace/systemtap probe: read done (fires even on short read) */
	TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
									   reln->smgr_rnode.node.spcNode,
									   reln->smgr_rnode.node.dbNode,
									   reln->smgr_rnode.node.relNode,
									   reln->smgr_rnode.backend,
									   nbytes,
									   BLCKSZ);

	if (nbytes != BLCKSZ)
	{
		/* negative result is a hard I/O error */
		if (nbytes < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not read block %u in file \"%s\": %m",
							blocknum, FilePathName(v->mdfd_vfd))));

		/*
		 * Short read: we are at or past EOF, or we read a partial block at
		 * EOF.  Normally this is an error; upper levels should never try to
		 * read a nonexistent block.  However, if zero_damaged_pages is ON or
		 * we are InRecovery, we should instead return zeroes without
		 * complaining.  This allows, for example, the case of trying to
		 * update a block that was later truncated away.
		 */
		if (zero_damaged_pages || InRecovery)
			MemSet(buffer, 0, BLCKSZ);
		else
			ereport(ERROR,
					(errcode(ERRCODE_DATA_CORRUPTED),
					 errmsg("could not read block %u in file \"%s\": read only %d of %d bytes",
							blocknum, FilePathName(v->mdfd_vfd),
							nbytes, BLCKSZ)));
	}
}
708
709 /*
710  *      mdwrite() -- Write the supplied block at the appropriate location.
711  *
712  *              This is to be used only for updating already-existing blocks of a
713  *              relation (ie, those before the current EOF).  To extend a relation,
714  *              use mdextend().
715  */
716 void
717 mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
718                 char *buffer, bool skipFsync)
719 {
720         off_t           seekpos;
721         int                     nbytes;
722         MdfdVec    *v;
723
724         /* This assert is too expensive to have on normally ... */
725 #ifdef CHECK_WRITE_VS_EXTEND
726         Assert(blocknum < mdnblocks(reln, forknum));
727 #endif
728
729         TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
730                                                                                  reln->smgr_rnode.node.spcNode,
731                                                                                  reln->smgr_rnode.node.dbNode,
732                                                                                  reln->smgr_rnode.node.relNode,
733                                                                                  reln->smgr_rnode.backend);
734
735         v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_FAIL);
736
737         seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
738
739         Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
740
741         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
742                 ereport(ERROR,
743                                 (errcode_for_file_access(),
744                                  errmsg("could not seek to block %u in file \"%s\": %m",
745                                                 blocknum, FilePathName(v->mdfd_vfd))));
746
747         nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ);
748
749         TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
750                                                                                 reln->smgr_rnode.node.spcNode,
751                                                                                 reln->smgr_rnode.node.dbNode,
752                                                                                 reln->smgr_rnode.node.relNode,
753                                                                                 reln->smgr_rnode.backend,
754                                                                                 nbytes,
755                                                                                 BLCKSZ);
756
757         if (nbytes != BLCKSZ)
758         {
759                 if (nbytes < 0)
760                         ereport(ERROR,
761                                         (errcode_for_file_access(),
762                                          errmsg("could not write block %u in file \"%s\": %m",
763                                                         blocknum, FilePathName(v->mdfd_vfd))));
764                 /* short write: complain appropriately */
765                 ereport(ERROR,
766                                 (errcode(ERRCODE_DISK_FULL),
767                                  errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes",
768                                                 blocknum,
769                                                 FilePathName(v->mdfd_vfd),
770                                                 nbytes, BLCKSZ),
771                                  errhint("Check free disk space.")));
772         }
773
774         if (!skipFsync && !SmgrIsTemp(reln))
775                 register_dirty_segment(reln, forknum, v);
776 }
777
778 /*
779  *      mdnblocks() -- Get the number of blocks stored in a relation.
780  *
781  *              Important side effect: all active segments of the relation are opened
782  *              and added to the mdfd_chain list.  If this routine has not been
783  *              called, then only segments up to the last one actually touched
784  *              are present in the chain.
785  */
786 BlockNumber
787 mdnblocks(SMgrRelation reln, ForkNumber forknum)
788 {
789         MdfdVec    *v = mdopen(reln, forknum, EXTENSION_FAIL);
790         BlockNumber nblocks;
791         BlockNumber segno = 0;
792
793         /*
794          * Skip through any segments that aren't the last one, to avoid redundant
795          * seeks on them.  We have previously verified that these segments are
796          * exactly RELSEG_SIZE long, and it's useless to recheck that each time.
797          *
798          * NOTE: this assumption could only be wrong if another backend has
799          * truncated the relation.      We rely on higher code levels to handle that
800          * scenario by closing and re-opening the md fd, which is handled via
801          * relcache flush.      (Since the checkpointer doesn't participate in
802          * relcache flush, it could have segment chain entries for inactive
803          * segments; that's OK because the checkpointer never needs to compute
804          * relation size.)
805          */
806         while (v->mdfd_chain != NULL)
807         {
808                 segno++;
809                 v = v->mdfd_chain;
810         }
811
812         for (;;)
813         {
814                 nblocks = _mdnblocks(reln, forknum, v);
815                 if (nblocks > ((BlockNumber) RELSEG_SIZE))
816                         elog(FATAL, "segment too big");
817                 if (nblocks < ((BlockNumber) RELSEG_SIZE))
818                         return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
819
820                 /*
821                  * If segment is exactly RELSEG_SIZE, advance to next one.
822                  */
823                 segno++;
824
825                 if (v->mdfd_chain == NULL)
826                 {
827                         /*
828                          * Because we pass O_CREAT, we will create the next segment (with
829                          * zero length) immediately, if the last segment is of length
830                          * RELSEG_SIZE.  While perhaps not strictly necessary, this keeps
831                          * the logic simple.
832                          */
833                         v->mdfd_chain = _mdfd_openseg(reln, forknum, segno, O_CREAT);
834                         if (v->mdfd_chain == NULL)
835                                 ereport(ERROR,
836                                                 (errcode_for_file_access(),
837                                                  errmsg("could not open file \"%s\": %m",
838                                                                 _mdfd_segpath(reln, forknum, segno))));
839                 }
840
841                 v = v->mdfd_chain;
842         }
843 }
844
/*
 *	mdtruncate() -- Truncate relation to specified number of blocks.
 *
 * Walks the segment chain, zero-truncating segments wholly past the new
 * EOF (but keeping their files on disk) and shortening the segment that
 * contains the new EOF.  A request to grow the relation is an error,
 * except during recovery, where it is silently ignored.
 */
void
mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
{
	MdfdVec    *v;
	BlockNumber curnblk;
	BlockNumber priorblocks;	/* blocks in segments before the current one */

	/*
	 * NOTE: mdnblocks makes sure we have opened all active segments, so that
	 * truncation loop will get them all!
	 */
	curnblk = mdnblocks(reln, forknum);
	if (nblocks > curnblk)
	{
		/* Bogus request ... but no complaint if InRecovery */
		if (InRecovery)
			return;
		ereport(ERROR,
				(errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
						relpath(reln->smgr_rnode, forknum),
						nblocks, curnblk)));
	}
	if (nblocks == curnblk)
		return;					/* no work */

	v = mdopen(reln, forknum, EXTENSION_FAIL);

	priorblocks = 0;
	while (v != NULL)
	{
		MdfdVec    *ov = v;

		if (priorblocks > nblocks)
		{
			/*
			 * This segment is no longer active (and has already been unlinked
			 * from the mdfd_chain). We truncate the file, but do not delete
			 * it, for reasons explained in the header comments.
			 */
			if (FileTruncate(v->mdfd_vfd, 0) < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not truncate file \"%s\": %m",
								FilePathName(v->mdfd_vfd))));

			/* ensure the truncation reaches disk at the next checkpoint */
			if (!SmgrIsTemp(reln))
				register_dirty_segment(reln, forknum, v);
			/* advance before freeing the MdfdVec we are standing on */
			v = v->mdfd_chain;
			Assert(ov != reln->md_fd[forknum]); /* we never drop the 1st
												 * segment */
			pfree(ov);
		}
		else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
		{
			/*
			 * This is the last segment we want to keep. Truncate the file to
			 * the right length, and clear chain link that points to any
			 * remaining segments (which we shall zap). NOTE: if nblocks is
			 * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
			 * segment to 0 length but keep it. This adheres to the invariant
			 * given in the header comments.
			 */
			BlockNumber lastsegblocks = nblocks - priorblocks;

			if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ) < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
					errmsg("could not truncate file \"%s\" to %u blocks: %m",
						   FilePathName(v->mdfd_vfd),
						   nblocks)));
			if (!SmgrIsTemp(reln))
				register_dirty_segment(reln, forknum, v);
			/* detach the rest of the chain; the loop above will zap them */
			v = v->mdfd_chain;
			ov->mdfd_chain = NULL;
		}
		else
		{
			/*
			 * We still need this segment and 0 or more blocks beyond it, so
			 * nothing to do here.
			 */
			v = v->mdfd_chain;
		}
		priorblocks += RELSEG_SIZE;
	}
}
934
935 /*
936  *      mdimmedsync() -- Immediately sync a relation to stable storage.
937  *
938  * Note that only writes already issued are synced; this routine knows
939  * nothing of dirty buffers that may exist inside the buffer manager.
940  */
941 void
942 mdimmedsync(SMgrRelation reln, ForkNumber forknum)
943 {
944         MdfdVec    *v;
945
946         /*
947          * NOTE: mdnblocks makes sure we have opened all active segments, so that
948          * fsync loop will get them all!
949          */
950         mdnblocks(reln, forknum);
951
952         v = mdopen(reln, forknum, EXTENSION_FAIL);
953
954         while (v != NULL)
955         {
956                 if (FileSync(v->mdfd_vfd) < 0)
957                         ereport(ERROR,
958                                         (errcode_for_file_access(),
959                                          errmsg("could not fsync file \"%s\": %m",
960                                                         FilePathName(v->mdfd_vfd))));
961                 v = v->mdfd_chain;
962         }
963 }
964
/*
 *	mdsync() -- Sync previous writes to stable storage.
 *
 * Processes every entry accumulated in pendingOpsTable, fsync'ing the
 * corresponding segment files.  Entries added after this cycle starts are
 * deferred to the next call (distinguished via mdsync_cycle_ctr).  On any
 * ERROR exit, mdsync_in_progress stays true so the next call can repair
 * stale cycle counters before retrying.
 */
void
mdsync(void)
{
	static bool mdsync_in_progress = false;

	HASH_SEQ_STATUS hstat;
	PendingOperationEntry *entry;
	int			absorb_counter;

	/* Statistics on sync times */
	int			processed = 0;
	instr_time	sync_start,
				sync_end,
				sync_diff;
	uint64		elapsed;
	uint64		longest = 0;
	uint64		total_elapsed = 0;

	/*
	 * This is only called during checkpoints, and checkpoints should only
	 * occur in processes that have created a pendingOpsTable.
	 */
	if (!pendingOpsTable)
		elog(ERROR, "cannot sync without a pendingOpsTable");

	/*
	 * If we are in the checkpointer, the sync had better include all fsync
	 * requests that were queued by backends up to this point.  The tightest
	 * race condition that could occur is that a buffer that must be written
	 * and fsync'd for the checkpoint could have been dumped by a backend just
	 * before it was visited by BufferSync().  We know the backend will have
	 * queued an fsync request before clearing the buffer's dirtybit, so we
	 * are safe as long as we do an Absorb after completing BufferSync().
	 */
	AbsorbFsyncRequests();

	/*
	 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
	 * checkpoint), we want to ignore fsync requests that are entered into the
	 * hashtable after this point --- they should be processed next time,
	 * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
	 * ones: new ones will have cycle_ctr equal to the incremented value of
	 * mdsync_cycle_ctr.
	 *
	 * In normal circumstances, all entries present in the table at this point
	 * will have cycle_ctr exactly equal to the current (about to be old)
	 * value of mdsync_cycle_ctr.  However, if we fail partway through the
	 * fsync'ing loop, then older values of cycle_ctr might remain when we
	 * come back here to try again.  Repeated checkpoint failures would
	 * eventually wrap the counter around to the point where an old entry
	 * might appear new, causing us to skip it, possibly allowing a checkpoint
	 * to succeed that should not have.  To forestall wraparound, any time the
	 * previous mdsync() failed to complete, run through the table and
	 * forcibly set cycle_ctr = mdsync_cycle_ctr.
	 *
	 * Think not to merge this loop with the main loop, as the problem is
	 * exactly that that loop may fail before having visited all the entries.
	 * From a performance point of view it doesn't matter anyway, as this path
	 * will never be taken in a system that's functioning normally.
	 */
	if (mdsync_in_progress)
	{
		/* prior try failed, so update any stale cycle_ctr values */
		hash_seq_init(&hstat, pendingOpsTable);
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
		{
			entry->cycle_ctr = mdsync_cycle_ctr;
		}
	}

	/* Advance counter so that new hashtable entries are distinguishable */
	mdsync_cycle_ctr++;

	/* Set flag to detect failure if we don't reach the end of the loop */
	mdsync_in_progress = true;

	/* Now scan the hashtable for fsync requests to process */
	absorb_counter = FSYNCS_PER_ABSORB;
	hash_seq_init(&hstat, pendingOpsTable);
	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
	{
		/*
		 * If the entry is new then don't process it this time.  Note that
		 * "continue" bypasses the hash-remove call at the bottom of the loop.
		 */
		if (entry->cycle_ctr == mdsync_cycle_ctr)
			continue;

		/* Else assert we haven't missed it */
		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);

		/*
		 * If fsync is off then we don't have to bother opening the file at
		 * all.  (We delay checking until this point so that changing fsync on
		 * the fly behaves sensibly.)  Also, if the entry is marked canceled,
		 * fall through to delete it.
		 */
		if (enableFsync && !entry->canceled)
		{
			int			failures;

			/*
			 * If in checkpointer, we want to absorb pending requests every so
			 * often to prevent overflow of the fsync request queue.  It is
			 * unspecified whether newly-added entries will be visited by
			 * hash_seq_search, but we don't care since we don't need to
			 * process them anyway.
			 */
			if (--absorb_counter <= 0)
			{
				AbsorbFsyncRequests();
				absorb_counter = FSYNCS_PER_ABSORB;
			}

			/*
			 * The fsync table could contain requests to fsync segments that
			 * have been deleted (unlinked) by the time we get to them. Rather
			 * than just hoping an ENOENT (or EACCES on Windows) error can be
			 * ignored, what we do on error is absorb pending requests and
			 * then retry.  Since mdunlink() queues a "revoke" message before
			 * actually unlinking, the fsync request is guaranteed to be
			 * marked canceled after the absorb if it really was this case.
			 * DROP DATABASE likewise has to tell us to forget fsync requests
			 * before it starts deletions.
			 */
			for (failures = 0;; failures++)		/* loop exits at "break" */
			{
				SMgrRelation reln;
				MdfdVec    *seg;
				char	   *path;

				/*
				 * Find or create an smgr hash entry for this relation. This
				 * may seem a bit unclean -- md calling smgr?  But it's really
				 * the best solution.  It ensures that the open file reference
				 * isn't permanently leaked if we get an error here. (You may
				 * say "but an unreferenced SMgrRelation is still a leak!" Not
				 * really, because the only case in which a checkpoint is done
				 * by a process that isn't about to shut down is in the
				 * checkpointer, and it will periodically do smgrcloseall().
				 * This fact justifies our not closing the reln in the success
				 * path either, which is a good thing since in
				 * non-checkpointer cases we couldn't safely do that.)
				 */
				reln = smgropen(entry->tag.rnode, InvalidBackendId);

				/*
				 * It is possible that the relation has been dropped or
				 * truncated since the fsync request was entered.  Therefore,
				 * allow ENOENT, but only if we didn't fail already on this
				 * file.  This applies both during _mdfd_getseg() and during
				 * FileSync, since fd.c might have closed the file behind our
				 * back.
				 */
				seg = _mdfd_getseg(reln, entry->tag.forknum,
						  entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
								   false, EXTENSION_RETURN_NULL);

				INSTR_TIME_SET_CURRENT(sync_start);

				if (seg != NULL &&
					FileSync(seg->mdfd_vfd) >= 0)
				{
					/* success: accumulate per-checkpoint sync statistics */
					INSTR_TIME_SET_CURRENT(sync_end);
					sync_diff = sync_end;
					INSTR_TIME_SUBTRACT(sync_diff, sync_start);
					elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
					if (elapsed > longest)
						longest = elapsed;
					total_elapsed += elapsed;
					processed++;
					if (log_checkpoints)
						elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
							 processed, FilePathName(seg->mdfd_vfd), (double) elapsed / 1000);

					break;		/* success; break out of retry loop */
				}

				/*
				 * XXX is there any point in allowing more than one retry?
				 * Don't see one at the moment, but easy to change the test
				 * here if so.
				 */
				path = _mdfd_segpath(reln, entry->tag.forknum,
									 entry->tag.segno);
				if (!FILE_POSSIBLY_DELETED(errno) ||
					failures > 0)
					ereport(ERROR,
							(errcode_for_file_access(),
						   errmsg("could not fsync file \"%s\": %m", path)));
				else
					ereport(DEBUG1,
							(errcode_for_file_access(),
					   errmsg("could not fsync file \"%s\" but retrying: %m",
							  path)));
				pfree(path);

				/*
				 * Absorb incoming requests and check to see if canceled.
				 */
				AbsorbFsyncRequests();
				absorb_counter = FSYNCS_PER_ABSORB;		/* might as well... */

				if (entry->canceled)
					break;
			}					/* end retry loop */
		}

		/*
		 * If we get here, either we fsync'd successfully, or we don't have to
		 * because enableFsync is off, or the entry is (now) marked canceled.
		 * Okay to delete it.
		 */
		if (hash_search(pendingOpsTable, &entry->tag,
						HASH_REMOVE, NULL) == NULL)
			elog(ERROR, "pendingOpsTable corrupted");
	}							/* end loop over hashtable entries */

	/* Return sync performance metrics for report at checkpoint end */
	CheckpointStats.ckpt_sync_rels = processed;
	CheckpointStats.ckpt_longest_sync = longest;
	CheckpointStats.ckpt_agg_sync_time = total_elapsed;

	/* Flag successful completion of mdsync */
	mdsync_in_progress = false;
}
1194
1195 /*
1196  * mdpreckpt() -- Do pre-checkpoint work
1197  *
1198  * To distinguish unlink requests that arrived before this checkpoint
1199  * started from those that arrived during the checkpoint, we use a cycle
1200  * counter similar to the one we use for fsync requests. That cycle
1201  * counter is incremented here.
1202  *
1203  * This must be called *before* the checkpoint REDO point is determined.
1204  * That ensures that we won't delete files too soon.
1205  *
1206  * Note that we can't do anything here that depends on the assumption
1207  * that the checkpoint will be completed.
1208  */
1209 void
1210 mdpreckpt(void)
1211 {
1212         ListCell   *cell;
1213
1214         /*
1215          * In case the prior checkpoint wasn't completed, stamp all entries in the
1216          * list with the current cycle counter.  Anything that's in the list at
1217          * the start of checkpoint can surely be deleted after the checkpoint is
1218          * finished, regardless of when the request was made.
1219          */
1220         foreach(cell, pendingUnlinks)
1221         {
1222                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
1223
1224                 entry->cycle_ctr = mdckpt_cycle_ctr;
1225         }
1226
1227         /*
1228          * Any unlink requests arriving after this point will be assigned the next
1229          * cycle counter, and won't be unlinked until next checkpoint.
1230          */
1231         mdckpt_cycle_ctr++;
1232 }
1233
1234 /*
1235  * mdpostckpt() -- Do post-checkpoint work
1236  *
1237  * Remove any lingering files that can now be safely removed.
1238  */
1239 void
1240 mdpostckpt(void)
1241 {
1242         while (pendingUnlinks != NIL)
1243         {
1244                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
1245                 char       *path;
1246
1247                 /*
1248                  * New entries are appended to the end, so if the entry is new we've
1249                  * reached the end of old entries.
1250                  */
1251                 if (entry->cycle_ctr == mdckpt_cycle_ctr)
1252                         break;
1253
1254                 /* Else assert we haven't missed it */
1255                 Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr);
1256
1257                 /* Unlink the file */
1258                 path = relpathperm(entry->rnode, MAIN_FORKNUM);
1259                 if (unlink(path) < 0)
1260                 {
1261                         /*
1262                          * There's a race condition, when the database is dropped at the
1263                          * same time that we process the pending unlink requests. If the
1264                          * DROP DATABASE deletes the file before we do, we will get ENOENT
1265                          * here. rmtree() also has to ignore ENOENT errors, to deal with
1266                          * the possibility that we delete the file first.
1267                          */
1268                         if (errno != ENOENT)
1269                                 ereport(WARNING,
1270                                                 (errcode_for_file_access(),
1271                                                  errmsg("could not remove file \"%s\": %m", path)));
1272                 }
1273                 pfree(path);
1274
1275                 pendingUnlinks = list_delete_first(pendingUnlinks);
1276                 pfree(entry);
1277         }
1278 }
1279
1280 /*
1281  * register_dirty_segment() -- Mark a relation segment as needing fsync
1282  *
1283  * If there is a local pending-ops table, just make an entry in it for
1284  * mdsync to process later.  Otherwise, try to pass off the fsync request
1285  * to the checkpointer process.  If that fails, just do the fsync
1286  * locally before returning (we hope this will not happen often enough
1287  * to be a performance problem).
1288  */
1289 static void
1290 register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
1291 {
1292         /* Temp relations should never be fsync'd */
1293         Assert(!SmgrIsTemp(reln));
1294
1295         if (pendingOpsTable)
1296         {
1297                 /* push it into local pending-ops table */
1298                 RememberFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno);
1299         }
1300         else
1301         {
1302                 if (ForwardFsyncRequest(reln->smgr_rnode.node, forknum, seg->mdfd_segno))
1303                         return;                         /* passed it off successfully */
1304
1305                 ereport(DEBUG1,
1306                                 (errmsg("could not forward fsync request because request queue is full")));
1307
1308                 if (FileSync(seg->mdfd_vfd) < 0)
1309                         ereport(ERROR,
1310                                         (errcode_for_file_access(),
1311                                          errmsg("could not fsync file \"%s\": %m",
1312                                                         FilePathName(seg->mdfd_vfd))));
1313         }
1314 }
1315
1316 /*
1317  * register_unlink() -- Schedule a file to be deleted after next checkpoint
1318  *
1319  * We don't bother passing in the fork number, because this is only used
1320  * with main forks.
1321  *
1322  * As with register_dirty_segment, this could involve either a local or
1323  * a remote pending-ops table.
1324  */
1325 static void
1326 register_unlink(RelFileNodeBackend rnode)
1327 {
1328         /* Should never be used with temp relations */
1329         Assert(!RelFileNodeBackendIsTemp(rnode));
1330
1331         if (pendingOpsTable)
1332         {
1333                 /* push it into local pending-ops table */
1334                 RememberFsyncRequest(rnode.node, MAIN_FORKNUM,
1335                                                          UNLINK_RELATION_REQUEST);
1336         }
1337         else
1338         {
1339                 /*
1340                  * Notify the checkpointer about it.  If we fail to queue the request
1341                  * message, we have to sleep and try again, because we can't simply
1342                  * delete the file now.  Ugly, but hopefully won't happen often.
1343                  *
1344                  * XXX should we just leave the file orphaned instead?
1345                  */
1346                 Assert(IsUnderPostmaster);
1347                 while (!ForwardFsyncRequest(rnode.node, MAIN_FORKNUM,
1348                                                                         UNLINK_RELATION_REQUEST))
1349                         pg_usleep(10000L);      /* 10 msec seems a good number */
1350         }
1351 }
1352
/*
 * RememberFsyncRequest() -- callback from checkpointer side of fsync request
 *
 * We stuff most fsync requests into the local hash table for execution
 * during the checkpointer's next checkpoint.  UNLINK requests go into a
 * separate linked list, however, because they get processed separately.
 *
 * The range of possible segment numbers is way less than the range of
 * BlockNumber, so we can reserve high values of segno for special purposes.
 * We define three:
 * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation,
 *   either for one fork, or all forks if forknum is InvalidForkNumber
 * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
 * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
 *   checkpoint.
 *
 * (Handling the FORGET_* requests is a tad slow because the hash table has
 * to be searched linearly, but it doesn't seem worth rethinking the table
 * structure for them.)
 *
 * Note: this must only run in a process that owns a pendingOpsTable
 * (the checkpointer, a standalone backend, or the startup process).
 */
void
RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
{
	Assert(pendingOpsTable);

	if (segno == FORGET_RELATION_FSYNC)
	{
		/* Remove any pending requests for the relation (one or all forks) */
		HASH_SEQ_STATUS hstat;
		PendingOperationEntry *entry;

		/*
		 * Linear scan of the whole table; entries are only marked canceled,
		 * not removed, so it's safe to do this inside a seq scan.
		 */
		hash_seq_init(&hstat, pendingOpsTable);
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
		{
			if (RelFileNodeEquals(entry->tag.rnode, rnode) &&
				(entry->tag.forknum == forknum ||
				 forknum == InvalidForkNumber))
			{
				/* Okay, cancel this entry */
				entry->canceled = true;
			}
		}
	}
	else if (segno == FORGET_DATABASE_FSYNC)
	{
		/* Remove any pending requests for the entire database */
		HASH_SEQ_STATUS hstat;
		PendingOperationEntry *entry;
		ListCell   *cell,
				   *prev,
				   *next;

		/* Remove fsync requests (match on dbNode only) */
		hash_seq_init(&hstat, pendingOpsTable);
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
		{
			if (entry->tag.rnode.dbNode == rnode.dbNode)
			{
				/* Okay, cancel this entry */
				entry->canceled = true;
			}
		}

		/*
		 * Remove unlink requests.  We must track the previous cell so
		 * list_delete_cell can splice the list without a rescan; fetch
		 * "next" before deleting since deletion invalidates the cell.
		 */
		prev = NULL;
		for (cell = list_head(pendingUnlinks); cell; cell = next)
		{
			PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);

			next = lnext(cell);
			if (entry->rnode.dbNode == rnode.dbNode)
			{
				pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
				pfree(entry);
			}
			else
				prev = cell;
		}
	}
	else if (segno == UNLINK_RELATION_REQUEST)
	{
		/* Unlink request: put it in the linked list */
		MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
		PendingUnlinkEntry *entry;

		/* PendingUnlinkEntry doesn't store forknum, since it's always MAIN */
		Assert(forknum == MAIN_FORKNUM);

		/* allocated in MdCxt so it survives until the next checkpoint */
		entry = palloc(sizeof(PendingUnlinkEntry));
		entry->rnode = rnode;
		entry->cycle_ctr = mdckpt_cycle_ctr;

		pendingUnlinks = lappend(pendingUnlinks, entry);

		MemoryContextSwitchTo(oldcxt);
	}
	else
	{
		/* Normal case: enter a request to fsync this segment */
		PendingOperationTag key;
		PendingOperationEntry *entry;
		bool		found;

		/* ensure any pad bytes in the hash key are zeroed */
		MemSet(&key, 0, sizeof(key));
		key.rnode = rnode;
		key.forknum = forknum;
		key.segno = segno;

		entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
													  &key,
													  HASH_ENTER,
													  &found);
		/* if new or previously canceled entry, initialize it */
		if (!found || entry->canceled)
		{
			entry->canceled = false;
			entry->cycle_ctr = mdsync_cycle_ctr;
		}

		/*
		 * NB: it's intentional that we don't change cycle_ctr if the entry
		 * already exists.  The fsync request must be treated as old, even
		 * though the new request will be satisfied too by any subsequent
		 * fsync.
		 *
		 * However, if the entry is present but is marked canceled, we should
		 * act just as though it wasn't there.  The only case where this could
		 * happen would be if a file had been deleted, we received but did not
		 * yet act on the cancel request, and the same relfilenode was then
		 * assigned to a new file.  We mustn't lose the new request, but it
		 * should be considered new not old.
		 */
	}
}
1488
1489 /*
1490  * ForgetRelationFsyncRequests -- forget any fsyncs for a relation fork
1491  *
1492  * forknum == InvalidForkNumber means all forks, although this code doesn't
1493  * actually know that, since it's just forwarding the request elsewhere.
1494  */
1495 void
1496 ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
1497 {
1498         if (pendingOpsTable)
1499         {
1500                 /* standalone backend or startup process: fsync state is local */
1501                 RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC);
1502         }
1503         else if (IsUnderPostmaster)
1504         {
1505                 /*
1506                  * Notify the checkpointer about it.  If we fail to queue the revoke
1507                  * message, we have to sleep and try again ... ugly, but hopefully
1508                  * won't happen often.
1509                  *
1510                  * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
1511                  * error would leave the no-longer-used file still present on disk,
1512                  * which would be bad, so I'm inclined to assume that the checkpointer
1513                  * will always empty the queue soon.
1514                  */
1515                 while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
1516                         pg_usleep(10000L);      /* 10 msec seems a good number */
1517
1518                 /*
1519                  * Note we don't wait for the checkpointer to actually absorb the
1520                  * revoke message; see mdsync() for the implications.
1521                  */
1522         }
1523 }
1524
1525 /*
1526  * ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB
1527  */
1528 void
1529 ForgetDatabaseFsyncRequests(Oid dbid)
1530 {
1531         RelFileNode rnode;
1532
1533         rnode.dbNode = dbid;
1534         rnode.spcNode = 0;
1535         rnode.relNode = 0;
1536
1537         if (pendingOpsTable)
1538         {
1539                 /* standalone backend or startup process: fsync state is local */
1540                 RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC);
1541         }
1542         else if (IsUnderPostmaster)
1543         {
1544                 /* see notes in ForgetRelationFsyncRequests */
1545                 while (!ForwardFsyncRequest(rnode, InvalidForkNumber,
1546                                                                         FORGET_DATABASE_FSYNC))
1547                         pg_usleep(10000L);      /* 10 msec seems a good number */
1548         }
1549 }
1550
1551
1552 /*
1553  *      _fdvec_alloc() -- Make a MdfdVec object.
1554  */
1555 static MdfdVec *
1556 _fdvec_alloc(void)
1557 {
1558         return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
1559 }
1560
1561 /*
1562  * Return the filename for the specified segment of the relation. The
1563  * returned string is palloc'd.
1564  */
1565 static char *
1566 _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
1567 {
1568         char       *path,
1569                            *fullpath;
1570
1571         path = relpath(reln->smgr_rnode, forknum);
1572
1573         if (segno > 0)
1574         {
1575                 /* be sure we have enough space for the '.segno' */
1576                 fullpath = (char *) palloc(strlen(path) + 12);
1577                 sprintf(fullpath, "%s.%u", path, segno);
1578                 pfree(path);
1579         }
1580         else
1581                 fullpath = path;
1582
1583         return fullpath;
1584 }
1585
1586 /*
1587  * Open the specified segment of the relation,
1588  * and make a MdfdVec object for it.  Returns NULL on failure.
1589  */
1590 static MdfdVec *
1591 _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
1592                           int oflags)
1593 {
1594         MdfdVec    *v;
1595         int                     fd;
1596         char       *fullpath;
1597
1598         fullpath = _mdfd_segpath(reln, forknum, segno);
1599
1600         /* open the file */
1601         fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);
1602
1603         pfree(fullpath);
1604
1605         if (fd < 0)
1606                 return NULL;
1607
1608         if (reln->smgr_transient)
1609                 FileSetTransient(fd);
1610
1611         /* allocate an mdfdvec entry for it */
1612         v = _fdvec_alloc();
1613
1614         /* fill the entry */
1615         v->mdfd_vfd = fd;
1616         v->mdfd_segno = segno;
1617         v->mdfd_chain = NULL;
1618         Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
1619
1620         /* all done */
1621         return v;
1622 }
1623
1624 /*
1625  *      _mdfd_getseg() -- Find the segment of the relation holding the
1626  *              specified block.
1627  *
1628  * If the segment doesn't exist, we ereport, return NULL, or create the
1629  * segment, according to "behavior".  Note: skipFsync is only used in the
1630  * EXTENSION_CREATE case.
1631  */
1632 static MdfdVec *
1633 _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
1634                          bool skipFsync, ExtensionBehavior behavior)
1635 {
1636         MdfdVec    *v = mdopen(reln, forknum, behavior);
1637         BlockNumber targetseg;
1638         BlockNumber nextsegno;
1639
1640         if (!v)
1641                 return NULL;                    /* only possible if EXTENSION_RETURN_NULL */
1642
1643         targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1644         for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
1645         {
1646                 Assert(nextsegno == v->mdfd_segno + 1);
1647
1648                 if (v->mdfd_chain == NULL)
1649                 {
1650                         /*
1651                          * Normally we will create new segments only if authorized by the
1652                          * caller (i.e., we are doing mdextend()).      But when doing WAL
1653                          * recovery, create segments anyway; this allows cases such as
1654                          * replaying WAL data that has a write into a high-numbered
1655                          * segment of a relation that was later deleted.  We want to go
1656                          * ahead and create the segments so we can finish out the replay.
1657                          *
1658                          * We have to maintain the invariant that segments before the last
1659                          * active segment are of size RELSEG_SIZE; therefore, pad them out
1660                          * with zeroes if needed.  (This only matters if caller is
1661                          * extending the relation discontiguously, but that can happen in
1662                          * hash indexes.)
1663                          */
1664                         if (behavior == EXTENSION_CREATE || InRecovery)
1665                         {
1666                                 if (_mdnblocks(reln, forknum, v) < RELSEG_SIZE)
1667                                 {
1668                                         char       *zerobuf = palloc0(BLCKSZ);
1669
1670                                         mdextend(reln, forknum,
1671                                                          nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
1672                                                          zerobuf, skipFsync);
1673                                         pfree(zerobuf);
1674                                 }
1675                                 v->mdfd_chain = _mdfd_openseg(reln, forknum, +nextsegno, O_CREAT);
1676                         }
1677                         else
1678                         {
1679                                 /* We won't create segment if not existent */
1680                                 v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, 0);
1681                         }
1682                         if (v->mdfd_chain == NULL)
1683                         {
1684                                 if (behavior == EXTENSION_RETURN_NULL &&
1685                                         FILE_POSSIBLY_DELETED(errno))
1686                                         return NULL;
1687                                 ereport(ERROR,
1688                                                 (errcode_for_file_access(),
1689                                    errmsg("could not open file \"%s\" (target block %u): %m",
1690                                                   _mdfd_segpath(reln, forknum, nextsegno),
1691                                                   blkno)));
1692                         }
1693                 }
1694                 v = v->mdfd_chain;
1695         }
1696         return v;
1697 }
1698
1699 /*
1700  * Get number of blocks present in a single disk file
1701  */
1702 static BlockNumber
1703 _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
1704 {
1705         off_t           len;
1706
1707         len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
1708         if (len < 0)
1709                 ereport(ERROR,
1710                                 (errcode_for_file_access(),
1711                                  errmsg("could not seek to end of file \"%s\": %m",
1712                                                 FilePathName(seg->mdfd_vfd))));
1713         /* note that this calculation will ignore any partial block at EOF */
1714         return (BlockNumber) (len / BLCKSZ);
1715 }