]> granicus.if.org Git - postgresql/blob - src/backend/storage/smgr/md.c
e5dec9d2a329b1a36e82c4ed62f2ba6be48217c6
[postgresql] / src / backend / storage / smgr / md.c
1 /*-------------------------------------------------------------------------
2  *
3  * md.c
4  *        This code manages relations that reside on magnetic disk.
5  *
6  * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *        src/backend/storage/smgr/md.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16
17 #include <unistd.h>
18 #include <fcntl.h>
19 #include <sys/file.h>
20
21 #include "miscadmin.h"
22 #include "access/xlog.h"
23 #include "catalog/catalog.h"
24 #include "portability/instr_time.h"
25 #include "postmaster/bgwriter.h"
26 #include "storage/fd.h"
27 #include "storage/bufmgr.h"
28 #include "storage/relfilenode.h"
29 #include "storage/smgr.h"
30 #include "utils/hsearch.h"
31 #include "utils/memutils.h"
32 #include "pg_trace.h"
33
34
35 /* interval for calling AbsorbFsyncRequests in mdsync */
36 #define FSYNCS_PER_ABSORB               10
37
38 /*
39  * Special values for the segno arg to RememberFsyncRequest.
40  *
41  * Note that CompactCheckpointerRequestQueue assumes that it's OK to remove an
42  * fsync request from the queue if an identical, subsequent request is found.
43  * See comments there before making changes here.
44  */
45 #define FORGET_RELATION_FSYNC   (InvalidBlockNumber)
46 #define FORGET_DATABASE_FSYNC   (InvalidBlockNumber-1)
47 #define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
48
49 /*
50  * On Windows, we have to interpret EACCES as possibly meaning the same as
51  * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
52  * that's what you get.  Ugh.  This code is designed so that we don't
53  * actually believe these cases are okay without further evidence (namely,
54  * a pending fsync request getting revoked ... see mdsync).
55  */
56 #ifndef WIN32
57 #define FILE_POSSIBLY_DELETED(err)      ((err) == ENOENT)
58 #else
59 #define FILE_POSSIBLY_DELETED(err)      ((err) == ENOENT || (err) == EACCES)
60 #endif
61
62 /*
63  *      The magnetic disk storage manager keeps track of open file
64  *      descriptors in its own descriptor pool.  This is done to make it
65  *      easier to support relations that are larger than the operating
66  *      system's file size limit (often 2GBytes).  In order to do that,
67  *      we break relations up into "segment" files that are each shorter than
68  *      the OS file size limit.  The segment size is set by the RELSEG_SIZE
69  *      configuration constant in pg_config.h.
70  *
71  *      On disk, a relation must consist of consecutively numbered segment
72  *      files in the pattern
73  *              -- Zero or more full segments of exactly RELSEG_SIZE blocks each
74  *              -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
75  *              -- Optionally, any number of inactive segments of size 0 blocks.
76  *      The full and partial segments are collectively the "active" segments.
77  *      Inactive segments are those that once contained data but are currently
78  *      not needed because of an mdtruncate() operation.  The reason for leaving
79  *      them present at size zero, rather than unlinking them, is that other
80  *      backends and/or the checkpointer might be holding open file references to
81  *      such segments.  If the relation expands again after mdtruncate(), such
82  *      that a deactivated segment becomes active again, it is important that
83  *      such file references still be valid --- else data might get written
84  *      out to an unlinked old copy of a segment file that will eventually
85  *      disappear.
86  *
87  *      The file descriptor pointer (md_fd field) stored in the SMgrRelation
88  *      cache is, therefore, just the head of a list of MdfdVec objects, one
89  *      per segment.  But note the md_fd pointer can be NULL, indicating
90  *      relation not open.
91  *
92  *      Also note that mdfd_chain == NULL does not necessarily mean the relation
93  *      doesn't have another segment after this one; we may just not have
94  *      opened the next segment yet.  (We could not have "all segments are
95  *      in the chain" as an invariant anyway, since another backend could
96  *      extend the relation when we weren't looking.)  We do not make chain
97  *      entries for inactive segments, however; as soon as we find a partial
98  *      segment, we assume that any subsequent segments are inactive.
99  *
100  *      All MdfdVec objects are palloc'd in the MdCxt memory context.
101  */
102
103 typedef struct _MdfdVec
104 {
105         File            mdfd_vfd;               /* fd number in fd.c's pool */
106         BlockNumber mdfd_segno;         /* segment number, from 0 */
107         struct _MdfdVec *mdfd_chain;    /* next segment, or NULL */
108 } MdfdVec;
109
110 static MemoryContext MdCxt;             /* context for all md.c allocations */
111
112
113 /*
114  * In some contexts (currently, standalone backends and the checkpointer process)
115  * we keep track of pending fsync operations: we need to remember all relation
116  * segments that have been written since the last checkpoint, so that we can
117  * fsync them down to disk before completing the next checkpoint.  This hash
118  * table remembers the pending operations.      We use a hash table mostly as
119  * a convenient way of eliminating duplicate requests.
120  *
121  * We use a similar mechanism to remember no-longer-needed files that can
122  * be deleted after the next checkpoint, but we use a linked list instead of
123  * a hash table, because we don't expect there to be any duplicate requests.
124  *
125  * (Regular backends do not track pending operations locally, but forward
126  * them to the checkpointer.)
127  */
128 typedef struct
129 {
130         RelFileNodeBackend rnode;       /* the targeted relation */
131         ForkNumber      forknum;
132         BlockNumber segno;                      /* which segment */
133 } PendingOperationTag;
134
135 typedef uint16 CycleCtr;                /* can be any convenient integer size */
136
137 typedef struct
138 {
139         PendingOperationTag tag;        /* hash table key (must be first!) */
140         bool            canceled;               /* T => request canceled, not yet removed */
141         CycleCtr        cycle_ctr;              /* mdsync_cycle_ctr when request was made */
142 } PendingOperationEntry;
143
144 typedef struct
145 {
146         RelFileNodeBackend rnode;       /* the dead relation to delete */
147         CycleCtr        cycle_ctr;              /* mdckpt_cycle_ctr when request was made */
148 } PendingUnlinkEntry;
149
150 static HTAB *pendingOpsTable = NULL;
151 static List *pendingUnlinks = NIL;
152
153 static CycleCtr mdsync_cycle_ctr = 0;
154 static CycleCtr mdckpt_cycle_ctr = 0;
155
156
157 typedef enum                                    /* behavior for mdopen & _mdfd_getseg */
158 {
159         EXTENSION_FAIL,                         /* ereport if segment not present */
160         EXTENSION_RETURN_NULL,          /* return NULL if not present */
161         EXTENSION_CREATE                        /* create new segments as needed */
162 } ExtensionBehavior;
163
164 /* local routines */
165 static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum,
166            ExtensionBehavior behavior);
167 static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
168                                            MdfdVec *seg);
169 static void register_unlink(RelFileNodeBackend rnode);
170 static MdfdVec *_fdvec_alloc(void);
171 static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
172                           BlockNumber segno);
173 static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno,
174                           BlockNumber segno, int oflags);
175 static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno,
176                          BlockNumber blkno, bool skipFsync, ExtensionBehavior behavior);
177 static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
178                    MdfdVec *seg);
179
180
181 /*
182  *      mdinit() -- Initialize private state for magnetic disk storage manager.
183  */
184 void
185 mdinit(void)
186 {
187         MdCxt = AllocSetContextCreate(TopMemoryContext,
188                                                                   "MdSmgr",
189                                                                   ALLOCSET_DEFAULT_MINSIZE,
190                                                                   ALLOCSET_DEFAULT_INITSIZE,
191                                                                   ALLOCSET_DEFAULT_MAXSIZE);
192
193         /*
194          * Create pending-operations hashtable if we need it.  Currently, we need
195          * it if we are standalone (not under a postmaster) OR if we are a
196          * bootstrap-mode subprocess of a postmaster (that is, a startup or
197          * checkpointer process).
198          */
199         if (!IsUnderPostmaster || IsBootstrapProcessingMode())
200         {
201                 HASHCTL         hash_ctl;
202
203                 MemSet(&hash_ctl, 0, sizeof(hash_ctl));
204                 hash_ctl.keysize = sizeof(PendingOperationTag);
205                 hash_ctl.entrysize = sizeof(PendingOperationEntry);
206                 hash_ctl.hash = tag_hash;
207                 hash_ctl.hcxt = MdCxt;
208                 pendingOpsTable = hash_create("Pending Ops Table",
209                                                                           100L,
210                                                                           &hash_ctl,
211                                                                    HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
212                 pendingUnlinks = NIL;
213         }
214 }
215
216 /*
217  * In archive recovery, we rely on checkpointer to do fsyncs, but we will have
218  * already created the pendingOpsTable during initialization of the startup
219  * process.  Calling this function drops the local pendingOpsTable so that
220  * subsequent requests will be forwarded to checkpointer.
221  */
222 void
223 SetForwardFsyncRequests(void)
224 {
225         /* Perform any pending ops we may have queued up */
226         if (pendingOpsTable)
227                 mdsync();
228         pendingOpsTable = NULL;
229 }
230
231 /*
232  *      mdexists() -- Does the physical file exist?
233  *
234  * Note: this will return true for lingering files, with pending deletions
235  */
236 bool
237 mdexists(SMgrRelation reln, ForkNumber forkNum)
238 {
239         /*
240          * Close it first, to ensure that we notice if the fork has been unlinked
241          * since we opened it.
242          */
243         mdclose(reln, forkNum);
244
245         return (mdopen(reln, forkNum, EXTENSION_RETURN_NULL) != NULL);
246 }
247
248 /*
249  *      mdcreate() -- Create a new relation on magnetic disk.
250  *
251  * If isRedo is true, it's okay for the relation to exist already.
252  */
253 void
254 mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
255 {
256         char       *path;
257         File            fd;
258
259         if (isRedo && reln->md_fd[forkNum] != NULL)
260                 return;                                 /* created and opened already... */
261
262         Assert(reln->md_fd[forkNum] == NULL);
263
264         path = relpath(reln->smgr_rnode, forkNum);
265
266         fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
267
268         if (fd < 0)
269         {
270                 int                     save_errno = errno;
271
272                 /*
273                  * During bootstrap, there are cases where a system relation will be
274                  * accessed (by internal backend processes) before the bootstrap
275                  * script nominally creates it.  Therefore, allow the file to exist
276                  * already, even if isRedo is not set.  (See also mdopen)
277                  */
278                 if (isRedo || IsBootstrapProcessingMode())
279                         fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
280                 if (fd < 0)
281                 {
282                         /* be sure to report the error reported by create, not open */
283                         errno = save_errno;
284                         ereport(ERROR,
285                                         (errcode_for_file_access(),
286                                          errmsg("could not create file \"%s\": %m", path)));
287                 }
288         }
289
290         pfree(path);
291
292         if (reln->smgr_transient)
293                 FileSetTransient(fd);
294
295         reln->md_fd[forkNum] = _fdvec_alloc();
296
297         reln->md_fd[forkNum]->mdfd_vfd = fd;
298         reln->md_fd[forkNum]->mdfd_segno = 0;
299         reln->md_fd[forkNum]->mdfd_chain = NULL;
300 }
301
302 /*
303  *      mdunlink() -- Unlink a relation.
304  *
305  * Note that we're passed a RelFileNode --- by the time this is called,
306  * there won't be an SMgrRelation hashtable entry anymore.
307  *
308  * Actually, we don't unlink the first segment file of the relation, but
309  * just truncate it to zero length, and record a request to unlink it after
310  * the next checkpoint.  Additional segments can be unlinked immediately,
311  * however.  Leaving the empty file in place prevents that relfilenode
312  * number from being reused.  The scenario this protects us from is:
313  * 1. We delete a relation (and commit, and actually remove its file).
314  * 2. We create a new relation, which by chance gets the same relfilenode as
315  *        the just-deleted one (OIDs must've wrapped around for that to happen).
316  * 3. We crash before another checkpoint occurs.
317  * During replay, we would delete the file and then recreate it, which is fine
318  * if the contents of the file were repopulated by subsequent WAL entries.
319  * But if we didn't WAL-log insertions, but instead relied on fsyncing the
320  * file after populating it (as for instance CLUSTER and CREATE INDEX do),
321  * the contents of the file would be lost forever.      By leaving the empty file
322  * until after the next checkpoint, we prevent reassignment of the relfilenode
323  * number until it's safe, because relfilenode assignment skips over any
324  * existing file.
325  *
326  * All the above applies only to the relation's main fork; other forks can
327  * just be removed immediately, since they are not needed to prevent the
328  * relfilenode number from being recycled.      Also, we do not carefully
329  * track whether other forks have been created or not, but just attempt to
330  * unlink them unconditionally; so we should never complain about ENOENT.
331  *
332  * If isRedo is true, it's unsurprising for the relation to be already gone.
333  * Also, we should remove the file immediately instead of queuing a request
334  * for later, since during redo there's no possibility of creating a
335  * conflicting relation.
336  *
337  * Note: any failure should be reported as WARNING not ERROR, because
338  * we are usually not in a transaction anymore when this is called.
339  */
340 void
341 mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
342 {
343         char       *path;
344         int                     ret;
345
346         /*
347          * We have to clean out any pending fsync requests for the doomed
348          * relation, else the next mdsync() will fail.
349          */
350         ForgetRelationFsyncRequests(rnode, forkNum);
351
352         path = relpath(rnode, forkNum);
353
354         /*
355          * Delete or truncate the first segment.
356          */
357         if (isRedo || forkNum != MAIN_FORKNUM)
358         {
359                 ret = unlink(path);
360                 if (ret < 0 && errno != ENOENT)
361                         ereport(WARNING,
362                                         (errcode_for_file_access(),
363                                          errmsg("could not remove file \"%s\": %m", path)));
364         }
365         else
366         {
367                 /* truncate(2) would be easier here, but Windows hasn't got it */
368                 int                     fd;
369
370                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);
371                 if (fd >= 0)
372                 {
373                         int                     save_errno;
374
375                         ret = ftruncate(fd, 0);
376                         save_errno = errno;
377                         close(fd);
378                         errno = save_errno;
379                 }
380                 else
381                         ret = -1;
382                 if (ret < 0 && errno != ENOENT)
383                         ereport(WARNING,
384                                         (errcode_for_file_access(),
385                                          errmsg("could not truncate file \"%s\": %m", path)));
386
387                 /* Register request to unlink first segment later */
388                 register_unlink(rnode);
389         }
390
391         /*
392          * Delete any additional segments.
393          */
394         if (ret >= 0)
395         {
396                 char       *segpath = (char *) palloc(strlen(path) + 12);
397                 BlockNumber segno;
398
399                 /*
400                  * Note that because we loop until getting ENOENT, we will correctly
401                  * remove all inactive segments as well as active ones.
402                  */
403                 for (segno = 1;; segno++)
404                 {
405                         sprintf(segpath, "%s.%u", path, segno);
406                         if (unlink(segpath) < 0)
407                         {
408                                 /* ENOENT is expected after the last segment... */
409                                 if (errno != ENOENT)
410                                         ereport(WARNING,
411                                                         (errcode_for_file_access(),
412                                            errmsg("could not remove file \"%s\": %m", segpath)));
413                                 break;
414                         }
415                 }
416                 pfree(segpath);
417         }
418
419         pfree(path);
420 }
421
422 /*
423  *      mdextend() -- Add a block to the specified relation.
424  *
425  *              The semantics are nearly the same as mdwrite(): write at the
426  *              specified position.  However, this is to be used for the case of
427  *              extending a relation (i.e., blocknum is at or beyond the current
428  *              EOF).  Note that we assume writing a block beyond current EOF
429  *              causes intervening file space to become filled with zeroes.
430  */
431 void
432 mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
433                  char *buffer, bool skipFsync)
434 {
435         off_t           seekpos;
436         int                     nbytes;
437         MdfdVec    *v;
438
439         /* This assert is too expensive to have on normally ... */
440 #ifdef CHECK_WRITE_VS_EXTEND
441         Assert(blocknum >= mdnblocks(reln, forknum));
442 #endif
443
444         /*
445          * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
446          * more --- we mustn't create a block whose number actually is
447          * InvalidBlockNumber.
448          */
449         if (blocknum == InvalidBlockNumber)
450                 ereport(ERROR,
451                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
452                                  errmsg("cannot extend file \"%s\" beyond %u blocks",
453                                                 relpath(reln->smgr_rnode, forknum),
454                                                 InvalidBlockNumber)));
455
456         v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);
457
458         seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
459
460         Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
461
462         /*
463          * Note: because caller usually obtained blocknum by calling mdnblocks,
464          * which did a seek(SEEK_END), this seek is often redundant and will be
465          * optimized away by fd.c.      It's not redundant, however, if there is a
466          * partial page at the end of the file. In that case we want to try to
467          * overwrite the partial page with a full page.  It's also not redundant
468          * if bufmgr.c had to dump another buffer of the same file to make room
469          * for the new page's buffer.
470          */
471         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
472                 ereport(ERROR,
473                                 (errcode_for_file_access(),
474                                  errmsg("could not seek to block %u in file \"%s\": %m",
475                                                 blocknum, FilePathName(v->mdfd_vfd))));
476
477         if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
478         {
479                 if (nbytes < 0)
480                         ereport(ERROR,
481                                         (errcode_for_file_access(),
482                                          errmsg("could not extend file \"%s\": %m",
483                                                         FilePathName(v->mdfd_vfd)),
484                                          errhint("Check free disk space.")));
485                 /* short write: complain appropriately */
486                 ereport(ERROR,
487                                 (errcode(ERRCODE_DISK_FULL),
488                                  errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
489                                                 FilePathName(v->mdfd_vfd),
490                                                 nbytes, BLCKSZ, blocknum),
491                                  errhint("Check free disk space.")));
492         }
493
494         if (!skipFsync && !SmgrIsTemp(reln))
495                 register_dirty_segment(reln, forknum, v);
496
497         Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
498 }
499
500 /*
501  *      mdopen() -- Open the specified relation.
502  *
503  * Note we only open the first segment, when there are multiple segments.
504  *
505  * If first segment is not present, either ereport or return NULL according
506  * to "behavior".  We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
507  * EXTENSION_CREATE means it's OK to extend an existing relation, not to
508  * invent one out of whole cloth.
509  */
510 static MdfdVec *
511 mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior)
512 {
513         MdfdVec    *mdfd;
514         char       *path;
515         File            fd;
516
517         /* No work if already open */
518         if (reln->md_fd[forknum])
519                 return reln->md_fd[forknum];
520
521         path = relpath(reln->smgr_rnode, forknum);
522
523         fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
524
525         if (fd < 0)
526         {
527                 /*
528                  * During bootstrap, there are cases where a system relation will be
529                  * accessed (by internal backend processes) before the bootstrap
530                  * script nominally creates it.  Therefore, accept mdopen() as a
531                  * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
532                  */
533                 if (IsBootstrapProcessingMode())
534                         fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
535                 if (fd < 0)
536                 {
537                         if (behavior == EXTENSION_RETURN_NULL &&
538                                 FILE_POSSIBLY_DELETED(errno))
539                         {
540                                 pfree(path);
541                                 return NULL;
542                         }
543                         ereport(ERROR,
544                                         (errcode_for_file_access(),
545                                          errmsg("could not open file \"%s\": %m", path)));
546                 }
547         }
548
549         pfree(path);
550
551         if (reln->smgr_transient)
552                 FileSetTransient(fd);
553
554         reln->md_fd[forknum] = mdfd = _fdvec_alloc();
555
556         mdfd->mdfd_vfd = fd;
557         mdfd->mdfd_segno = 0;
558         mdfd->mdfd_chain = NULL;
559         Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
560
561         return mdfd;
562 }
563
564 /*
565  *      mdclose() -- Close the specified relation, if it isn't closed already.
566  */
567 void
568 mdclose(SMgrRelation reln, ForkNumber forknum)
569 {
570         MdfdVec    *v = reln->md_fd[forknum];
571
572         /* No work if already closed */
573         if (v == NULL)
574                 return;
575
576         reln->md_fd[forknum] = NULL;    /* prevent dangling pointer after error */
577
578         while (v != NULL)
579         {
580                 MdfdVec    *ov = v;
581
582                 /* if not closed already */
583                 if (v->mdfd_vfd >= 0)
584                         FileClose(v->mdfd_vfd);
585                 /* Now free vector */
586                 v = v->mdfd_chain;
587                 pfree(ov);
588         }
589 }
590
591 /*
592  *      mdprefetch() -- Initiate asynchronous read of the specified block of a relation
593  */
594 void
595 mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
596 {
597 #ifdef USE_PREFETCH
598         off_t           seekpos;
599         MdfdVec    *v;
600
601         v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);
602
603         seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
604
605         Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
606
607         (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ);
608 #endif   /* USE_PREFETCH */
609 }
610
611
612 /*
613  *      mdread() -- Read the specified block from a relation.
614  */
615 void
616 mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
617            char *buffer)
618 {
619         off_t           seekpos;
620         int                     nbytes;
621         MdfdVec    *v;
622
623         TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
624                                                                                 reln->smgr_rnode.node.spcNode,
625                                                                                 reln->smgr_rnode.node.dbNode,
626                                                                                 reln->smgr_rnode.node.relNode,
627                                                                                 reln->smgr_rnode.backend);
628
629         v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);
630
631         seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
632
633         Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
634
635         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
636                 ereport(ERROR,
637                                 (errcode_for_file_access(),
638                                  errmsg("could not seek to block %u in file \"%s\": %m",
639                                                 blocknum, FilePathName(v->mdfd_vfd))));
640
641         nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ);
642
643         TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
644                                                                            reln->smgr_rnode.node.spcNode,
645                                                                            reln->smgr_rnode.node.dbNode,
646                                                                            reln->smgr_rnode.node.relNode,
647                                                                            reln->smgr_rnode.backend,
648                                                                            nbytes,
649                                                                            BLCKSZ);
650
651         if (nbytes != BLCKSZ)
652         {
653                 if (nbytes < 0)
654                         ereport(ERROR,
655                                         (errcode_for_file_access(),
656                                          errmsg("could not read block %u in file \"%s\": %m",
657                                                         blocknum, FilePathName(v->mdfd_vfd))));
658
659                 /*
660                  * Short read: we are at or past EOF, or we read a partial block at
661                  * EOF.  Normally this is an error; upper levels should never try to
662                  * read a nonexistent block.  However, if zero_damaged_pages is ON or
663                  * we are InRecovery, we should instead return zeroes without
664                  * complaining.  This allows, for example, the case of trying to
665                  * update a block that was later truncated away.
666                  */
667                 if (zero_damaged_pages || InRecovery)
668                         MemSet(buffer, 0, BLCKSZ);
669                 else
670                         ereport(ERROR,
671                                         (errcode(ERRCODE_DATA_CORRUPTED),
672                                          errmsg("could not read block %u in file \"%s\": read only %d of %d bytes",
673                                                         blocknum, FilePathName(v->mdfd_vfd),
674                                                         nbytes, BLCKSZ)));
675         }
676 }
677
678 /*
679  *      mdwrite() -- Write the supplied block at the appropriate location.
680  *
681  *              This is to be used only for updating already-existing blocks of a
682  *              relation (ie, those before the current EOF).  To extend a relation,
683  *              use mdextend().
684  */
685 void
686 mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
687                 char *buffer, bool skipFsync)
688 {
689         off_t           seekpos;
690         int                     nbytes;
691         MdfdVec    *v;
692
693         /* This assert is too expensive to have on normally ... */
694 #ifdef CHECK_WRITE_VS_EXTEND
695         Assert(blocknum < mdnblocks(reln, forknum));
696 #endif
697
698         TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
699                                                                                  reln->smgr_rnode.node.spcNode,
700                                                                                  reln->smgr_rnode.node.dbNode,
701                                                                                  reln->smgr_rnode.node.relNode,
702                                                                                  reln->smgr_rnode.backend);
703
704         v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_FAIL);
705
706         seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
707
708         Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
709
710         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
711                 ereport(ERROR,
712                                 (errcode_for_file_access(),
713                                  errmsg("could not seek to block %u in file \"%s\": %m",
714                                                 blocknum, FilePathName(v->mdfd_vfd))));
715
716         nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ);
717
718         TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
719                                                                                 reln->smgr_rnode.node.spcNode,
720                                                                                 reln->smgr_rnode.node.dbNode,
721                                                                                 reln->smgr_rnode.node.relNode,
722                                                                                 reln->smgr_rnode.backend,
723                                                                                 nbytes,
724                                                                                 BLCKSZ);
725
726         if (nbytes != BLCKSZ)
727         {
728                 if (nbytes < 0)
729                         ereport(ERROR,
730                                         (errcode_for_file_access(),
731                                          errmsg("could not write block %u in file \"%s\": %m",
732                                                         blocknum, FilePathName(v->mdfd_vfd))));
733                 /* short write: complain appropriately */
734                 ereport(ERROR,
735                                 (errcode(ERRCODE_DISK_FULL),
736                                  errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes",
737                                                 blocknum,
738                                                 FilePathName(v->mdfd_vfd),
739                                                 nbytes, BLCKSZ),
740                                  errhint("Check free disk space.")));
741         }
742
743         if (!skipFsync && !SmgrIsTemp(reln))
744                 register_dirty_segment(reln, forknum, v);
745 }
746
/*
 *	mdnblocks() -- Get the number of blocks stored in a relation.
 *
 *		Important side effect: all active segments of the relation are opened
 *		and added to the mdfd_chain list.  If this routine has not been
 *		called, then only segments up to the last one actually touched
 *		are present in the chain.
 */
BlockNumber
mdnblocks(SMgrRelation reln, ForkNumber forknum)
{
	MdfdVec    *v = mdopen(reln, forknum, EXTENSION_FAIL);
	BlockNumber nblocks;
	BlockNumber segno = 0;		/* zero-based index of the segment in 'v' */

	/*
	 * Skip through any segments that aren't the last one, to avoid redundant
	 * seeks on them.  We have previously verified that these segments are
	 * exactly RELSEG_SIZE long, and it's useless to recheck that each time.
	 *
	 * NOTE: this assumption could only be wrong if another backend has
	 * truncated the relation.	We rely on higher code levels to handle that
	 * scenario by closing and re-opening the md fd, which is handled via
	 * relcache flush.	(Since the checkpointer doesn't participate in
	 * relcache flush, it could have segment chain entries for inactive
	 * segments; that's OK because the checkpointer never needs to compute
	 * relation size.)
	 */
	while (v->mdfd_chain != NULL)
	{
		segno++;
		v = v->mdfd_chain;
	}

	for (;;)
	{
		nblocks = _mdnblocks(reln, forknum, v);
		/* An over-long segment indicates on-disk corruption; bail out hard. */
		if (nblocks > ((BlockNumber) RELSEG_SIZE))
			elog(FATAL, "segment too big");
		/* A partly-full segment must be the last one; total is now known. */
		if (nblocks < ((BlockNumber) RELSEG_SIZE))
			return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;

		/*
		 * If segment is exactly RELSEG_SIZE, advance to next one.
		 */
		segno++;

		if (v->mdfd_chain == NULL)
		{
			/*
			 * Because we pass O_CREAT, we will create the next segment (with
			 * zero length) immediately, if the last segment is of length
			 * RELSEG_SIZE.  While perhaps not strictly necessary, this keeps
			 * the logic simple.
			 */
			v->mdfd_chain = _mdfd_openseg(reln, forknum, segno, O_CREAT);
			if (v->mdfd_chain == NULL)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not open file \"%s\": %m",
								_mdfd_segpath(reln, forknum, segno))));
		}

		v = v->mdfd_chain;
	}
}
813
/*
 *	mdtruncate() -- Truncate relation to specified number of blocks.
 *
 * 'nblocks' is the desired new relation length in blocks.  Requests to
 * "truncate" to more blocks than currently exist raise an error, except
 * during WAL recovery where they are silently ignored.
 */
void
mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
{
	MdfdVec    *v;
	BlockNumber curnblk;
	BlockNumber priorblocks;	/* blocks contained in segments before 'v' */

	/*
	 * NOTE: mdnblocks makes sure we have opened all active segments, so that
	 * truncation loop will get them all!
	 */
	curnblk = mdnblocks(reln, forknum);
	if (nblocks > curnblk)
	{
		/* Bogus request ... but no complaint if InRecovery */
		if (InRecovery)
			return;
		ereport(ERROR,
				(errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
						relpath(reln->smgr_rnode, forknum),
						nblocks, curnblk)));
	}
	if (nblocks == curnblk)
		return;					/* no work */

	v = mdopen(reln, forknum, EXTENSION_FAIL);

	/* Walk the whole segment chain, handling each segment by position. */
	priorblocks = 0;
	while (v != NULL)
	{
		MdfdVec    *ov = v;

		if (priorblocks > nblocks)
		{
			/*
			 * This segment is no longer active (and has already been unlinked
			 * from the mdfd_chain). We truncate the file, but do not delete
			 * it, for reasons explained in the header comments.
			 */
			if (FileTruncate(v->mdfd_vfd, 0) < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not truncate file \"%s\": %m",
								FilePathName(v->mdfd_vfd))));

			if (!SmgrIsTemp(reln))
				register_dirty_segment(reln, forknum, v);
			/* advance before freeing, since pfree invalidates ov->mdfd_chain */
			v = v->mdfd_chain;
			Assert(ov != reln->md_fd[forknum]); /* we never drop the 1st
												 * segment */
			pfree(ov);
		}
		else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
		{
			/*
			 * This is the last segment we want to keep. Truncate the file to
			 * the right length, and clear chain link that points to any
			 * remaining segments (which we shall zap). NOTE: if nblocks is
			 * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
			 * segment to 0 length but keep it. This adheres to the invariant
			 * given in the header comments.
			 */
			BlockNumber lastsegblocks = nblocks - priorblocks;

			if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ) < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
					errmsg("could not truncate file \"%s\" to %u blocks: %m",
						   FilePathName(v->mdfd_vfd),
						   nblocks)));
			if (!SmgrIsTemp(reln))
				register_dirty_segment(reln, forknum, v);
			v = v->mdfd_chain;
			ov->mdfd_chain = NULL;
		}
		else
		{
			/*
			 * We still need this segment and 0 or more blocks beyond it, so
			 * nothing to do here.
			 */
			v = v->mdfd_chain;
		}
		priorblocks += RELSEG_SIZE;
	}
}
903
904 /*
905  *      mdimmedsync() -- Immediately sync a relation to stable storage.
906  *
907  * Note that only writes already issued are synced; this routine knows
908  * nothing of dirty buffers that may exist inside the buffer manager.
909  */
910 void
911 mdimmedsync(SMgrRelation reln, ForkNumber forknum)
912 {
913         MdfdVec    *v;
914
915         /*
916          * NOTE: mdnblocks makes sure we have opened all active segments, so that
917          * fsync loop will get them all!
918          */
919         mdnblocks(reln, forknum);
920
921         v = mdopen(reln, forknum, EXTENSION_FAIL);
922
923         while (v != NULL)
924         {
925                 if (FileSync(v->mdfd_vfd) < 0)
926                         ereport(ERROR,
927                                         (errcode_for_file_access(),
928                                          errmsg("could not fsync file \"%s\": %m",
929                                                         FilePathName(v->mdfd_vfd))));
930                 v = v->mdfd_chain;
931         }
932 }
933
/*
 *	mdsync() -- Sync previous writes to stable storage.
 *
 * Executes all fsync requests accumulated in pendingOpsTable, absorbing
 * further requests from the shared queue along the way.  On successful
 * return, every request that was present at entry has been fsync'd or
 * canceled.  Sync timing statistics are reported via CheckpointStats.
 */
void
mdsync(void)
{
	/* true if a previous mdsync() attempt errored out partway through */
	static bool mdsync_in_progress = false;

	HASH_SEQ_STATUS hstat;
	PendingOperationEntry *entry;
	int			absorb_counter; /* countdown to next AbsorbFsyncRequests() */

	/* Statistics on sync times */
	int			processed = 0;
	instr_time	sync_start,
				sync_end,
				sync_diff;
	uint64		elapsed;
	uint64		longest = 0;
	uint64		total_elapsed = 0;

	/*
	 * This is only called during checkpoints, and checkpoints should only
	 * occur in processes that have created a pendingOpsTable.
	 */
	if (!pendingOpsTable)
		elog(ERROR, "cannot sync without a pendingOpsTable");

	/*
	 * If we are in the checkpointer, the sync had better include all fsync
	 * requests that were queued by backends up to this point.	The tightest
	 * race condition that could occur is that a buffer that must be written
	 * and fsync'd for the checkpoint could have been dumped by a backend just
	 * before it was visited by BufferSync().  We know the backend will have
	 * queued an fsync request before clearing the buffer's dirtybit, so we
	 * are safe as long as we do an Absorb after completing BufferSync().
	 */
	AbsorbFsyncRequests();

	/*
	 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
	 * checkpoint), we want to ignore fsync requests that are entered into the
	 * hashtable after this point --- they should be processed next time,
	 * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
	 * ones: new ones will have cycle_ctr equal to the incremented value of
	 * mdsync_cycle_ctr.
	 *
	 * In normal circumstances, all entries present in the table at this point
	 * will have cycle_ctr exactly equal to the current (about to be old)
	 * value of mdsync_cycle_ctr.  However, if we fail partway through the
	 * fsync'ing loop, then older values of cycle_ctr might remain when we
	 * come back here to try again.  Repeated checkpoint failures would
	 * eventually wrap the counter around to the point where an old entry
	 * might appear new, causing us to skip it, possibly allowing a checkpoint
	 * to succeed that should not have.  To forestall wraparound, any time the
	 * previous mdsync() failed to complete, run through the table and
	 * forcibly set cycle_ctr = mdsync_cycle_ctr.
	 *
	 * Think not to merge this loop with the main loop, as the problem is
	 * exactly that that loop may fail before having visited all the entries.
	 * From a performance point of view it doesn't matter anyway, as this path
	 * will never be taken in a system that's functioning normally.
	 */
	if (mdsync_in_progress)
	{
		/* prior try failed, so update any stale cycle_ctr values */
		hash_seq_init(&hstat, pendingOpsTable);
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
		{
			entry->cycle_ctr = mdsync_cycle_ctr;
		}
	}

	/* Advance counter so that new hashtable entries are distinguishable */
	mdsync_cycle_ctr++;

	/* Set flag to detect failure if we don't reach the end of the loop */
	mdsync_in_progress = true;

	/* Now scan the hashtable for fsync requests to process */
	absorb_counter = FSYNCS_PER_ABSORB;
	hash_seq_init(&hstat, pendingOpsTable);
	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
	{
		/*
		 * If the entry is new then don't process it this time.  Note that
		 * "continue" bypasses the hash-remove call at the bottom of the loop.
		 */
		if (entry->cycle_ctr == mdsync_cycle_ctr)
			continue;

		/* Else assert we haven't missed it */
		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);

		/*
		 * If fsync is off then we don't have to bother opening the file at
		 * all.  (We delay checking until this point so that changing fsync on
		 * the fly behaves sensibly.)  Also, if the entry is marked canceled,
		 * fall through to delete it.
		 */
		if (enableFsync && !entry->canceled)
		{
			int			failures;

			/*
			 * If in checkpointer, we want to absorb pending requests every so
			 * often to prevent overflow of the fsync request queue.  It is
			 * unspecified whether newly-added entries will be visited by
			 * hash_seq_search, but we don't care since we don't need to
			 * process them anyway.
			 */
			if (--absorb_counter <= 0)
			{
				AbsorbFsyncRequests();
				absorb_counter = FSYNCS_PER_ABSORB;
			}

			/*
			 * The fsync table could contain requests to fsync segments that
			 * have been deleted (unlinked) by the time we get to them. Rather
			 * than just hoping an ENOENT (or EACCES on Windows) error can be
			 * ignored, what we do on error is absorb pending requests and
			 * then retry.	Since mdunlink() queues a "revoke" message before
			 * actually unlinking, the fsync request is guaranteed to be
			 * marked canceled after the absorb if it really was this case.
			 * DROP DATABASE likewise has to tell us to forget fsync requests
			 * before it starts deletions.
			 */
			for (failures = 0;; failures++)		/* loop exits at "break" */
			{
				SMgrRelation reln;
				MdfdVec    *seg;
				char	   *path;

				/*
				 * Find or create an smgr hash entry for this relation. This
				 * may seem a bit unclean -- md calling smgr?  But it's really
				 * the best solution.  It ensures that the open file reference
				 * isn't permanently leaked if we get an error here. (You may
				 * say "but an unreferenced SMgrRelation is still a leak!" Not
				 * really, because the only case in which a checkpoint is done
				 * by a process that isn't about to shut down is in the
				 * checkpointer, and it will periodically do smgrcloseall().
				 * This fact justifies our not closing the reln in the success
				 * path either, which is a good thing since in
				 * non-checkpointer cases we couldn't safely do that.)
				 * Furthermore, in many cases the relation will have been
				 * dirtied through this same smgr relation, and so we can save
				 * a file open/close cycle.
				 */
				reln = smgropen(entry->tag.rnode.node,
								entry->tag.rnode.backend);

				/*
				 * It is possible that the relation has been dropped or
				 * truncated since the fsync request was entered.  Therefore,
				 * allow ENOENT, but only if we didn't fail already on this
				 * file.  This applies both during _mdfd_getseg() and during
				 * FileSync, since fd.c might have closed the file behind our
				 * back.
				 */
				seg = _mdfd_getseg(reln, entry->tag.forknum,
						  entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
								   false, EXTENSION_RETURN_NULL);

				INSTR_TIME_SET_CURRENT(sync_start);

				if (seg != NULL &&
					FileSync(seg->mdfd_vfd) >= 0)
				{
					/* success: accumulate timing statistics */
					INSTR_TIME_SET_CURRENT(sync_end);
					sync_diff = sync_end;
					INSTR_TIME_SUBTRACT(sync_diff, sync_start);
					elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
					if (elapsed > longest)
						longest = elapsed;
					total_elapsed += elapsed;
					processed++;
					if (log_checkpoints)
						elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
							 processed, FilePathName(seg->mdfd_vfd), (double) elapsed / 1000);

					break;		/* success; break out of retry loop */
				}

				/*
				 * XXX is there any point in allowing more than one retry?
				 * Don't see one at the moment, but easy to change the test
				 * here if so.
				 */
				path = _mdfd_segpath(reln, entry->tag.forknum,
									 entry->tag.segno);
				if (!FILE_POSSIBLY_DELETED(errno) ||
					failures > 0)
					ereport(ERROR,
							(errcode_for_file_access(),
						   errmsg("could not fsync file \"%s\": %m", path)));
				else
					ereport(DEBUG1,
							(errcode_for_file_access(),
					   errmsg("could not fsync file \"%s\" but retrying: %m",
							  path)));
				pfree(path);

				/*
				 * Absorb incoming requests and check to see if canceled.
				 */
				AbsorbFsyncRequests();
				absorb_counter = FSYNCS_PER_ABSORB;		/* might as well... */

				if (entry->canceled)
					break;
			}					/* end retry loop */
		}

		/*
		 * If we get here, either we fsync'd successfully, or we don't have to
		 * because enableFsync is off, or the entry is (now) marked canceled.
		 * Okay to delete it.
		 */
		if (hash_search(pendingOpsTable, &entry->tag,
						HASH_REMOVE, NULL) == NULL)
			elog(ERROR, "pendingOpsTable corrupted");
	}							/* end loop over hashtable entries */

	/* Return sync performance metrics for report at checkpoint end */
	CheckpointStats.ckpt_sync_rels = processed;
	CheckpointStats.ckpt_longest_sync = longest;
	CheckpointStats.ckpt_agg_sync_time = total_elapsed;

	/* Flag successful completion of mdsync */
	mdsync_in_progress = false;
}
1167
1168 /*
1169  * mdpreckpt() -- Do pre-checkpoint work
1170  *
1171  * To distinguish unlink requests that arrived before this checkpoint
1172  * started from those that arrived during the checkpoint, we use a cycle
1173  * counter similar to the one we use for fsync requests. That cycle
1174  * counter is incremented here.
1175  *
1176  * This must be called *before* the checkpoint REDO point is determined.
1177  * That ensures that we won't delete files too soon.
1178  *
1179  * Note that we can't do anything here that depends on the assumption
1180  * that the checkpoint will be completed.
1181  */
1182 void
1183 mdpreckpt(void)
1184 {
1185         ListCell   *cell;
1186
1187         /*
1188          * In case the prior checkpoint wasn't completed, stamp all entries in the
1189          * list with the current cycle counter.  Anything that's in the list at
1190          * the start of checkpoint can surely be deleted after the checkpoint is
1191          * finished, regardless of when the request was made.
1192          */
1193         foreach(cell, pendingUnlinks)
1194         {
1195                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
1196
1197                 entry->cycle_ctr = mdckpt_cycle_ctr;
1198         }
1199
1200         /*
1201          * Any unlink requests arriving after this point will be assigned the next
1202          * cycle counter, and won't be unlinked until next checkpoint.
1203          */
1204         mdckpt_cycle_ctr++;
1205 }
1206
1207 /*
1208  * mdpostckpt() -- Do post-checkpoint work
1209  *
1210  * Remove any lingering files that can now be safely removed.
1211  */
1212 void
1213 mdpostckpt(void)
1214 {
1215         while (pendingUnlinks != NIL)
1216         {
1217                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
1218                 char       *path;
1219
1220                 /*
1221                  * New entries are appended to the end, so if the entry is new we've
1222                  * reached the end of old entries.
1223                  */
1224                 if (entry->cycle_ctr == mdckpt_cycle_ctr)
1225                         break;
1226
1227                 /* Else assert we haven't missed it */
1228                 Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr);
1229
1230                 /* Unlink the file */
1231                 path = relpath(entry->rnode, MAIN_FORKNUM);
1232                 if (unlink(path) < 0)
1233                 {
1234                         /*
1235                          * There's a race condition, when the database is dropped at the
1236                          * same time that we process the pending unlink requests. If the
1237                          * DROP DATABASE deletes the file before we do, we will get ENOENT
1238                          * here. rmtree() also has to ignore ENOENT errors, to deal with
1239                          * the possibility that we delete the file first.
1240                          */
1241                         if (errno != ENOENT)
1242                                 ereport(WARNING,
1243                                                 (errcode_for_file_access(),
1244                                                  errmsg("could not remove file \"%s\": %m", path)));
1245                 }
1246                 pfree(path);
1247
1248                 pendingUnlinks = list_delete_first(pendingUnlinks);
1249                 pfree(entry);
1250         }
1251 }
1252
1253 /*
1254  * register_dirty_segment() -- Mark a relation segment as needing fsync
1255  *
1256  * If there is a local pending-ops table, just make an entry in it for
1257  * mdsync to process later.  Otherwise, try to pass off the fsync request
1258  * to the background writer process.  If that fails, just do the fsync
1259  * locally before returning (we expect this will not happen often enough
1260  * to be a performance problem).
1261  */
1262 static void
1263 register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
1264 {
1265         if (pendingOpsTable)
1266         {
1267                 /* push it into local pending-ops table */
1268                 RememberFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno);
1269         }
1270         else
1271         {
1272                 if (ForwardFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno))
1273                         return;                         /* passed it off successfully */
1274
1275                 ereport(DEBUG1,
1276                                 (errmsg("could not forward fsync request because request queue is full")));
1277
1278                 if (FileSync(seg->mdfd_vfd) < 0)
1279                         ereport(ERROR,
1280                                         (errcode_for_file_access(),
1281                                          errmsg("could not fsync file \"%s\": %m",
1282                                                         FilePathName(seg->mdfd_vfd))));
1283         }
1284 }
1285
1286 /*
1287  * register_unlink() -- Schedule a file to be deleted after next checkpoint
1288  *
1289  * As with register_dirty_segment, this could involve either a local or
1290  * a remote pending-ops table.
1291  */
1292 static void
1293 register_unlink(RelFileNodeBackend rnode)
1294 {
1295         if (pendingOpsTable)
1296         {
1297                 /* push it into local pending-ops table */
1298                 RememberFsyncRequest(rnode, MAIN_FORKNUM, UNLINK_RELATION_REQUEST);
1299         }
1300         else
1301         {
1302                 /*
1303                  * Notify the checkpointer about it.  If we fail to queue the request
1304                  * message, we have to sleep and try again, because we can't simply
1305                  * delete the file now.  Ugly, but hopefully won't happen often.
1306                  *
1307                  * XXX should we just leave the file orphaned instead?
1308                  */
1309                 Assert(IsUnderPostmaster);
1310                 while (!ForwardFsyncRequest(rnode, MAIN_FORKNUM,
1311                                                                         UNLINK_RELATION_REQUEST))
1312                         pg_usleep(10000L);      /* 10 msec seems a good number */
1313         }
1314 }
1315
1316 /*
1317  * RememberFsyncRequest() -- callback from checkpointer side of fsync request
1318  *
1319  * We stuff most fsync requests into the local hash table for execution
1320  * during the checkpointer's next checkpoint.  UNLINK requests go into a
1321  * separate linked list, however, because they get processed separately.
1322  *
1323  * The range of possible segment numbers is way less than the range of
1324  * BlockNumber, so we can reserve high values of segno for special purposes.
1325  * We define three:
1326  * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation
1327  * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
1328  * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
1329  *       checkpoint.
1330  *
1331  * (Handling the FORGET_* requests is a tad slow because the hash table has
1332  * to be searched linearly, but it doesn't seem worth rethinking the table
1333  * structure for them.)
1334  */
void
RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
					 BlockNumber segno)
{
	/* Only the process that owns the fsync state may be called here */
	Assert(pendingOpsTable);

	if (segno == FORGET_RELATION_FSYNC)
	{
		/* Remove any pending requests for the entire relation */
		HASH_SEQ_STATUS hstat;
		PendingOperationEntry *entry;

		/*
		 * Linear scan of the whole table; entries are only flagged as
		 * canceled, never removed, so continuing the seq scan while we
		 * modify them is fine.
		 */
		hash_seq_init(&hstat, pendingOpsTable);
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
		{
			if (RelFileNodeBackendEquals(entry->tag.rnode, rnode) &&
				entry->tag.forknum == forknum)
			{
				/* Okay, cancel this entry */
				entry->canceled = true;
			}
		}
	}
	else if (segno == FORGET_DATABASE_FSYNC)
	{
		/* Remove any pending requests for the entire database */
		HASH_SEQ_STATUS hstat;
		PendingOperationEntry *entry;
		ListCell   *cell,
				   *prev,
				   *next;

		/* Remove fsync requests: flag every entry in the target database */
		hash_seq_init(&hstat, pendingOpsTable);
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
		{
			if (entry->tag.rnode.node.dbNode == rnode.node.dbNode)
			{
				/* Okay, cancel this entry */
				entry->canceled = true;
			}
		}

		/*
		 * Remove unlink requests.  Here we really delete list cells, so we
		 * must track the previous cell for list_delete_cell, and fetch the
		 * next cell before the current one can be freed.
		 */
		prev = NULL;
		for (cell = list_head(pendingUnlinks); cell; cell = next)
		{
			PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);

			next = lnext(cell);
			if (entry->rnode.node.dbNode == rnode.node.dbNode)
			{
				pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
				pfree(entry);
			}
			else
				prev = cell;
		}
	}
	else if (segno == UNLINK_RELATION_REQUEST)
	{
		/*
		 * Unlink request: put it in the linked list.  The entry must live in
		 * MdCxt so it survives until the post-checkpoint unlink happens.
		 */
		MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
		PendingUnlinkEntry *entry;

		entry = palloc(sizeof(PendingUnlinkEntry));
		entry->rnode = rnode;
		/* stamp with the current checkpoint cycle so mdpostckpt knows when */
		entry->cycle_ctr = mdckpt_cycle_ctr;

		pendingUnlinks = lappend(pendingUnlinks, entry);

		MemoryContextSwitchTo(oldcxt);
	}
	else
	{
		/* Normal case: enter a request to fsync this segment */
		PendingOperationTag key;
		PendingOperationEntry *entry;
		bool		found;

		/* ensure any pad bytes in the hash key are zeroed */
		MemSet(&key, 0, sizeof(key));
		key.rnode = rnode;
		key.forknum = forknum;
		key.segno = segno;

		entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
													  &key,
													  HASH_ENTER,
													  &found);
		/* if new or previously canceled entry, initialize it */
		if (!found || entry->canceled)
		{
			entry->canceled = false;
			entry->cycle_ctr = mdsync_cycle_ctr;
		}

		/*
		 * NB: it's intentional that we don't change cycle_ctr if the entry
		 * already exists.      The fsync request must be treated as old, even
		 * though the new request will be satisfied too by any subsequent
		 * fsync.
		 *
		 * However, if the entry is present but is marked canceled, we should
		 * act just as though it wasn't there.  The only case where this could
		 * happen would be if a file had been deleted, we received but did not
		 * yet act on the cancel request, and the same relfilenode was then
		 * assigned to a new file.      We mustn't lose the new request, but it
		 * should be considered new not old.
		 */
	}
}
1447
1448 /*
1449  * ForgetRelationFsyncRequests -- forget any fsyncs for a rel
1450  */
1451 void
1452 ForgetRelationFsyncRequests(RelFileNodeBackend rnode, ForkNumber forknum)
1453 {
1454         if (pendingOpsTable)
1455         {
1456                 /* standalone backend or startup process: fsync state is local */
1457                 RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC);
1458         }
1459         else if (IsUnderPostmaster)
1460         {
1461                 /*
1462                  * Notify the checkpointer about it.  If we fail to queue the revoke
1463                  * message, we have to sleep and try again ... ugly, but hopefully
1464                  * won't happen often.
1465                  *
1466                  * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
1467                  * error would leave the no-longer-used file still present on disk,
1468                  * which would be bad, so I'm inclined to assume that the checkpointer
1469                  * will always empty the queue soon.
1470                  */
1471                 while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
1472                         pg_usleep(10000L);      /* 10 msec seems a good number */
1473
1474                 /*
1475                  * Note we don't wait for the checkpointer to actually absorb the
1476                  * revoke message; see mdsync() for the implications.
1477                  */
1478         }
1479 }
1480
1481 /*
1482  * ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB
1483  */
1484 void
1485 ForgetDatabaseFsyncRequests(Oid dbid)
1486 {
1487         RelFileNodeBackend rnode;
1488
1489         rnode.node.dbNode = dbid;
1490         rnode.node.spcNode = 0;
1491         rnode.node.relNode = 0;
1492         rnode.backend = InvalidBackendId;
1493
1494         if (pendingOpsTable)
1495         {
1496                 /* standalone backend or startup process: fsync state is local */
1497                 RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC);
1498         }
1499         else if (IsUnderPostmaster)
1500         {
1501                 /* see notes in ForgetRelationFsyncRequests */
1502                 while (!ForwardFsyncRequest(rnode, InvalidForkNumber,
1503                                                                         FORGET_DATABASE_FSYNC))
1504                         pg_usleep(10000L);      /* 10 msec seems a good number */
1505         }
1506 }
1507
1508
1509 /*
1510  *      _fdvec_alloc() -- Make a MdfdVec object.
1511  */
1512 static MdfdVec *
1513 _fdvec_alloc(void)
1514 {
1515         return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
1516 }
1517
1518 /*
1519  * Return the filename for the specified segment of the relation. The
1520  * returned string is palloc'd.
1521  */
1522 static char *
1523 _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
1524 {
1525         char       *path,
1526                            *fullpath;
1527
1528         path = relpath(reln->smgr_rnode, forknum);
1529
1530         if (segno > 0)
1531         {
1532                 /* be sure we have enough space for the '.segno' */
1533                 fullpath = (char *) palloc(strlen(path) + 12);
1534                 sprintf(fullpath, "%s.%u", path, segno);
1535                 pfree(path);
1536         }
1537         else
1538                 fullpath = path;
1539
1540         return fullpath;
1541 }
1542
1543 /*
1544  * Open the specified segment of the relation,
1545  * and make a MdfdVec object for it.  Returns NULL on failure.
1546  */
1547 static MdfdVec *
1548 _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
1549                           int oflags)
1550 {
1551         MdfdVec    *v;
1552         int                     fd;
1553         char       *fullpath;
1554
1555         fullpath = _mdfd_segpath(reln, forknum, segno);
1556
1557         /* open the file */
1558         fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);
1559
1560         pfree(fullpath);
1561
1562         if (fd < 0)
1563                 return NULL;
1564
1565         if (reln->smgr_transient)
1566                 FileSetTransient(fd);
1567
1568         /* allocate an mdfdvec entry for it */
1569         v = _fdvec_alloc();
1570
1571         /* fill the entry */
1572         v->mdfd_vfd = fd;
1573         v->mdfd_segno = segno;
1574         v->mdfd_chain = NULL;
1575         Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
1576
1577         /* all done */
1578         return v;
1579 }
1580
1581 /*
1582  *      _mdfd_getseg() -- Find the segment of the relation holding the
1583  *              specified block.
1584  *
1585  * If the segment doesn't exist, we ereport, return NULL, or create the
1586  * segment, according to "behavior".  Note: skipFsync is only used in the
1587  * EXTENSION_CREATE case.
1588  */
1589 static MdfdVec *
1590 _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
1591                          bool skipFsync, ExtensionBehavior behavior)
1592 {
1593         MdfdVec    *v = mdopen(reln, forknum, behavior);
1594         BlockNumber targetseg;
1595         BlockNumber nextsegno;
1596
1597         if (!v)
1598                 return NULL;                    /* only possible if EXTENSION_RETURN_NULL */
1599
1600         targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1601         for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
1602         {
1603                 Assert(nextsegno == v->mdfd_segno + 1);
1604
1605                 if (v->mdfd_chain == NULL)
1606                 {
1607                         /*
1608                          * Normally we will create new segments only if authorized by the
1609                          * caller (i.e., we are doing mdextend()).      But when doing WAL
1610                          * recovery, create segments anyway; this allows cases such as
1611                          * replaying WAL data that has a write into a high-numbered
1612                          * segment of a relation that was later deleted.  We want to go
1613                          * ahead and create the segments so we can finish out the replay.
1614                          *
1615                          * We have to maintain the invariant that segments before the last
1616                          * active segment are of size RELSEG_SIZE; therefore, pad them out
1617                          * with zeroes if needed.  (This only matters if caller is
1618                          * extending the relation discontiguously, but that can happen in
1619                          * hash indexes.)
1620                          */
1621                         if (behavior == EXTENSION_CREATE || InRecovery)
1622                         {
1623                                 if (_mdnblocks(reln, forknum, v) < RELSEG_SIZE)
1624                                 {
1625                                         char       *zerobuf = palloc0(BLCKSZ);
1626
1627                                         mdextend(reln, forknum,
1628                                                          nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
1629                                                          zerobuf, skipFsync);
1630                                         pfree(zerobuf);
1631                                 }
1632                                 v->mdfd_chain = _mdfd_openseg(reln, forknum, +nextsegno, O_CREAT);
1633                         }
1634                         else
1635                         {
1636                                 /* We won't create segment if not existent */
1637                                 v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, 0);
1638                         }
1639                         if (v->mdfd_chain == NULL)
1640                         {
1641                                 if (behavior == EXTENSION_RETURN_NULL &&
1642                                         FILE_POSSIBLY_DELETED(errno))
1643                                         return NULL;
1644                                 ereport(ERROR,
1645                                                 (errcode_for_file_access(),
1646                                    errmsg("could not open file \"%s\" (target block %u): %m",
1647                                                   _mdfd_segpath(reln, forknum, nextsegno),
1648                                                   blkno)));
1649                         }
1650                 }
1651                 v = v->mdfd_chain;
1652         }
1653         return v;
1654 }
1655
1656 /*
1657  * Get number of blocks present in a single disk file
1658  */
1659 static BlockNumber
1660 _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
1661 {
1662         off_t           len;
1663
1664         len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
1665         if (len < 0)
1666                 ereport(ERROR,
1667                                 (errcode_for_file_access(),
1668                                  errmsg("could not seek to end of file \"%s\": %m",
1669                                                 FilePathName(seg->mdfd_vfd))));
1670         /* note that this calculation will ignore any partial block at EOF */
1671         return (BlockNumber) (len / BLCKSZ);
1672 }