]> granicus.if.org Git - postgresql/blob - src/backend/storage/smgr/md.c
Prevent re-use of a deleted relation's relfilenode until after the next checkpoint.
[postgresql] / src / backend / storage / smgr / md.c
1 /*-------------------------------------------------------------------------
2  *
3  * md.c
4  *        This code manages relations that reside on magnetic disk.
5  *
6  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *        $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.130 2007/11/15 20:36:40 tgl Exp $
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16
17 #include <unistd.h>
18 #include <fcntl.h>
19 #include <sys/file.h>
20
21 #include "catalog/catalog.h"
22 #include "miscadmin.h"
23 #include "postmaster/bgwriter.h"
24 #include "storage/fd.h"
25 #include "storage/bufmgr.h"
26 #include "storage/smgr.h"
27 #include "utils/hsearch.h"
28 #include "utils/memutils.h"
29
30
31 /* interval for calling AbsorbFsyncRequests in mdsync */
32 #define FSYNCS_PER_ABSORB               10
33
34 /* special values for the segno arg to RememberFsyncRequest */
35 #define FORGET_RELATION_FSYNC   (InvalidBlockNumber)
36 #define FORGET_DATABASE_FSYNC   (InvalidBlockNumber-1)
37 #define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
38
/*
 * FILE_POSSIBLY_DELETED(err) -- could this errno mean "file is gone"?
 *
 * Everywhere, ENOENT qualifies.  On Windows, EACCES has to be treated the
 * same way, because that is what is reported for a file that has been
 * unlinked but has not actually disappeared yet.  Ugh.  The code using this
 * macro is designed not to believe such cases are okay without further
 * evidence (namely, a pending fsync request getting revoked ... see mdsync).
 */
#ifdef WIN32
#define FILE_POSSIBLY_DELETED(err)	((err) == ENOENT || (err) == EACCES)
#else
#define FILE_POSSIBLY_DELETED(err)	((err) == ENOENT)
#endif
51
52 /*
53  *      The magnetic disk storage manager keeps track of open file
54  *      descriptors in its own descriptor pool.  This is done to make it
55  *      easier to support relations that are larger than the operating
56  *      system's file size limit (often 2GBytes).  In order to do that,
57  *      we break relations up into "segment" files that are each shorter than
58  *      the OS file size limit.  The segment size is set by the RELSEG_SIZE
59  *      configuration constant in pg_config_manual.h.
60  *
61  *      On disk, a relation must consist of consecutively numbered segment
62  *      files in the pattern
63  *              -- Zero or more full segments of exactly RELSEG_SIZE blocks each
64  *              -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
65  *              -- Optionally, any number of inactive segments of size 0 blocks.
66  *      The full and partial segments are collectively the "active" segments.
67  *      Inactive segments are those that once contained data but are currently
68  *      not needed because of an mdtruncate() operation.  The reason for leaving
69  *      them present at size zero, rather than unlinking them, is that other
70  *      backends and/or the bgwriter might be holding open file references to
71  *      such segments.  If the relation expands again after mdtruncate(), such
72  *      that a deactivated segment becomes active again, it is important that
73  *      such file references still be valid --- else data might get written
74  *      out to an unlinked old copy of a segment file that will eventually
75  *      disappear.
76  *
77  *      The file descriptor pointer (md_fd field) stored in the SMgrRelation
78  *      cache is, therefore, just the head of a list of MdfdVec objects, one
79  *      per segment.  But note the md_fd pointer can be NULL, indicating
80  *      relation not open.
81  *
82  *      Also note that mdfd_chain == NULL does not necessarily mean the relation
83  *      doesn't have another segment after this one; we may just not have
84  *      opened the next segment yet.  (We could not have "all segments are
85  *      in the chain" as an invariant anyway, since another backend could
86  *      extend the relation when we weren't looking.)  We do not make chain
87  *      entries for inactive segments, however; as soon as we find a partial
88  *      segment, we assume that any subsequent segments are inactive.
89  *
90  *      All MdfdVec objects are palloc'd in the MdCxt memory context.
91  *
92  *      Defining LET_OS_MANAGE_FILESIZE disables the segmentation logic,
93  *      for use on machines that support large files.  Beware that that
94  *      code has not been tested in a long time and is probably bit-rotted.
95  */
96
97 typedef struct _MdfdVec
98 {
99         File            mdfd_vfd;               /* fd number in fd.c's pool */
100         BlockNumber mdfd_segno;         /* segment number, from 0 */
101 #ifndef LET_OS_MANAGE_FILESIZE  /* for large relations */
102         struct _MdfdVec *mdfd_chain;    /* next segment, or NULL */
103 #endif
104 } MdfdVec;
105
106 static MemoryContext MdCxt;             /* context for all md.c allocations */
107
108
109 /*
110  * In some contexts (currently, standalone backends and the bgwriter process)
111  * we keep track of pending fsync operations: we need to remember all relation
112  * segments that have been written since the last checkpoint, so that we can
113  * fsync them down to disk before completing the next checkpoint.  This hash
114  * table remembers the pending operations.      We use a hash table mostly as
115  * a convenient way of eliminating duplicate requests.
116  *
117  * We use a similar mechanism to remember no-longer-needed files that can
118  * be deleted after the next checkpoint, but we use a linked list instead of
119  * a hash table, because we don't expect there to be any duplicate requests.
120  *
121  * (Regular backends do not track pending operations locally, but forward
122  * them to the bgwriter.)
123  */
124 typedef struct
125 {
126         RelFileNode rnode;                      /* the targeted relation */
127         BlockNumber segno;                      /* which segment */
128 } PendingOperationTag;
129
130 typedef uint16 CycleCtr;                /* can be any convenient integer size */
131
132 typedef struct
133 {
134         PendingOperationTag tag;        /* hash table key (must be first!) */
135         bool            canceled;               /* T => request canceled, not yet removed */
136         CycleCtr        cycle_ctr;              /* mdsync_cycle_ctr when request was made */
137 } PendingOperationEntry;
138
139 typedef struct
140 {
141         RelFileNode rnode;                      /* the dead relation to delete */
142         CycleCtr cycle_ctr;                     /* mdckpt_cycle_ctr when request was made */
143 } PendingUnlinkEntry;
144
145 static HTAB *pendingOpsTable = NULL;
146 static List *pendingUnlinks = NIL;
147
148 static CycleCtr mdsync_cycle_ctr = 0;
149 static CycleCtr mdckpt_cycle_ctr = 0;
150
151
/* What mdopen and _mdfd_getseg should do about a missing segment. */
typedef enum
{
	EXTENSION_FAIL = 0,			/* missing segment => ereport */
	EXTENSION_RETURN_NULL = 1,	/* missing segment => return NULL */
	EXTENSION_CREATE = 2		/* missing segment => create it as needed */
} ExtensionBehavior;
158
159 /* local routines */
160 static MdfdVec *mdopen(SMgrRelation reln, ExtensionBehavior behavior);
161 static void register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
162 static void register_unlink(RelFileNode rnode);
163 static MdfdVec *_fdvec_alloc(void);
164
165 #ifndef LET_OS_MANAGE_FILESIZE
166 static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno,
167                           int oflags);
168 #endif
169 static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno,
170                                                          bool isTemp, ExtensionBehavior behavior);
171 static BlockNumber _mdnblocks(SMgrRelation reln, MdfdVec *seg);
172
173
174 /*
175  *      mdinit() -- Initialize private state for magnetic disk storage manager.
176  */
177 void
178 mdinit(void)
179 {
180         MdCxt = AllocSetContextCreate(TopMemoryContext,
181                                                                   "MdSmgr",
182                                                                   ALLOCSET_DEFAULT_MINSIZE,
183                                                                   ALLOCSET_DEFAULT_INITSIZE,
184                                                                   ALLOCSET_DEFAULT_MAXSIZE);
185
186         /*
187          * Create pending-operations hashtable if we need it.  Currently, we need
188          * it if we are standalone (not under a postmaster) OR if we are a
189          * bootstrap-mode subprocess of a postmaster (that is, a startup or
190          * bgwriter process).
191          */
192         if (!IsUnderPostmaster || IsBootstrapProcessingMode())
193         {
194                 HASHCTL         hash_ctl;
195
196                 MemSet(&hash_ctl, 0, sizeof(hash_ctl));
197                 hash_ctl.keysize = sizeof(PendingOperationTag);
198                 hash_ctl.entrysize = sizeof(PendingOperationEntry);
199                 hash_ctl.hash = tag_hash;
200                 hash_ctl.hcxt = MdCxt;
201                 pendingOpsTable = hash_create("Pending Ops Table",
202                                                                           100L,
203                                                                           &hash_ctl,
204                                                                    HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
205                 pendingUnlinks = NIL;
206         }
207 }
208
209 /*
210  *      mdcreate() -- Create a new relation on magnetic disk.
211  *
212  * If isRedo is true, it's okay for the relation to exist already.
213  */
214 void
215 mdcreate(SMgrRelation reln, bool isRedo)
216 {
217         char       *path;
218         File            fd;
219
220         if (isRedo && reln->md_fd != NULL)
221                 return;                                 /* created and opened already... */
222
223         Assert(reln->md_fd == NULL);
224
225         path = relpath(reln->smgr_rnode);
226
227         fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
228
229         if (fd < 0)
230         {
231                 int                     save_errno = errno;
232
233                 /*
234                  * During bootstrap, there are cases where a system relation will be
235                  * accessed (by internal backend processes) before the bootstrap
236                  * script nominally creates it.  Therefore, allow the file to exist
237                  * already, even if isRedo is not set.  (See also mdopen)
238                  */
239                 if (isRedo || IsBootstrapProcessingMode())
240                         fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
241                 if (fd < 0)
242                 {
243                         pfree(path);
244                         /* be sure to report the error reported by create, not open */
245                         errno = save_errno;
246                         ereport(ERROR,
247                                         (errcode_for_file_access(),
248                                          errmsg("could not create relation %u/%u/%u: %m",
249                                                         reln->smgr_rnode.spcNode,
250                                                         reln->smgr_rnode.dbNode,
251                                                         reln->smgr_rnode.relNode)));
252                 }
253         }
254
255         pfree(path);
256
257         reln->md_fd = _fdvec_alloc();
258
259         reln->md_fd->mdfd_vfd = fd;
260         reln->md_fd->mdfd_segno = 0;
261 #ifndef LET_OS_MANAGE_FILESIZE
262         reln->md_fd->mdfd_chain = NULL;
263 #endif
264 }
265
266 /*
267  *      mdunlink() -- Unlink a relation.
268  *
269  * Note that we're passed a RelFileNode --- by the time this is called,
270  * there won't be an SMgrRelation hashtable entry anymore.
271  *
272  * Actually, we don't unlink the first segment file of the relation, but
273  * just truncate it to zero length, and record a request to unlink it after
274  * the next checkpoint.  Additional segments can be unlinked immediately,
275  * however.  Leaving the empty file in place prevents that relfilenode
276  * number from being reused.  The scenario this protects us from is:
277  * 1. We delete a relation (and commit, and actually remove its file).
278  * 2. We create a new relation, which by chance gets the same relfilenode as
279  *    the just-deleted one (OIDs must've wrapped around for that to happen).
280  * 3. We crash before another checkpoint occurs.
281  * During replay, we would delete the file and then recreate it, which is fine
282  * if the contents of the file were repopulated by subsequent WAL entries.
283  * But if we didn't WAL-log insertions, but instead relied on fsyncing the
284  * file after populating it (as for instance CLUSTER and CREATE INDEX do),
285  * the contents of the file would be lost forever.  By leaving the empty file
286  * until after the next checkpoint, we prevent reassignment of the relfilenode
287  * number until it's safe, because relfilenode assignment skips over any
288  * existing file.
289  *
290  * If isRedo is true, it's okay for the relation to be already gone.
291  * Also, we should remove the file immediately instead of queuing a request
292  * for later, since during redo there's no possibility of creating a
293  * conflicting relation.
294  *
295  * Note: any failure should be reported as WARNING not ERROR, because
296  * we are usually not in a transaction anymore when this is called.
297  */
298 void
299 mdunlink(RelFileNode rnode, bool isRedo)
300 {
301         char       *path;
302         int ret;
303
304         /*
305          * We have to clean out any pending fsync requests for the doomed relation,
306          * else the next mdsync() will fail.
307          */
308         ForgetRelationFsyncRequests(rnode);
309
310         path = relpath(rnode);
311
312         /*
313          * Delete or truncate the first segment, or only segment if not doing
314          * segmenting
315          */
316         if (isRedo)
317                 ret = unlink(path);
318         else
319                 ret = truncate(path, 0);
320         if (ret < 0)
321         {
322                 if (!isRedo || errno != ENOENT)
323                         ereport(WARNING,
324                                         (errcode_for_file_access(),
325                                          errmsg("could not remove relation %u/%u/%u: %m",
326                                                         rnode.spcNode,
327                                                         rnode.dbNode,
328                                                         rnode.relNode)));
329         }
330
331 #ifndef LET_OS_MANAGE_FILESIZE
332         /* Delete the additional segments, if any */
333         else
334         {
335                 char       *segpath = (char *) palloc(strlen(path) + 12);
336                 BlockNumber segno;
337
338                 /*
339                  * Note that because we loop until getting ENOENT, we will
340                  * correctly remove all inactive segments as well as active ones.
341                  */
342                 for (segno = 1;; segno++)
343                 {
344                         sprintf(segpath, "%s.%u", path, segno);
345                         if (unlink(segpath) < 0)
346                         {
347                                 /* ENOENT is expected after the last segment... */
348                                 if (errno != ENOENT)
349                                         ereport(WARNING,
350                                                         (errcode_for_file_access(),
351                                                          errmsg("could not remove segment %u of relation %u/%u/%u: %m",
352                                                                         segno,
353                                                                         rnode.spcNode,
354                                                                         rnode.dbNode,
355                                                                         rnode.relNode)));
356                                 break;
357                         }
358                 }
359                 pfree(segpath);
360         }
361 #endif
362
363         pfree(path);
364
365         /* Register request to unlink first segment later */
366         if (!isRedo)
367                 register_unlink(rnode);
368 }
369
370 /*
371  *      mdextend() -- Add a block to the specified relation.
372  *
373  *              The semantics are nearly the same as mdwrite(): write at the
374  *              specified position.  However, this is to be used for the case of
375  *              extending a relation (i.e., blocknum is at or beyond the current
376  *              EOF).  Note that we assume writing a block beyond current EOF
377  *              causes intervening file space to become filled with zeroes.
378  */
379 void
380 mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
381 {
382         long            seekpos;
383         int                     nbytes;
384         MdfdVec    *v;
385
386         /* This assert is too expensive to have on normally ... */
387 #ifdef CHECK_WRITE_VS_EXTEND
388         Assert(blocknum >= mdnblocks(reln));
389 #endif
390
391         /*
392          * If a relation manages to grow to 2^32-1 blocks, refuse to extend it
393          * any more --- we mustn't create a block whose number
394          * actually is InvalidBlockNumber.
395          */
396         if (blocknum == InvalidBlockNumber)
397                 ereport(ERROR,
398                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
399                                  errmsg("cannot extend relation %u/%u/%u beyond %u blocks",
400                                                 reln->smgr_rnode.spcNode,
401                                                 reln->smgr_rnode.dbNode,
402                                                 reln->smgr_rnode.relNode,
403                                                 InvalidBlockNumber)));
404
405         v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_CREATE);
406
407 #ifndef LET_OS_MANAGE_FILESIZE
408         seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
409         Assert(seekpos < BLCKSZ * RELSEG_SIZE);
410 #else
411         seekpos = (long) (BLCKSZ * (blocknum));
412 #endif
413
414         /*
415          * Note: because caller usually obtained blocknum by calling mdnblocks,
416          * which did a seek(SEEK_END), this seek is often redundant and will be
417          * optimized away by fd.c.  It's not redundant, however, if there is a
418          * partial page at the end of the file. In that case we want to try to
419          * overwrite the partial page with a full page.  It's also not redundant
420          * if bufmgr.c had to dump another buffer of the same file to make room
421          * for the new page's buffer.
422          */
423         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
424                 ereport(ERROR,
425                                 (errcode_for_file_access(),
426                                  errmsg("could not seek to block %u of relation %u/%u/%u: %m",
427                                                 blocknum,
428                                                 reln->smgr_rnode.spcNode,
429                                                 reln->smgr_rnode.dbNode,
430                                                 reln->smgr_rnode.relNode)));
431
432         if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
433         {
434                 if (nbytes < 0)
435                         ereport(ERROR,
436                                         (errcode_for_file_access(),
437                                          errmsg("could not extend relation %u/%u/%u: %m",
438                                                         reln->smgr_rnode.spcNode,
439                                                         reln->smgr_rnode.dbNode,
440                                                         reln->smgr_rnode.relNode),
441                                          errhint("Check free disk space.")));
442                 /* short write: complain appropriately */
443                 ereport(ERROR,
444                                 (errcode(ERRCODE_DISK_FULL),
445                                  errmsg("could not extend relation %u/%u/%u: wrote only %d of %d bytes at block %u",
446                                                 reln->smgr_rnode.spcNode,
447                                                 reln->smgr_rnode.dbNode,
448                                                 reln->smgr_rnode.relNode,
449                                                 nbytes, BLCKSZ, blocknum),
450                                  errhint("Check free disk space.")));
451         }
452
453         if (!isTemp)
454                 register_dirty_segment(reln, v);
455
456 #ifndef LET_OS_MANAGE_FILESIZE
457         Assert(_mdnblocks(reln, v) <= ((BlockNumber) RELSEG_SIZE));
458 #endif
459 }
460
461 /*
462  *      mdopen() -- Open the specified relation.
463  *
464  * Note we only open the first segment, when there are multiple segments.
465  *
466  * If first segment is not present, either ereport or return NULL according
467  * to "behavior".  We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
468  * EXTENSION_CREATE means it's OK to extend an existing relation, not to
469  * invent one out of whole cloth.
470  */
471 static MdfdVec *
472 mdopen(SMgrRelation reln, ExtensionBehavior behavior)
473 {
474         MdfdVec    *mdfd;
475         char       *path;
476         File            fd;
477
478         /* No work if already open */
479         if (reln->md_fd)
480                 return reln->md_fd;
481
482         path = relpath(reln->smgr_rnode);
483
484         fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
485
486         if (fd < 0)
487         {
488                 /*
489                  * During bootstrap, there are cases where a system relation will be
490                  * accessed (by internal backend processes) before the bootstrap
491                  * script nominally creates it.  Therefore, accept mdopen() as a
492                  * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
493                  */
494                 if (IsBootstrapProcessingMode())
495                         fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
496                 if (fd < 0)
497                 {
498                         pfree(path);
499                         if (behavior == EXTENSION_RETURN_NULL &&
500                                 FILE_POSSIBLY_DELETED(errno))
501                                 return NULL;
502                         ereport(ERROR,
503                                         (errcode_for_file_access(),
504                                          errmsg("could not open relation %u/%u/%u: %m",
505                                                         reln->smgr_rnode.spcNode,
506                                                         reln->smgr_rnode.dbNode,
507                                                         reln->smgr_rnode.relNode)));
508                 }
509         }
510
511         pfree(path);
512
513         reln->md_fd = mdfd = _fdvec_alloc();
514
515         mdfd->mdfd_vfd = fd;
516         mdfd->mdfd_segno = 0;
517 #ifndef LET_OS_MANAGE_FILESIZE
518         mdfd->mdfd_chain = NULL;
519         Assert(_mdnblocks(reln, mdfd) <= ((BlockNumber) RELSEG_SIZE));
520 #endif
521
522         return mdfd;
523 }
524
525 /*
526  *      mdclose() -- Close the specified relation, if it isn't closed already.
527  */
528 void
529 mdclose(SMgrRelation reln)
530 {
531         MdfdVec    *v = reln->md_fd;
532
533         /* No work if already closed */
534         if (v == NULL)
535                 return;
536
537         reln->md_fd = NULL;                     /* prevent dangling pointer after error */
538
539 #ifndef LET_OS_MANAGE_FILESIZE
540         while (v != NULL)
541         {
542                 MdfdVec    *ov = v;
543
544                 /* if not closed already */
545                 if (v->mdfd_vfd >= 0)
546                         FileClose(v->mdfd_vfd);
547                 /* Now free vector */
548                 v = v->mdfd_chain;
549                 pfree(ov);
550         }
551 #else
552         if (v->mdfd_vfd >= 0)
553                 FileClose(v->mdfd_vfd);
554         pfree(v);
555 #endif
556 }
557
558 /*
559  *      mdread() -- Read the specified block from a relation.
560  */
561 void
562 mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
563 {
564         long            seekpos;
565         int                     nbytes;
566         MdfdVec    *v;
567
568         v = _mdfd_getseg(reln, blocknum, false, EXTENSION_FAIL);
569
570 #ifndef LET_OS_MANAGE_FILESIZE
571         seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
572         Assert(seekpos < BLCKSZ * RELSEG_SIZE);
573 #else
574         seekpos = (long) (BLCKSZ * (blocknum));
575 #endif
576
577         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
578                 ereport(ERROR,
579                                 (errcode_for_file_access(),
580                                  errmsg("could not seek to block %u of relation %u/%u/%u: %m",
581                                                 blocknum,
582                                                 reln->smgr_rnode.spcNode,
583                                                 reln->smgr_rnode.dbNode,
584                                                 reln->smgr_rnode.relNode)));
585
586         if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
587         {
588                 if (nbytes < 0)
589                         ereport(ERROR,
590                                         (errcode_for_file_access(),
591                                          errmsg("could not read block %u of relation %u/%u/%u: %m",
592                                                         blocknum,
593                                                         reln->smgr_rnode.spcNode,
594                                                         reln->smgr_rnode.dbNode,
595                                                         reln->smgr_rnode.relNode)));
596                 /*
597                  * Short read: we are at or past EOF, or we read a partial block at
598                  * EOF.  Normally this is an error; upper levels should never try to
599                  * read a nonexistent block.  However, if zero_damaged_pages is ON
600                  * or we are InRecovery, we should instead return zeroes without
601                  * complaining.  This allows, for example, the case of trying to
602                  * update a block that was later truncated away.
603                  */
604                 if (zero_damaged_pages || InRecovery)
605                         MemSet(buffer, 0, BLCKSZ);
606                 else
607                         ereport(ERROR,
608                                         (errcode(ERRCODE_DATA_CORRUPTED),
609                                          errmsg("could not read block %u of relation %u/%u/%u: read only %d of %d bytes",
610                                                         blocknum,
611                                                         reln->smgr_rnode.spcNode,
612                                                         reln->smgr_rnode.dbNode,
613                                                         reln->smgr_rnode.relNode,
614                                                         nbytes, BLCKSZ)));
615         }
616 }
617
618 /*
619  *      mdwrite() -- Write the supplied block at the appropriate location.
620  *
621  *              This is to be used only for updating already-existing blocks of a
622  *              relation (ie, those before the current EOF).  To extend a relation,
623  *              use mdextend().
624  */
625 void
626 mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
627 {
628         long            seekpos;
629         int                     nbytes;
630         MdfdVec    *v;
631
632         /* This assert is too expensive to have on normally ... */
633 #ifdef CHECK_WRITE_VS_EXTEND
634         Assert(blocknum < mdnblocks(reln));
635 #endif
636
637         v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_FAIL);
638
639 #ifndef LET_OS_MANAGE_FILESIZE
640         seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
641         Assert(seekpos < BLCKSZ * RELSEG_SIZE);
642 #else
643         seekpos = (long) (BLCKSZ * (blocknum));
644 #endif
645
646         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
647                 ereport(ERROR,
648                                 (errcode_for_file_access(),
649                                  errmsg("could not seek to block %u of relation %u/%u/%u: %m",
650                                                 blocknum,
651                                                 reln->smgr_rnode.spcNode,
652                                                 reln->smgr_rnode.dbNode,
653                                                 reln->smgr_rnode.relNode)));
654
655         if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
656         {
657                 if (nbytes < 0)
658                         ereport(ERROR,
659                                         (errcode_for_file_access(),
660                                          errmsg("could not write block %u of relation %u/%u/%u: %m",
661                                                         blocknum,
662                                                         reln->smgr_rnode.spcNode,
663                                                         reln->smgr_rnode.dbNode,
664                                                         reln->smgr_rnode.relNode)));
665                 /* short write: complain appropriately */
666                 ereport(ERROR,
667                                 (errcode(ERRCODE_DISK_FULL),
668                                  errmsg("could not write block %u of relation %u/%u/%u: wrote only %d of %d bytes",
669                                                 blocknum,
670                                                 reln->smgr_rnode.spcNode,
671                                                 reln->smgr_rnode.dbNode,
672                                                 reln->smgr_rnode.relNode,
673                                                 nbytes, BLCKSZ),
674                                  errhint("Check free disk space.")));
675         }
676
677         if (!isTemp)
678                 register_dirty_segment(reln, v);
679 }
680
681 /*
682  *      mdnblocks() -- Get the number of blocks stored in a relation.
683  *
684  *              Important side effect: all active segments of the relation are opened
685  *              and added to the mdfd_chain list.  If this routine has not been
686  *              called, then only segments up to the last one actually touched
687  *              are present in the chain.
688  */
689 BlockNumber
690 mdnblocks(SMgrRelation reln)
691 {
692         MdfdVec    *v = mdopen(reln, EXTENSION_FAIL);
693
694 #ifndef LET_OS_MANAGE_FILESIZE
695         BlockNumber nblocks;
696         BlockNumber segno = 0;
697
698         /*
699          * Skip through any segments that aren't the last one, to avoid redundant
700          * seeks on them.  We have previously verified that these segments are
701          * exactly RELSEG_SIZE long, and it's useless to recheck that each time.
702          *
703          * NOTE: this assumption could only be wrong if another backend has
704          * truncated the relation.      We rely on higher code levels to handle that
705          * scenario by closing and re-opening the md fd, which is handled via
706          * relcache flush.  (Since the bgwriter doesn't participate in relcache
707          * flush, it could have segment chain entries for inactive segments;
708          * that's OK because the bgwriter never needs to compute relation size.)
709          */
710         while (v->mdfd_chain != NULL)
711         {
712                 segno++;
713                 v = v->mdfd_chain;
714         }
715
716         for (;;)
717         {
718                 nblocks = _mdnblocks(reln, v);
719                 if (nblocks > ((BlockNumber) RELSEG_SIZE))
720                         elog(FATAL, "segment too big");
721                 if (nblocks < ((BlockNumber) RELSEG_SIZE))
722                         return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
723
724                 /*
725                  * If segment is exactly RELSEG_SIZE, advance to next one.
726                  */
727                 segno++;
728
729                 if (v->mdfd_chain == NULL)
730                 {
731                         /*
732                          * Because we pass O_CREAT, we will create the next segment (with
733                          * zero length) immediately, if the last segment is of length
734                          * RELSEG_SIZE.  While perhaps not strictly necessary, this keeps
735                          * the logic simple.
736                          */
737                         v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT);
738                         if (v->mdfd_chain == NULL)
739                                 ereport(ERROR,
740                                                 (errcode_for_file_access(),
741                                                  errmsg("could not open segment %u of relation %u/%u/%u: %m",
742                                                                 segno,
743                                                                 reln->smgr_rnode.spcNode,
744                                                                 reln->smgr_rnode.dbNode,
745                                                                 reln->smgr_rnode.relNode)));
746                 }
747
748                 v = v->mdfd_chain;
749         }
750 #else
751         return _mdnblocks(reln, v);
752 #endif
753 }
754
755 /*
756  *      mdtruncate() -- Truncate relation to specified number of blocks.
757  */
758 void
759 mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
760 {
761         MdfdVec    *v;
762         BlockNumber curnblk;
763
764 #ifndef LET_OS_MANAGE_FILESIZE
765         BlockNumber priorblocks;
766 #endif
767
768         /*
769          * NOTE: mdnblocks makes sure we have opened all active segments, so
770          * that truncation loop will get them all!
771          */
772         curnblk = mdnblocks(reln);
773         if (nblocks > curnblk)
774         {
775                 /* Bogus request ... but no complaint if InRecovery */
776                 if (InRecovery)
777                         return;
778                 ereport(ERROR,
779                                 (errmsg("could not truncate relation %u/%u/%u to %u blocks: it's only %u blocks now",
780                                                 reln->smgr_rnode.spcNode,
781                                                 reln->smgr_rnode.dbNode,
782                                                 reln->smgr_rnode.relNode,
783                                                 nblocks, curnblk)));
784         }
785         if (nblocks == curnblk)
786                 return;                                 /* no work */
787
788         v = mdopen(reln, EXTENSION_FAIL);
789
790 #ifndef LET_OS_MANAGE_FILESIZE
791         priorblocks = 0;
792         while (v != NULL)
793         {
794                 MdfdVec    *ov = v;
795
796                 if (priorblocks > nblocks)
797                 {
798                         /*
799                          * This segment is no longer active (and has already been
800                          * unlinked from the mdfd_chain). We truncate the file, but do
801                          * not delete it, for reasons explained in the header comments.
802                          */
803                         if (FileTruncate(v->mdfd_vfd, 0) < 0)
804                                 ereport(ERROR,
805                                                 (errcode_for_file_access(),
806                                                  errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
807                                                                 reln->smgr_rnode.spcNode,
808                                                                 reln->smgr_rnode.dbNode,
809                                                                 reln->smgr_rnode.relNode,
810                                                                 nblocks)));
811                         if (!isTemp)
812                                 register_dirty_segment(reln, v);
813                         v = v->mdfd_chain;
814                         Assert(ov != reln->md_fd);      /* we never drop the 1st segment */
815                         pfree(ov);
816                 }
817                 else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
818                 {
819                         /*
820                          * This is the last segment we want to keep. Truncate the file to
821                          * the right length, and clear chain link that points to any
822                          * remaining segments (which we shall zap). NOTE: if nblocks is
823                          * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
824                          * segment to 0 length but keep it. This adheres to the invariant
825                          * given in the header comments.
826                          */
827                         BlockNumber lastsegblocks = nblocks - priorblocks;
828
829                         if (FileTruncate(v->mdfd_vfd, lastsegblocks * BLCKSZ) < 0)
830                                 ereport(ERROR,
831                                                 (errcode_for_file_access(),
832                                                  errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
833                                                                 reln->smgr_rnode.spcNode,
834                                                                 reln->smgr_rnode.dbNode,
835                                                                 reln->smgr_rnode.relNode,
836                                                                 nblocks)));
837                         if (!isTemp)
838                                 register_dirty_segment(reln, v);
839                         v = v->mdfd_chain;
840                         ov->mdfd_chain = NULL;
841                 }
842                 else
843                 {
844                         /*
845                          * We still need this segment and 0 or more blocks beyond it, so
846                          * nothing to do here.
847                          */
848                         v = v->mdfd_chain;
849                 }
850                 priorblocks += RELSEG_SIZE;
851         }
852 #else
853         if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0)
854                 ereport(ERROR,
855                                 (errcode_for_file_access(),
856                           errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
857                                          reln->smgr_rnode.spcNode,
858                                          reln->smgr_rnode.dbNode,
859                                          reln->smgr_rnode.relNode,
860                                          nblocks)));
861         if (!isTemp)
862                 register_dirty_segment(reln, v);
863 #endif
864 }
865
866 /*
867  *      mdimmedsync() -- Immediately sync a relation to stable storage.
868  *
869  * Note that only writes already issued are synced; this routine knows
870  * nothing of dirty buffers that may exist inside the buffer manager.
871  */
872 void
873 mdimmedsync(SMgrRelation reln)
874 {
875         MdfdVec    *v;
876         BlockNumber curnblk;
877
878         /*
879          * NOTE: mdnblocks makes sure we have opened all active segments, so
880          * that fsync loop will get them all!
881          */
882         curnblk = mdnblocks(reln);
883
884         v = mdopen(reln, EXTENSION_FAIL);
885
886 #ifndef LET_OS_MANAGE_FILESIZE
887         while (v != NULL)
888         {
889                 if (FileSync(v->mdfd_vfd) < 0)
890                         ereport(ERROR,
891                                         (errcode_for_file_access(),
892                                          errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
893                                                         v->mdfd_segno,
894                                                         reln->smgr_rnode.spcNode,
895                                                         reln->smgr_rnode.dbNode,
896                                                         reln->smgr_rnode.relNode)));
897                 v = v->mdfd_chain;
898         }
899 #else
900         if (FileSync(v->mdfd_vfd) < 0)
901                 ereport(ERROR,
902                                 (errcode_for_file_access(),
903                                  errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
904                                                 v->mdfd_segno,
905                                                 reln->smgr_rnode.spcNode,
906                                                 reln->smgr_rnode.dbNode,
907                                                 reln->smgr_rnode.relNode)));
908 #endif
909 }
910
911 /*
912  *      mdsync() -- Sync previous writes to stable storage.
     *
     *      Scans pendingOpsTable and, for each entry that was already present
     *      when this call advanced mdsync_cycle_ctr, fsyncs the segment the
     *      entry names (skipped when enableFsync is off or the entry is marked
     *      canceled) and then removes the entry from the table.  Entries whose
     *      cycle_ctr equals the newly advanced counter are left in place for
     *      the next call.
     *
     *      Reports ERROR when there is no pendingOpsTable, when an fsync
     *      attempt fails with an errno that FILE_POSSIBLY_DELETED does not
     *      accept, when a second attempt on the same entry fails as well, or
     *      when removing a processed entry from the table fails.
     *
     *      mdsync_in_progress is cleared only by the final statement, so it is
     *      still set on the next call if this one did not run to completion;
     *      that is what triggers the cycle_ctr re-stamping pass below.
913  */
914 void
915 mdsync(void)
916 {
917         static bool mdsync_in_progress = false; /* set while a scan is underway; see header */
918
919         HASH_SEQ_STATUS hstat;
920         PendingOperationEntry *entry;
921         int                     absorb_counter; /* fsyncs left before the next periodic absorb */
922
923         /*
924          * This is only called during checkpoints, and checkpoints should only
925          * occur in processes that have created a pendingOpsTable.
926          */
927         if (!pendingOpsTable)
928                 elog(ERROR, "cannot sync without a pendingOpsTable");
929
930         /*
931          * If we are in the bgwriter, the sync had better include all fsync
932          * requests that were queued by backends up to this point.  The tightest
933          * race condition that could occur is that a buffer that must be written
934          * and fsync'd for the checkpoint could have been dumped by a backend
935          * just before it was visited by BufferSync().  We know the backend will
936          * have queued an fsync request before clearing the buffer's dirtybit,
937          * so we are safe as long as we do an Absorb after completing BufferSync().
938          */
939         AbsorbFsyncRequests();
940
941         /*
942          * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
943          * checkpoint), we want to ignore fsync requests that are entered into the
944          * hashtable after this point --- they should be processed next time,
945          * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
946          * ones: new ones will have cycle_ctr equal to the incremented value of
947          * mdsync_cycle_ctr.
948          *
949          * In normal circumstances, all entries present in the table at this
950          * point will have cycle_ctr exactly equal to the current (about to be old)
951          * value of mdsync_cycle_ctr.  However, if we fail partway through the
952          * fsync'ing loop, then older values of cycle_ctr might remain when we
953          * come back here to try again.  Repeated checkpoint failures would
954          * eventually wrap the counter around to the point where an old entry
955          * might appear new, causing us to skip it, possibly allowing a checkpoint
956          * to succeed that should not have.  To forestall wraparound, any time
957          * the previous mdsync() failed to complete, run through the table and
958          * forcibly set cycle_ctr = mdsync_cycle_ctr.
959          *
960          * Think not to merge this loop with the main loop, as the problem is
961          * exactly that that loop may fail before having visited all the entries.
962          * From a performance point of view it doesn't matter anyway, as this
963          * path will never be taken in a system that's functioning normally.
964          */
965         if (mdsync_in_progress)
966         {
967                 /* prior try failed, so update any stale cycle_ctr values */
968                 hash_seq_init(&hstat, pendingOpsTable);
969                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
970                 {
971                         entry->cycle_ctr = mdsync_cycle_ctr;
972                 }
973         }
974
975         /* Advance counter so that new hashtable entries are distinguishable */
976         mdsync_cycle_ctr++;
977
978         /* Set flag to detect failure if we don't reach the end of the loop */
979         mdsync_in_progress = true;
980
981         /* Now scan the hashtable for fsync requests to process */
982         absorb_counter = FSYNCS_PER_ABSORB;
983         hash_seq_init(&hstat, pendingOpsTable);
984         while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
985         {
986                 /*
987                  * If the entry is new then don't process it this time.  Note that
988                  * "continue" bypasses the hash-remove call at the bottom of the loop.
989                  */
990                 if (entry->cycle_ctr == mdsync_cycle_ctr)
991                         continue;
992
993                 /* Else assert we haven't missed it */
994                 Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);
995
996                 /*
997                  * If fsync is off then we don't have to bother opening the file
998                  * at all.  (We delay checking until this point so that changing
999                  * fsync on the fly behaves sensibly.)  Also, if the entry is
1000                  * marked canceled, fall through to delete it.
1001                  */
1002                 if (enableFsync && !entry->canceled)
1003                 {
1004                         int                     failures; /* failed fsync attempts so far for this entry */
1005
1006                         /*
1007                          * If in bgwriter, we want to absorb pending requests every so
1008                          * often to prevent overflow of the fsync request queue.  It is
1009                          * unspecified whether newly-added entries will be visited by
1010                          * hash_seq_search, but we don't care since we don't need to
1011                          * process them anyway.
1012                          */
1013                         if (--absorb_counter <= 0)
1014                         {
1015                                 AbsorbFsyncRequests();
1016                                 absorb_counter = FSYNCS_PER_ABSORB;
1017                         }
1018
1019                         /*
1020                          * The fsync table could contain requests to fsync segments that
1021                          * have been deleted (unlinked) by the time we get to them.
1022                          * Rather than just hoping an ENOENT (or EACCES on Windows) error
1023                          * can be ignored, what we do on error is absorb pending requests
1024                          * and then retry.  Since mdunlink() queues a "revoke" message
1025                          * before actually unlinking, the fsync request is guaranteed to
1026                          * be marked canceled after the absorb if it really was this case.
1027                          * DROP DATABASE likewise has to tell us to forget fsync requests
1028                          * before it starts deletions.
1029                          */
1030                         for (failures = 0; ; failures++)        /* loop exits at "break" */
1031                         {
1032                                 SMgrRelation reln;
1033                                 MdfdVec    *seg;
1034
1035                                 /*
1036                                  * Find or create an smgr hash entry for this relation. This
1037                                  * may seem a bit unclean -- md calling smgr?  But it's really
1038                                  * the best solution.  It ensures that the open file reference
1039                                  * isn't permanently leaked if we get an error here. (You may
1040                                  * say "but an unreferenced SMgrRelation is still a leak!" Not
1041                                  * really, because the only case in which a checkpoint is done
1042                                  * by a process that isn't about to shut down is in the
1043                                  * bgwriter, and it will periodically do smgrcloseall(). This
1044                                  * fact justifies our not closing the reln in the success path
1045                                  * either, which is a good thing since in non-bgwriter cases
1046                                  * we couldn't safely do that.)  Furthermore, in many cases
1047                                  * the relation will have been dirtied through this same smgr
1048                                  * relation, and so we can save a file open/close cycle.
1049                                  */
1050                                 reln = smgropen(entry->tag.rnode);
1051
1052                                 /*
1053                                  * It is possible that the relation has been dropped or
1054                                  * truncated since the fsync request was entered.  Therefore,
1055                                  * allow ENOENT, but only if we didn't fail already on
1056                                  * this file.  This applies both during _mdfd_getseg() and
1057                                  * during FileSync, since fd.c might have closed the file
1058                                  * behind our back.
1059                                  */
1060                                 seg = _mdfd_getseg(reln,
1061                                                                    entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
1062                                                                    false, EXTENSION_RETURN_NULL);
1063                                 if (seg != NULL &&
1064                                         FileSync(seg->mdfd_vfd) >= 0)
1065                                         break;          /* success; break out of retry loop */
1066
1067                                 /*
1068                                  * XXX is there any point in allowing more than one retry?
1069                                  * Don't see one at the moment, but easy to change the
1070                                  * test here if so.
1071                                  */
1072                                 if (!FILE_POSSIBLY_DELETED(errno) ||
1073                                         failures > 0)
1074                                         ereport(ERROR,
1075                                                         (errcode_for_file_access(),
1076                                                          errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
1077                                                                         entry->tag.segno,
1078                                                                         entry->tag.rnode.spcNode,
1079                                                                         entry->tag.rnode.dbNode,
1080                                                                         entry->tag.rnode.relNode)));
1081                                 else
1082                                         ereport(DEBUG1,
1083                                                         (errcode_for_file_access(),
1084                                                          errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
1085                                                                         entry->tag.segno,
1086                                                                         entry->tag.rnode.spcNode,
1087                                                                         entry->tag.rnode.dbNode,
1088                                                                         entry->tag.rnode.relNode)));
1089
1090                                 /*
1091                                  * Absorb incoming requests and check to see if canceled.
1092                                  */
1093                                 AbsorbFsyncRequests();
1094                                 absorb_counter = FSYNCS_PER_ABSORB;     /* might as well... */
1095
1096                                 if (entry->canceled)
1097                                         break;
1098                         }       /* end retry loop */
1099                 }
1100
1101                 /*
1102                  * If we get here, either we fsync'd successfully, or we don't have
1103                  * to because enableFsync is off, or the entry is (now) marked
1104                  * canceled.  Okay to delete it.
1105                  */
1106                 if (hash_search(pendingOpsTable, &entry->tag,
1107                                                 HASH_REMOVE, NULL) == NULL)
1108                         elog(ERROR, "pendingOpsTable corrupted");
1109         }       /* end loop over hashtable entries */
1110
1111         /* Flag successful completion of mdsync */
1112         mdsync_in_progress = false;
1113 }
1114
1115 /*
1116  * mdpreckpt() -- Do pre-checkpoint work
1117  *
1118  * To distinguish unlink requests that arrived before this checkpoint
1119  * started from those that arrived during the checkpoint, we use a cycle
1120  * counter similar to the one we use for fsync requests. That cycle
1121  * counter is incremented here.
1122  *
1123  * This must be called *before* the checkpoint REDO point is determined.
1124  * That ensures that we won't delete files too soon.
1125  *
1126  * Note that we can't do anything here that depends on the assumption
1127  * that the checkpoint will be completed.
1128  */
1129 void
1130 mdpreckpt(void)
1131 {
1132         ListCell *cell;
1133
1134         /*
1135          * In case the prior checkpoint wasn't completed, stamp all entries in
1136          * the list with the current cycle counter.  Anything that's in the
1137          * list at the start of checkpoint can surely be deleted after the
1138          * checkpoint is finished, regardless of when the request was made.
1139          */
1140         foreach(cell, pendingUnlinks)
1141         {
1142                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
1143
1144                 entry->cycle_ctr = mdckpt_cycle_ctr;
1145         }
1146
1147         /*
1148          * Any unlink requests arriving after this point will be assigned the
1149          * next cycle counter, and won't be unlinked until next checkpoint.
1150          */
1151         mdckpt_cycle_ctr++;
1152 }
1153
1154 /*
1155  * mdpostckpt() -- Do post-checkpoint work
1156  *
1157  * Remove any lingering files that can now be safely removed.
1158  */
1159 void
1160 mdpostckpt(void)
1161 {
1162         while (pendingUnlinks != NIL)
1163         {
1164                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
1165                 char *path;
1166
1167                 /*
1168                  * New entries are appended to the end, so if the entry is new
1169                  * we've reached the end of old entries.
1170                  */
1171                 if (entry->cycle_ctr == mdsync_cycle_ctr)
1172                         break;
1173
1174                 /* Else assert we haven't missed it */
1175                 Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr);
1176
1177                 /* Unlink the file */
1178                 path = relpath(entry->rnode);
1179                 if (unlink(path) < 0)
1180                 {
1181                         /*
1182                          * ENOENT shouldn't happen either, but it doesn't really matter
1183                          * because we would've deleted it now anyway.
1184                          */
1185                         if (errno != ENOENT)
1186                                 ereport(WARNING,
1187                                                 (errcode_for_file_access(),
1188                                                  errmsg("could not remove relation %u/%u/%u: %m",
1189                                                                 entry->rnode.spcNode,
1190                                                                 entry->rnode.dbNode,
1191                                                                 entry->rnode.relNode)));
1192                 }
1193                 pfree(path);
1194
1195                 pendingUnlinks = list_delete_first(pendingUnlinks);
1196                 pfree(entry);
1197         }
1198 }
1199
1200 /*
1201  * register_dirty_segment() -- Mark a relation segment as needing fsync
1202  *
1203  * If there is a local pending-ops table, just make an entry in it for
1204  * mdsync to process later.  Otherwise, try to pass off the fsync request
1205  * to the background writer process.  If that fails, just do the fsync
1206  * locally before returning (we expect this will not happen often enough
1207  * to be a performance problem).
1208  */
1209 static void
1210 register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
1211 {
1212         if (pendingOpsTable)
1213         {
1214                 /* push it into local pending-ops table */
1215                 RememberFsyncRequest(reln->smgr_rnode, seg->mdfd_segno);
1216         }
1217         else
1218         {
1219                 if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
1220                         return;                         /* passed it off successfully */
1221
1222                 if (FileSync(seg->mdfd_vfd) < 0)
1223                         ereport(ERROR,
1224                                         (errcode_for_file_access(),
1225                                          errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
1226                                                         seg->mdfd_segno,
1227                                                         reln->smgr_rnode.spcNode,
1228                                                         reln->smgr_rnode.dbNode,
1229                                                         reln->smgr_rnode.relNode)));
1230         }
1231 }
1232
1233 /*
1234  * register_unlink() -- Schedule a file to be deleted after next checkpoint
1235  *
1236  * As with register_dirty_segment, this could involve either a local or
1237  * a remote pending-ops table.
1238  */
1239 static void
1240 register_unlink(RelFileNode rnode)
1241 {
1242         if (pendingOpsTable)
1243         {
1244                 /* push it into local pending-ops table */
1245                 RememberFsyncRequest(rnode, UNLINK_RELATION_REQUEST);
1246         }
1247         else
1248         {
1249                 /*
1250                  * Notify the bgwriter about it.  If we fail to queue the request
1251                  * message, we have to sleep and try again, because we can't simply
1252                  * delete the file now.  Ugly, but hopefully won't happen often.
1253                  *
1254                  * XXX should we just leave the file orphaned instead?
1255                  */
1256                 Assert(IsUnderPostmaster);
1257                 while (!ForwardFsyncRequest(rnode, UNLINK_RELATION_REQUEST))
1258                         pg_usleep(10000L);      /* 10 msec seems a good number */
1259         }
1260 }
1261
1262 /*
1263  * RememberFsyncRequest() -- callback from bgwriter side of fsync request
1264  *
1265  * We stuff most fsync requests into the local hash table for execution
1266  * during the bgwriter's next checkpoint.  UNLINK requests go into a
1267  * separate linked list, however, because they get processed separately.
1268  *
1269  * The range of possible segment numbers is way less than the range of
1270  * BlockNumber, so we can reserve high values of segno for special purposes.
1271  * We define three:
1272  * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation
1273  * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
1274  * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
1275  *   checkpoint.
1276  *
1277  * (Handling the FORGET_* requests is a tad slow because the hash table has
1278  * to be searched linearly, but it doesn't seem worth rethinking the table
1279  * structure for them.)
1280  */
1281 void
1282 RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
1283 {
1284         Assert(pendingOpsTable);
1285
1286         if (segno == FORGET_RELATION_FSYNC)
1287         {
1288                 /* Remove any pending requests for the entire relation */
1289                 HASH_SEQ_STATUS hstat;
1290                 PendingOperationEntry *entry;
1291
1292                 hash_seq_init(&hstat, pendingOpsTable);
1293                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1294                 {
1295                         if (RelFileNodeEquals(entry->tag.rnode, rnode))
1296                         {
1297                                 /* Okay, cancel this entry */
1298                                 entry->canceled = true;
1299                         }
1300                 }
1301         }
1302         else if (segno == FORGET_DATABASE_FSYNC)
1303         {
1304                 /* Remove any pending requests for the entire database */
1305                 HASH_SEQ_STATUS hstat;
1306                 PendingOperationEntry *entry;
1307
1308                 hash_seq_init(&hstat, pendingOpsTable);
1309                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1310                 {
1311                         if (entry->tag.rnode.dbNode == rnode.dbNode)
1312                         {
1313                                 /* Okay, cancel this entry */
1314                                 entry->canceled = true;
1315                         }
1316                 }
1317         }
1318         else if (segno == UNLINK_RELATION_REQUEST)
1319         {
1320                 /* Unlink request: put it in the linked list */
1321                 MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
1322                 PendingUnlinkEntry *entry;
1323
1324                 entry = palloc(sizeof(PendingUnlinkEntry));
1325                 entry->rnode = rnode;
1326                 entry->cycle_ctr = mdckpt_cycle_ctr;
1327
1328                 pendingUnlinks = lappend(pendingUnlinks, entry);
1329
1330                 MemoryContextSwitchTo(oldcxt);
1331         }
1332         else
1333         {
1334                 /* Normal case: enter a request to fsync this segment */
1335                 PendingOperationTag key;
1336                 PendingOperationEntry *entry;
1337                 bool            found;
1338
1339                 /* ensure any pad bytes in the hash key are zeroed */
1340                 MemSet(&key, 0, sizeof(key));
1341                 key.rnode = rnode;
1342                 key.segno = segno;
1343
1344                 entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
1345                                                                                                           &key,
1346                                                                                                           HASH_ENTER,
1347                                                                                                           &found);
1348                 /* if new or previously canceled entry, initialize it */
1349                 if (!found || entry->canceled)
1350                 {
1351                         entry->canceled = false;
1352                         entry->cycle_ctr = mdsync_cycle_ctr;
1353                 }
1354                 /*
1355                  * NB: it's intentional that we don't change cycle_ctr if the entry
1356                  * already exists.  The fsync request must be treated as old, even
1357                  * though the new request will be satisfied too by any subsequent
1358                  * fsync.
1359                  *
1360                  * However, if the entry is present but is marked canceled, we should
1361                  * act just as though it wasn't there.  The only case where this could
1362                  * happen would be if a file had been deleted, we received but did not
1363                  * yet act on the cancel request, and the same relfilenode was then
1364                  * assigned to a new file.  We mustn't lose the new request, but
1365                  * it should be considered new not old.
1366                  */
1367         }
1368 }
1369
1370 /*
1371  * ForgetRelationFsyncRequests -- ensure any fsyncs for a rel are forgotten
1372  */
1373 void
1374 ForgetRelationFsyncRequests(RelFileNode rnode)
1375 {
1376         if (pendingOpsTable)
1377         {
1378                 /* standalone backend or startup process: fsync state is local */
1379                 RememberFsyncRequest(rnode, FORGET_RELATION_FSYNC);
1380         }
1381         else if (IsUnderPostmaster)
1382         {
1383                 /*
1384                  * Notify the bgwriter about it.  If we fail to queue the revoke
1385                  * message, we have to sleep and try again ... ugly, but hopefully
1386                  * won't happen often.
1387                  *
1388                  * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with
1389                  * an error would leave the no-longer-used file still present on
1390                  * disk, which would be bad, so I'm inclined to assume that the
1391                  * bgwriter will always empty the queue soon.
1392                  */
1393                 while (!ForwardFsyncRequest(rnode, FORGET_RELATION_FSYNC))
1394                         pg_usleep(10000L);      /* 10 msec seems a good number */
1395                 /*
1396                  * Note we don't wait for the bgwriter to actually absorb the
1397                  * revoke message; see mdsync() for the implications.
1398                  */
1399         }
1400 }
1401
1402 /*
1403  * ForgetDatabaseFsyncRequests -- ensure any fsyncs for a DB are forgotten
1404  */
1405 void
1406 ForgetDatabaseFsyncRequests(Oid dbid)
1407 {
1408         RelFileNode rnode;
1409
1410         rnode.dbNode = dbid;
1411         rnode.spcNode = 0;
1412         rnode.relNode = 0;
1413
1414         if (pendingOpsTable)
1415         {
1416                 /* standalone backend or startup process: fsync state is local */
1417                 RememberFsyncRequest(rnode, FORGET_DATABASE_FSYNC);
1418         }
1419         else if (IsUnderPostmaster)
1420         {
1421                 /* see notes in ForgetRelationFsyncRequests */
1422                 while (!ForwardFsyncRequest(rnode, FORGET_DATABASE_FSYNC))
1423                         pg_usleep(10000L);      /* 10 msec seems a good number */
1424         }
1425 }
1426
1427
1428 /*
1429  *      _fdvec_alloc() -- Make a MdfdVec object.
1430  */
1431 static MdfdVec *
1432 _fdvec_alloc(void)
1433 {
1434         return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
1435 }
1436
1437 #ifndef LET_OS_MANAGE_FILESIZE
1438
1439 /*
1440  * Open the specified segment of the relation,
1441  * and make a MdfdVec object for it.  Returns NULL on failure.
1442  */
1443 static MdfdVec *
1444 _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
1445 {
1446         MdfdVec    *v;
1447         int                     fd;
1448         char       *path,
1449                            *fullpath;
1450
1451         path = relpath(reln->smgr_rnode);
1452
1453         if (segno > 0)
1454         {
1455                 /* be sure we have enough space for the '.segno' */
1456                 fullpath = (char *) palloc(strlen(path) + 12);
1457                 sprintf(fullpath, "%s.%u", path, segno);
1458                 pfree(path);
1459         }
1460         else
1461                 fullpath = path;
1462
1463         /* open the file */
1464         fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);
1465
1466         pfree(fullpath);
1467
1468         if (fd < 0)
1469                 return NULL;
1470
1471         /* allocate an mdfdvec entry for it */
1472         v = _fdvec_alloc();
1473
1474         /* fill the entry */
1475         v->mdfd_vfd = fd;
1476         v->mdfd_segno = segno;
1477         v->mdfd_chain = NULL;
1478         Assert(_mdnblocks(reln, v) <= ((BlockNumber) RELSEG_SIZE));
1479
1480         /* all done */
1481         return v;
1482 }
1483 #endif   /* LET_OS_MANAGE_FILESIZE */
1484
1485 /*
1486  *      _mdfd_getseg() -- Find the segment of the relation holding the
1487  *              specified block.
1488  *
1489  * If the segment doesn't exist, we ereport, return NULL, or create the
1490  * segment, according to "behavior".  Note: isTemp need only be correct
1491  * in the EXTENSION_CREATE case.
1492  */
1493 static MdfdVec *
1494 _mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool isTemp,
1495                          ExtensionBehavior behavior)
1496 {
1497         MdfdVec    *v = mdopen(reln, behavior);
1498
1499 #ifndef LET_OS_MANAGE_FILESIZE
1500         BlockNumber targetseg;
1501         BlockNumber nextsegno;
1502
1503         if (!v)
1504                 return NULL;                    /* only possible if EXTENSION_RETURN_NULL */
1505
1506         targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1507         for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
1508         {
1509                 Assert(nextsegno == v->mdfd_segno + 1);
1510
1511                 if (v->mdfd_chain == NULL)
1512                 {
1513                         /*
1514                          * Normally we will create new segments only if authorized by
1515                          * the caller (i.e., we are doing mdextend()).  But when doing
1516                          * WAL recovery, create segments anyway; this allows cases such as
1517                          * replaying WAL data that has a write into a high-numbered
1518                          * segment of a relation that was later deleted.  We want to go
1519                          * ahead and create the segments so we can finish out the replay.
1520                          *
1521                          * We have to maintain the invariant that segments before the
1522                          * last active segment are of size RELSEG_SIZE; therefore, pad
1523                          * them out with zeroes if needed.  (This only matters if caller
1524                          * is extending the relation discontiguously, but that can happen
1525                          * in hash indexes.)
1526                          */
1527                         if (behavior == EXTENSION_CREATE || InRecovery)
1528                         {
1529                                 if (_mdnblocks(reln, v) < RELSEG_SIZE)
1530                                 {
1531                                         char   *zerobuf = palloc0(BLCKSZ);
1532
1533                                         mdextend(reln, nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
1534                                                          zerobuf, isTemp);
1535                                         pfree(zerobuf);
1536                                 }
1537                                 v->mdfd_chain = _mdfd_openseg(reln, nextsegno, O_CREAT);
1538                         }
1539                         else
1540                         {
1541                                 /* We won't create segment if not existent */
1542                                 v->mdfd_chain = _mdfd_openseg(reln, nextsegno, 0);
1543                         }
1544                         if (v->mdfd_chain == NULL)
1545                         {
1546                                 if (behavior == EXTENSION_RETURN_NULL &&
1547                                         FILE_POSSIBLY_DELETED(errno))
1548                                         return NULL;
1549                                 ereport(ERROR,
1550                                                 (errcode_for_file_access(),
1551                                                  errmsg("could not open segment %u of relation %u/%u/%u (target block %u): %m",
1552                                                                 nextsegno,
1553                                                                 reln->smgr_rnode.spcNode,
1554                                                                 reln->smgr_rnode.dbNode,
1555                                                                 reln->smgr_rnode.relNode,
1556                                                                 blkno)));
1557                         }
1558                 }
1559                 v = v->mdfd_chain;
1560         }
1561 #endif
1562
1563         return v;
1564 }
1565
1566 /*
1567  * Get number of blocks present in a single disk file
1568  */
1569 static BlockNumber
1570 _mdnblocks(SMgrRelation reln, MdfdVec *seg)
1571 {
1572         long            len;
1573
1574         len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
1575         if (len < 0)
1576                 ereport(ERROR,
1577                                 (errcode_for_file_access(),
1578                                  errmsg("could not seek to end of segment %u of relation %u/%u/%u: %m",
1579                                                 seg->mdfd_segno,
1580                                                 reln->smgr_rnode.spcNode,
1581                                                 reln->smgr_rnode.dbNode,
1582                                                 reln->smgr_rnode.relNode)));
1583         /* note that this calculation will ignore any partial block at EOF */
1584         return (BlockNumber) (len / BLCKSZ);
1585 }