]> granicus.if.org Git - postgresql/blob - src/backend/storage/smgr/md.c
Use ftruncate() not truncate() in mdunlink. Seems Windows doesn't
[postgresql] / src / backend / storage / smgr / md.c
1 /*-------------------------------------------------------------------------
2  *
3  * md.c
4  *        This code manages relations that reside on magnetic disk.
5  *
6  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *        $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.132 2007/11/15 21:49:47 tgl Exp $
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16
17 #include <unistd.h>
18 #include <fcntl.h>
19 #include <sys/file.h>
20
21 #include "catalog/catalog.h"
22 #include "miscadmin.h"
23 #include "postmaster/bgwriter.h"
24 #include "storage/fd.h"
25 #include "storage/bufmgr.h"
26 #include "storage/smgr.h"
27 #include "utils/hsearch.h"
28 #include "utils/memutils.h"
29
30
/* mdsync's interval for calling AbsorbFsyncRequests */
#define FSYNCS_PER_ABSORB		10

/*
 * Reserved values for the segno argument of RememberFsyncRequest.  They are
 * taken from the top of the BlockNumber range, counting down from
 * InvalidBlockNumber.
 */
#define FORGET_RELATION_FSYNC	(InvalidBlockNumber)
#define FORGET_DATABASE_FSYNC	(InvalidBlockNumber - 1)
#define UNLINK_RELATION_REQUEST (InvalidBlockNumber - 2)
38
/*
 * FILE_POSSIBLY_DELETED(err) -- might this errno value mean "file is gone"?
 *
 * Everywhere, ENOENT qualifies.  On Windows, EACCES has to be accepted as
 * well, because that is what is reported for a file that has been unlinked
 * but has not yet actually disappeared.  Ugh.  Callers are written so that
 * such a case is not simply believed to be okay; further evidence is
 * required (namely, a pending fsync request getting revoked ... see mdsync).
 */
#ifdef WIN32
#define FILE_POSSIBLY_DELETED(err)	((err) == ENOENT || (err) == EACCES)
#else
#define FILE_POSSIBLY_DELETED(err)	((err) == ENOENT)
#endif
51
52 /*
53  *      The magnetic disk storage manager keeps track of open file
54  *      descriptors in its own descriptor pool.  This is done to make it
55  *      easier to support relations that are larger than the operating
56  *      system's file size limit (often 2GBytes).  In order to do that,
57  *      we break relations up into "segment" files that are each shorter than
58  *      the OS file size limit.  The segment size is set by the RELSEG_SIZE
59  *      configuration constant in pg_config_manual.h.
60  *
61  *      On disk, a relation must consist of consecutively numbered segment
62  *      files in the pattern
63  *              -- Zero or more full segments of exactly RELSEG_SIZE blocks each
64  *              -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
65  *              -- Optionally, any number of inactive segments of size 0 blocks.
66  *      The full and partial segments are collectively the "active" segments.
67  *      Inactive segments are those that once contained data but are currently
68  *      not needed because of an mdtruncate() operation.  The reason for leaving
69  *      them present at size zero, rather than unlinking them, is that other
70  *      backends and/or the bgwriter might be holding open file references to
71  *      such segments.  If the relation expands again after mdtruncate(), such
72  *      that a deactivated segment becomes active again, it is important that
73  *      such file references still be valid --- else data might get written
74  *      out to an unlinked old copy of a segment file that will eventually
75  *      disappear.
76  *
77  *      The file descriptor pointer (md_fd field) stored in the SMgrRelation
78  *      cache is, therefore, just the head of a list of MdfdVec objects, one
79  *      per segment.  But note the md_fd pointer can be NULL, indicating
80  *      relation not open.
81  *
82  *      Also note that mdfd_chain == NULL does not necessarily mean the relation
83  *      doesn't have another segment after this one; we may just not have
84  *      opened the next segment yet.  (We could not have "all segments are
85  *      in the chain" as an invariant anyway, since another backend could
86  *      extend the relation when we weren't looking.)  We do not make chain
87  *      entries for inactive segments, however; as soon as we find a partial
88  *      segment, we assume that any subsequent segments are inactive.
89  *
90  *      All MdfdVec objects are palloc'd in the MdCxt memory context.
91  *
92  *      Defining LET_OS_MANAGE_FILESIZE disables the segmentation logic,
93  *      for use on machines that support large files.  Beware that that
94  *      code has not been tested in a long time and is probably bit-rotted.
95  */
96
97 typedef struct _MdfdVec
98 {
99         File            mdfd_vfd;               /* fd number in fd.c's pool */
100         BlockNumber mdfd_segno;         /* segment number, from 0 */
101 #ifndef LET_OS_MANAGE_FILESIZE  /* for large relations */
102         struct _MdfdVec *mdfd_chain;    /* next segment, or NULL */
103 #endif
104 } MdfdVec;
105
106 static MemoryContext MdCxt;             /* context for all md.c allocations */
107
108
109 /*
110  * In some contexts (currently, standalone backends and the bgwriter process)
111  * we keep track of pending fsync operations: we need to remember all relation
112  * segments that have been written since the last checkpoint, so that we can
113  * fsync them down to disk before completing the next checkpoint.  This hash
114  * table remembers the pending operations.      We use a hash table mostly as
115  * a convenient way of eliminating duplicate requests.
116  *
117  * We use a similar mechanism to remember no-longer-needed files that can
118  * be deleted after the next checkpoint, but we use a linked list instead of
119  * a hash table, because we don't expect there to be any duplicate requests.
120  *
121  * (Regular backends do not track pending operations locally, but forward
122  * them to the bgwriter.)
123  */
124 typedef struct
125 {
126         RelFileNode rnode;                      /* the targeted relation */
127         BlockNumber segno;                      /* which segment */
128 }       PendingOperationTag;
129
130 typedef uint16 CycleCtr;                /* can be any convenient integer size */
131
132 typedef struct
133 {
134         PendingOperationTag tag;        /* hash table key (must be first!) */
135         bool            canceled;               /* T => request canceled, not yet removed */
136         CycleCtr        cycle_ctr;              /* mdsync_cycle_ctr when request was made */
137 } PendingOperationEntry;
138
139 typedef struct
140 {
141         RelFileNode rnode;                      /* the dead relation to delete */
142         CycleCtr        cycle_ctr;              /* mdckpt_cycle_ctr when request was made */
143 }       PendingUnlinkEntry;
144
145 static HTAB *pendingOpsTable = NULL;
146 static List *pendingUnlinks = NIL;
147
148 static CycleCtr mdsync_cycle_ctr = 0;
149 static CycleCtr mdckpt_cycle_ctr = 0;
150
151
/*
 * ExtensionBehavior -- tells mdopen and _mdfd_getseg what to do when a
 * segment is not present.
 */
typedef enum
{
	/* ereport if segment not present */
	EXTENSION_FAIL,

	/* return NULL if not present */
	EXTENSION_RETURN_NULL,

	/* create new segments as needed */
	EXTENSION_CREATE
} ExtensionBehavior;
158
159 /* local routines */
160 static MdfdVec *mdopen(SMgrRelation reln, ExtensionBehavior behavior);
161 static void register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
162 static void register_unlink(RelFileNode rnode);
163 static MdfdVec *_fdvec_alloc(void);
164
165 #ifndef LET_OS_MANAGE_FILESIZE
166 static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno,
167                           int oflags);
168 #endif
169 static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno,
170                          bool isTemp, ExtensionBehavior behavior);
171 static BlockNumber _mdnblocks(SMgrRelation reln, MdfdVec *seg);
172
173
174 /*
175  *      mdinit() -- Initialize private state for magnetic disk storage manager.
176  */
177 void
178 mdinit(void)
179 {
180         MdCxt = AllocSetContextCreate(TopMemoryContext,
181                                                                   "MdSmgr",
182                                                                   ALLOCSET_DEFAULT_MINSIZE,
183                                                                   ALLOCSET_DEFAULT_INITSIZE,
184                                                                   ALLOCSET_DEFAULT_MAXSIZE);
185
186         /*
187          * Create pending-operations hashtable if we need it.  Currently, we need
188          * it if we are standalone (not under a postmaster) OR if we are a
189          * bootstrap-mode subprocess of a postmaster (that is, a startup or
190          * bgwriter process).
191          */
192         if (!IsUnderPostmaster || IsBootstrapProcessingMode())
193         {
194                 HASHCTL         hash_ctl;
195
196                 MemSet(&hash_ctl, 0, sizeof(hash_ctl));
197                 hash_ctl.keysize = sizeof(PendingOperationTag);
198                 hash_ctl.entrysize = sizeof(PendingOperationEntry);
199                 hash_ctl.hash = tag_hash;
200                 hash_ctl.hcxt = MdCxt;
201                 pendingOpsTable = hash_create("Pending Ops Table",
202                                                                           100L,
203                                                                           &hash_ctl,
204                                                                    HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
205                 pendingUnlinks = NIL;
206         }
207 }
208
209 /*
210  *      mdcreate() -- Create a new relation on magnetic disk.
211  *
212  * If isRedo is true, it's okay for the relation to exist already.
213  */
214 void
215 mdcreate(SMgrRelation reln, bool isRedo)
216 {
217         char       *path;
218         File            fd;
219
220         if (isRedo && reln->md_fd != NULL)
221                 return;                                 /* created and opened already... */
222
223         Assert(reln->md_fd == NULL);
224
225         path = relpath(reln->smgr_rnode);
226
227         fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
228
229         if (fd < 0)
230         {
231                 int                     save_errno = errno;
232
233                 /*
234                  * During bootstrap, there are cases where a system relation will be
235                  * accessed (by internal backend processes) before the bootstrap
236                  * script nominally creates it.  Therefore, allow the file to exist
237                  * already, even if isRedo is not set.  (See also mdopen)
238                  */
239                 if (isRedo || IsBootstrapProcessingMode())
240                         fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
241                 if (fd < 0)
242                 {
243                         pfree(path);
244                         /* be sure to report the error reported by create, not open */
245                         errno = save_errno;
246                         ereport(ERROR,
247                                         (errcode_for_file_access(),
248                                          errmsg("could not create relation %u/%u/%u: %m",
249                                                         reln->smgr_rnode.spcNode,
250                                                         reln->smgr_rnode.dbNode,
251                                                         reln->smgr_rnode.relNode)));
252                 }
253         }
254
255         pfree(path);
256
257         reln->md_fd = _fdvec_alloc();
258
259         reln->md_fd->mdfd_vfd = fd;
260         reln->md_fd->mdfd_segno = 0;
261 #ifndef LET_OS_MANAGE_FILESIZE
262         reln->md_fd->mdfd_chain = NULL;
263 #endif
264 }
265
266 /*
267  *      mdunlink() -- Unlink a relation.
268  *
269  * Note that we're passed a RelFileNode --- by the time this is called,
270  * there won't be an SMgrRelation hashtable entry anymore.
271  *
272  * Actually, we don't unlink the first segment file of the relation, but
273  * just truncate it to zero length, and record a request to unlink it after
274  * the next checkpoint.  Additional segments can be unlinked immediately,
275  * however.  Leaving the empty file in place prevents that relfilenode
276  * number from being reused.  The scenario this protects us from is:
277  * 1. We delete a relation (and commit, and actually remove its file).
278  * 2. We create a new relation, which by chance gets the same relfilenode as
279  *        the just-deleted one (OIDs must've wrapped around for that to happen).
280  * 3. We crash before another checkpoint occurs.
281  * During replay, we would delete the file and then recreate it, which is fine
282  * if the contents of the file were repopulated by subsequent WAL entries.
283  * But if we didn't WAL-log insertions, but instead relied on fsyncing the
284  * file after populating it (as for instance CLUSTER and CREATE INDEX do),
285  * the contents of the file would be lost forever.      By leaving the empty file
286  * until after the next checkpoint, we prevent reassignment of the relfilenode
287  * number until it's safe, because relfilenode assignment skips over any
288  * existing file.
289  *
290  * If isRedo is true, it's okay for the relation to be already gone.
291  * Also, we should remove the file immediately instead of queuing a request
292  * for later, since during redo there's no possibility of creating a
293  * conflicting relation.
294  *
295  * Note: any failure should be reported as WARNING not ERROR, because
296  * we are usually not in a transaction anymore when this is called.
297  */
298 void
299 mdunlink(RelFileNode rnode, bool isRedo)
300 {
301         char       *path;
302         int                     ret;
303
304         /*
305          * We have to clean out any pending fsync requests for the doomed
306          * relation, else the next mdsync() will fail.
307          */
308         ForgetRelationFsyncRequests(rnode);
309
310         path = relpath(rnode);
311
312         /*
313          * Delete or truncate the first segment, or only segment if not doing
314          * segmenting
315          */
316         if (isRedo)
317                 ret = unlink(path);
318         else
319         {
320                 /* truncate(2) would be easier here, but Windows hasn't got it */
321                 int             fd;
322
323                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);
324                 if (fd >= 0)
325                 {
326                         int             save_errno;
327
328                         ret = ftruncate(fd, 0);
329                         save_errno = errno;
330                         close(fd);
331                         errno = save_errno;
332                 }
333                 else
334                         ret = -1;
335         }
336         if (ret < 0)
337         {
338                 if (!isRedo || errno != ENOENT)
339                         ereport(WARNING,
340                                         (errcode_for_file_access(),
341                                          errmsg("could not remove relation %u/%u/%u: %m",
342                                                         rnode.spcNode,
343                                                         rnode.dbNode,
344                                                         rnode.relNode)));
345         }
346
347 #ifndef LET_OS_MANAGE_FILESIZE
348         /* Delete the additional segments, if any */
349         else
350         {
351                 char       *segpath = (char *) palloc(strlen(path) + 12);
352                 BlockNumber segno;
353
354                 /*
355                  * Note that because we loop until getting ENOENT, we will correctly
356                  * remove all inactive segments as well as active ones.
357                  */
358                 for (segno = 1;; segno++)
359                 {
360                         sprintf(segpath, "%s.%u", path, segno);
361                         if (unlink(segpath) < 0)
362                         {
363                                 /* ENOENT is expected after the last segment... */
364                                 if (errno != ENOENT)
365                                         ereport(WARNING,
366                                                         (errcode_for_file_access(),
367                                                          errmsg("could not remove segment %u of relation %u/%u/%u: %m",
368                                                                         segno,
369                                                                         rnode.spcNode,
370                                                                         rnode.dbNode,
371                                                                         rnode.relNode)));
372                                 break;
373                         }
374                 }
375                 pfree(segpath);
376         }
377 #endif
378
379         pfree(path);
380
381         /* Register request to unlink first segment later */
382         if (!isRedo)
383                 register_unlink(rnode);
384 }
385
386 /*
387  *      mdextend() -- Add a block to the specified relation.
388  *
389  *              The semantics are nearly the same as mdwrite(): write at the
390  *              specified position.  However, this is to be used for the case of
391  *              extending a relation (i.e., blocknum is at or beyond the current
392  *              EOF).  Note that we assume writing a block beyond current EOF
393  *              causes intervening file space to become filled with zeroes.
394  */
395 void
396 mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
397 {
398         long            seekpos;
399         int                     nbytes;
400         MdfdVec    *v;
401
402         /* This assert is too expensive to have on normally ... */
403 #ifdef CHECK_WRITE_VS_EXTEND
404         Assert(blocknum >= mdnblocks(reln));
405 #endif
406
407         /*
408          * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
409          * more --- we mustn't create a block whose number actually is
410          * InvalidBlockNumber.
411          */
412         if (blocknum == InvalidBlockNumber)
413                 ereport(ERROR,
414                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
415                                  errmsg("cannot extend relation %u/%u/%u beyond %u blocks",
416                                                 reln->smgr_rnode.spcNode,
417                                                 reln->smgr_rnode.dbNode,
418                                                 reln->smgr_rnode.relNode,
419                                                 InvalidBlockNumber)));
420
421         v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_CREATE);
422
423 #ifndef LET_OS_MANAGE_FILESIZE
424         seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
425         Assert(seekpos < BLCKSZ * RELSEG_SIZE);
426 #else
427         seekpos = (long) (BLCKSZ * (blocknum));
428 #endif
429
430         /*
431          * Note: because caller usually obtained blocknum by calling mdnblocks,
432          * which did a seek(SEEK_END), this seek is often redundant and will be
433          * optimized away by fd.c.      It's not redundant, however, if there is a
434          * partial page at the end of the file. In that case we want to try to
435          * overwrite the partial page with a full page.  It's also not redundant
436          * if bufmgr.c had to dump another buffer of the same file to make room
437          * for the new page's buffer.
438          */
439         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
440                 ereport(ERROR,
441                                 (errcode_for_file_access(),
442                                  errmsg("could not seek to block %u of relation %u/%u/%u: %m",
443                                                 blocknum,
444                                                 reln->smgr_rnode.spcNode,
445                                                 reln->smgr_rnode.dbNode,
446                                                 reln->smgr_rnode.relNode)));
447
448         if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
449         {
450                 if (nbytes < 0)
451                         ereport(ERROR,
452                                         (errcode_for_file_access(),
453                                          errmsg("could not extend relation %u/%u/%u: %m",
454                                                         reln->smgr_rnode.spcNode,
455                                                         reln->smgr_rnode.dbNode,
456                                                         reln->smgr_rnode.relNode),
457                                          errhint("Check free disk space.")));
458                 /* short write: complain appropriately */
459                 ereport(ERROR,
460                                 (errcode(ERRCODE_DISK_FULL),
461                                  errmsg("could not extend relation %u/%u/%u: wrote only %d of %d bytes at block %u",
462                                                 reln->smgr_rnode.spcNode,
463                                                 reln->smgr_rnode.dbNode,
464                                                 reln->smgr_rnode.relNode,
465                                                 nbytes, BLCKSZ, blocknum),
466                                  errhint("Check free disk space.")));
467         }
468
469         if (!isTemp)
470                 register_dirty_segment(reln, v);
471
472 #ifndef LET_OS_MANAGE_FILESIZE
473         Assert(_mdnblocks(reln, v) <= ((BlockNumber) RELSEG_SIZE));
474 #endif
475 }
476
477 /*
478  *      mdopen() -- Open the specified relation.
479  *
480  * Note we only open the first segment, when there are multiple segments.
481  *
482  * If first segment is not present, either ereport or return NULL according
483  * to "behavior".  We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
484  * EXTENSION_CREATE means it's OK to extend an existing relation, not to
485  * invent one out of whole cloth.
486  */
487 static MdfdVec *
488 mdopen(SMgrRelation reln, ExtensionBehavior behavior)
489 {
490         MdfdVec    *mdfd;
491         char       *path;
492         File            fd;
493
494         /* No work if already open */
495         if (reln->md_fd)
496                 return reln->md_fd;
497
498         path = relpath(reln->smgr_rnode);
499
500         fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
501
502         if (fd < 0)
503         {
504                 /*
505                  * During bootstrap, there are cases where a system relation will be
506                  * accessed (by internal backend processes) before the bootstrap
507                  * script nominally creates it.  Therefore, accept mdopen() as a
508                  * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
509                  */
510                 if (IsBootstrapProcessingMode())
511                         fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
512                 if (fd < 0)
513                 {
514                         pfree(path);
515                         if (behavior == EXTENSION_RETURN_NULL &&
516                                 FILE_POSSIBLY_DELETED(errno))
517                                 return NULL;
518                         ereport(ERROR,
519                                         (errcode_for_file_access(),
520                                          errmsg("could not open relation %u/%u/%u: %m",
521                                                         reln->smgr_rnode.spcNode,
522                                                         reln->smgr_rnode.dbNode,
523                                                         reln->smgr_rnode.relNode)));
524                 }
525         }
526
527         pfree(path);
528
529         reln->md_fd = mdfd = _fdvec_alloc();
530
531         mdfd->mdfd_vfd = fd;
532         mdfd->mdfd_segno = 0;
533 #ifndef LET_OS_MANAGE_FILESIZE
534         mdfd->mdfd_chain = NULL;
535         Assert(_mdnblocks(reln, mdfd) <= ((BlockNumber) RELSEG_SIZE));
536 #endif
537
538         return mdfd;
539 }
540
541 /*
542  *      mdclose() -- Close the specified relation, if it isn't closed already.
543  */
544 void
545 mdclose(SMgrRelation reln)
546 {
547         MdfdVec    *v = reln->md_fd;
548
549         /* No work if already closed */
550         if (v == NULL)
551                 return;
552
553         reln->md_fd = NULL;                     /* prevent dangling pointer after error */
554
555 #ifndef LET_OS_MANAGE_FILESIZE
556         while (v != NULL)
557         {
558                 MdfdVec    *ov = v;
559
560                 /* if not closed already */
561                 if (v->mdfd_vfd >= 0)
562                         FileClose(v->mdfd_vfd);
563                 /* Now free vector */
564                 v = v->mdfd_chain;
565                 pfree(ov);
566         }
567 #else
568         if (v->mdfd_vfd >= 0)
569                 FileClose(v->mdfd_vfd);
570         pfree(v);
571 #endif
572 }
573
574 /*
575  *      mdread() -- Read the specified block from a relation.
576  */
577 void
578 mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
579 {
580         long            seekpos;
581         int                     nbytes;
582         MdfdVec    *v;
583
584         v = _mdfd_getseg(reln, blocknum, false, EXTENSION_FAIL);
585
586 #ifndef LET_OS_MANAGE_FILESIZE
587         seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
588         Assert(seekpos < BLCKSZ * RELSEG_SIZE);
589 #else
590         seekpos = (long) (BLCKSZ * (blocknum));
591 #endif
592
593         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
594                 ereport(ERROR,
595                                 (errcode_for_file_access(),
596                                  errmsg("could not seek to block %u of relation %u/%u/%u: %m",
597                                                 blocknum,
598                                                 reln->smgr_rnode.spcNode,
599                                                 reln->smgr_rnode.dbNode,
600                                                 reln->smgr_rnode.relNode)));
601
602         if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
603         {
604                 if (nbytes < 0)
605                         ereport(ERROR,
606                                         (errcode_for_file_access(),
607                                    errmsg("could not read block %u of relation %u/%u/%u: %m",
608                                                   blocknum,
609                                                   reln->smgr_rnode.spcNode,
610                                                   reln->smgr_rnode.dbNode,
611                                                   reln->smgr_rnode.relNode)));
612
613                 /*
614                  * Short read: we are at or past EOF, or we read a partial block at
615                  * EOF.  Normally this is an error; upper levels should never try to
616                  * read a nonexistent block.  However, if zero_damaged_pages is ON or
617                  * we are InRecovery, we should instead return zeroes without
618                  * complaining.  This allows, for example, the case of trying to
619                  * update a block that was later truncated away.
620                  */
621                 if (zero_damaged_pages || InRecovery)
622                         MemSet(buffer, 0, BLCKSZ);
623                 else
624                         ereport(ERROR,
625                                         (errcode(ERRCODE_DATA_CORRUPTED),
626                                          errmsg("could not read block %u of relation %u/%u/%u: read only %d of %d bytes",
627                                                         blocknum,
628                                                         reln->smgr_rnode.spcNode,
629                                                         reln->smgr_rnode.dbNode,
630                                                         reln->smgr_rnode.relNode,
631                                                         nbytes, BLCKSZ)));
632         }
633 }
634
635 /*
636  *      mdwrite() -- Write the supplied block at the appropriate location.
637  *
638  *              This is to be used only for updating already-existing blocks of a
639  *              relation (ie, those before the current EOF).  To extend a relation,
640  *              use mdextend().
641  */
642 void
643 mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
644 {
645         long            seekpos;
646         int                     nbytes;
647         MdfdVec    *v;
648
649         /* This assert is too expensive to have on normally ... */
650 #ifdef CHECK_WRITE_VS_EXTEND
651         Assert(blocknum < mdnblocks(reln));
652 #endif
653
654         v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_FAIL);
655
656 #ifndef LET_OS_MANAGE_FILESIZE
657         seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
658         Assert(seekpos < BLCKSZ * RELSEG_SIZE);
659 #else
660         seekpos = (long) (BLCKSZ * (blocknum));
661 #endif
662
663         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
664                 ereport(ERROR,
665                                 (errcode_for_file_access(),
666                                  errmsg("could not seek to block %u of relation %u/%u/%u: %m",
667                                                 blocknum,
668                                                 reln->smgr_rnode.spcNode,
669                                                 reln->smgr_rnode.dbNode,
670                                                 reln->smgr_rnode.relNode)));
671
672         if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
673         {
674                 if (nbytes < 0)
675                         ereport(ERROR,
676                                         (errcode_for_file_access(),
677                                   errmsg("could not write block %u of relation %u/%u/%u: %m",
678                                                  blocknum,
679                                                  reln->smgr_rnode.spcNode,
680                                                  reln->smgr_rnode.dbNode,
681                                                  reln->smgr_rnode.relNode)));
682                 /* short write: complain appropriately */
683                 ereport(ERROR,
684                                 (errcode(ERRCODE_DISK_FULL),
685                                  errmsg("could not write block %u of relation %u/%u/%u: wrote only %d of %d bytes",
686                                                 blocknum,
687                                                 reln->smgr_rnode.spcNode,
688                                                 reln->smgr_rnode.dbNode,
689                                                 reln->smgr_rnode.relNode,
690                                                 nbytes, BLCKSZ),
691                                  errhint("Check free disk space.")));
692         }
693
694         if (!isTemp)
695                 register_dirty_segment(reln, v);
696 }
697
698 /*
699  *      mdnblocks() -- Get the number of blocks stored in a relation.
700  *
701  *              Important side effect: all active segments of the relation are opened
702  *              and added to the mdfd_chain list.  If this routine has not been
703  *              called, then only segments up to the last one actually touched
704  *              are present in the chain.
705  */
706 BlockNumber
707 mdnblocks(SMgrRelation reln)
708 {
709         MdfdVec    *v = mdopen(reln, EXTENSION_FAIL);
710
711 #ifndef LET_OS_MANAGE_FILESIZE
712         BlockNumber nblocks;
713         BlockNumber segno = 0;
714
715         /*
716          * Skip through any segments that aren't the last one, to avoid redundant
717          * seeks on them.  We have previously verified that these segments are
718          * exactly RELSEG_SIZE long, and it's useless to recheck that each time.
719          *
720          * NOTE: this assumption could only be wrong if another backend has
721          * truncated the relation.      We rely on higher code levels to handle that
722          * scenario by closing and re-opening the md fd, which is handled via
723          * relcache flush.      (Since the bgwriter doesn't participate in relcache
724          * flush, it could have segment chain entries for inactive segments;
725          * that's OK because the bgwriter never needs to compute relation size.)
726          */
727         while (v->mdfd_chain != NULL)
728         {
729                 segno++;
730                 v = v->mdfd_chain;
731         }
732
733         for (;;)
734         {
735                 nblocks = _mdnblocks(reln, v);
736                 if (nblocks > ((BlockNumber) RELSEG_SIZE))
737                         elog(FATAL, "segment too big");
738                 if (nblocks < ((BlockNumber) RELSEG_SIZE))
739                         return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
740
741                 /*
742                  * If segment is exactly RELSEG_SIZE, advance to next one.
743                  */
744                 segno++;
745
746                 if (v->mdfd_chain == NULL)
747                 {
748                         /*
749                          * Because we pass O_CREAT, we will create the next segment (with
750                          * zero length) immediately, if the last segment is of length
751                          * RELSEG_SIZE.  While perhaps not strictly necessary, this keeps
752                          * the logic simple.
753                          */
754                         v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT);
755                         if (v->mdfd_chain == NULL)
756                                 ereport(ERROR,
757                                                 (errcode_for_file_access(),
758                                  errmsg("could not open segment %u of relation %u/%u/%u: %m",
759                                                 segno,
760                                                 reln->smgr_rnode.spcNode,
761                                                 reln->smgr_rnode.dbNode,
762                                                 reln->smgr_rnode.relNode)));
763                 }
764
765                 v = v->mdfd_chain;
766         }
767 #else
768         return _mdnblocks(reln, v);
769 #endif
770 }
771
772 /*
773  *      mdtruncate() -- Truncate relation to specified number of blocks.
774  */
775 void
776 mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
777 {
778         MdfdVec    *v;
779         BlockNumber curnblk;
780
781 #ifndef LET_OS_MANAGE_FILESIZE
782         BlockNumber priorblocks;
783 #endif
784
785         /*
786          * NOTE: mdnblocks makes sure we have opened all active segments, so that
787          * truncation loop will get them all!
788          */
789         curnblk = mdnblocks(reln);
790         if (nblocks > curnblk)
791         {
792                 /* Bogus request ... but no complaint if InRecovery */
793                 if (InRecovery)
794                         return;
795                 ereport(ERROR,
796                                 (errmsg("could not truncate relation %u/%u/%u to %u blocks: it's only %u blocks now",
797                                                 reln->smgr_rnode.spcNode,
798                                                 reln->smgr_rnode.dbNode,
799                                                 reln->smgr_rnode.relNode,
800                                                 nblocks, curnblk)));
801         }
802         if (nblocks == curnblk)
803                 return;                                 /* no work */
804
805         v = mdopen(reln, EXTENSION_FAIL);
806
807 #ifndef LET_OS_MANAGE_FILESIZE
808         priorblocks = 0;
809         while (v != NULL)
810         {
811                 MdfdVec    *ov = v;
812
813                 if (priorblocks > nblocks)
814                 {
815                         /*
816                          * This segment is no longer active (and has already been unlinked
817                          * from the mdfd_chain). We truncate the file, but do not delete
818                          * it, for reasons explained in the header comments.
819                          */
820                         if (FileTruncate(v->mdfd_vfd, 0) < 0)
821                                 ereport(ERROR,
822                                                 (errcode_for_file_access(),
823                                                  errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
824                                                                 reln->smgr_rnode.spcNode,
825                                                                 reln->smgr_rnode.dbNode,
826                                                                 reln->smgr_rnode.relNode,
827                                                                 nblocks)));
828                         if (!isTemp)
829                                 register_dirty_segment(reln, v);
830                         v = v->mdfd_chain;
831                         Assert(ov != reln->md_fd);      /* we never drop the 1st segment */
832                         pfree(ov);
833                 }
834                 else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
835                 {
836                         /*
837                          * This is the last segment we want to keep. Truncate the file to
838                          * the right length, and clear chain link that points to any
839                          * remaining segments (which we shall zap). NOTE: if nblocks is
840                          * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
841                          * segment to 0 length but keep it. This adheres to the invariant
842                          * given in the header comments.
843                          */
844                         BlockNumber lastsegblocks = nblocks - priorblocks;
845
846                         if (FileTruncate(v->mdfd_vfd, lastsegblocks * BLCKSZ) < 0)
847                                 ereport(ERROR,
848                                                 (errcode_for_file_access(),
849                                                  errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
850                                                                 reln->smgr_rnode.spcNode,
851                                                                 reln->smgr_rnode.dbNode,
852                                                                 reln->smgr_rnode.relNode,
853                                                                 nblocks)));
854                         if (!isTemp)
855                                 register_dirty_segment(reln, v);
856                         v = v->mdfd_chain;
857                         ov->mdfd_chain = NULL;
858                 }
859                 else
860                 {
861                         /*
862                          * We still need this segment and 0 or more blocks beyond it, so
863                          * nothing to do here.
864                          */
865                         v = v->mdfd_chain;
866                 }
867                 priorblocks += RELSEG_SIZE;
868         }
869 #else
870         if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0)
871                 ereport(ERROR,
872                                 (errcode_for_file_access(),
873                           errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
874                                          reln->smgr_rnode.spcNode,
875                                          reln->smgr_rnode.dbNode,
876                                          reln->smgr_rnode.relNode,
877                                          nblocks)));
878         if (!isTemp)
879                 register_dirty_segment(reln, v);
880 #endif
881 }
882
883 /*
884  *      mdimmedsync() -- Immediately sync a relation to stable storage.
885  *
886  * Note that only writes already issued are synced; this routine knows
887  * nothing of dirty buffers that may exist inside the buffer manager.
888  */
889 void
890 mdimmedsync(SMgrRelation reln)
891 {
892         MdfdVec    *v;
893         BlockNumber curnblk;
894
895         /*
896          * NOTE: mdnblocks makes sure we have opened all active segments, so that
897          * fsync loop will get them all!
898          */
899         curnblk = mdnblocks(reln);
900
901         v = mdopen(reln, EXTENSION_FAIL);
902
903 #ifndef LET_OS_MANAGE_FILESIZE
904         while (v != NULL)
905         {
906                 if (FileSync(v->mdfd_vfd) < 0)
907                         ereport(ERROR,
908                                         (errcode_for_file_access(),
909                                 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
910                                            v->mdfd_segno,
911                                            reln->smgr_rnode.spcNode,
912                                            reln->smgr_rnode.dbNode,
913                                            reln->smgr_rnode.relNode)));
914                 v = v->mdfd_chain;
915         }
916 #else
917         if (FileSync(v->mdfd_vfd) < 0)
918                 ereport(ERROR,
919                                 (errcode_for_file_access(),
920                                  errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
921                                                 v->mdfd_segno,
922                                                 reln->smgr_rnode.spcNode,
923                                                 reln->smgr_rnode.dbNode,
924                                                 reln->smgr_rnode.relNode)));
925 #endif
926 }
927
928 /*
929  *      mdsync() -- Sync previous writes to stable storage.
930  */
931 void
932 mdsync(void)
933 {
934         static bool mdsync_in_progress = false;
935
936         HASH_SEQ_STATUS hstat;
937         PendingOperationEntry *entry;
938         int                     absorb_counter;
939
940         /*
941          * This is only called during checkpoints, and checkpoints should only
942          * occur in processes that have created a pendingOpsTable.
943          */
944         if (!pendingOpsTable)
945                 elog(ERROR, "cannot sync without a pendingOpsTable");
946
947         /*
948          * If we are in the bgwriter, the sync had better include all fsync
949          * requests that were queued by backends up to this point.      The tightest
950          * race condition that could occur is that a buffer that must be written
951          * and fsync'd for the checkpoint could have been dumped by a backend just
952          * before it was visited by BufferSync().  We know the backend will have
953          * queued an fsync request before clearing the buffer's dirtybit, so we
954          * are safe as long as we do an Absorb after completing BufferSync().
955          */
956         AbsorbFsyncRequests();
957
958         /*
959          * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
960          * checkpoint), we want to ignore fsync requests that are entered into the
961          * hashtable after this point --- they should be processed next time,
962          * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
963          * ones: new ones will have cycle_ctr equal to the incremented value of
964          * mdsync_cycle_ctr.
965          *
966          * In normal circumstances, all entries present in the table at this point
967          * will have cycle_ctr exactly equal to the current (about to be old)
968          * value of mdsync_cycle_ctr.  However, if we fail partway through the
969          * fsync'ing loop, then older values of cycle_ctr might remain when we
970          * come back here to try again.  Repeated checkpoint failures would
971          * eventually wrap the counter around to the point where an old entry
972          * might appear new, causing us to skip it, possibly allowing a checkpoint
973          * to succeed that should not have.  To forestall wraparound, any time the
974          * previous mdsync() failed to complete, run through the table and
975          * forcibly set cycle_ctr = mdsync_cycle_ctr.
976          *
977          * Think not to merge this loop with the main loop, as the problem is
978          * exactly that that loop may fail before having visited all the entries.
979          * From a performance point of view it doesn't matter anyway, as this path
980          * will never be taken in a system that's functioning normally.
981          */
982         if (mdsync_in_progress)
983         {
984                 /* prior try failed, so update any stale cycle_ctr values */
985                 hash_seq_init(&hstat, pendingOpsTable);
986                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
987                 {
988                         entry->cycle_ctr = mdsync_cycle_ctr;
989                 }
990         }
991
992         /* Advance counter so that new hashtable entries are distinguishable */
993         mdsync_cycle_ctr++;
994
995         /* Set flag to detect failure if we don't reach the end of the loop */
996         mdsync_in_progress = true;
997
998         /* Now scan the hashtable for fsync requests to process */
999         absorb_counter = FSYNCS_PER_ABSORB;
1000         hash_seq_init(&hstat, pendingOpsTable);
1001         while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1002         {
1003                 /*
1004                  * If the entry is new then don't process it this time.  Note that
1005                  * "continue" bypasses the hash-remove call at the bottom of the loop.
1006                  */
1007                 if (entry->cycle_ctr == mdsync_cycle_ctr)
1008                         continue;
1009
1010                 /* Else assert we haven't missed it */
1011                 Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);
1012
1013                 /*
1014                  * If fsync is off then we don't have to bother opening the file at
1015                  * all.  (We delay checking until this point so that changing fsync on
1016                  * the fly behaves sensibly.)  Also, if the entry is marked canceled,
1017                  * fall through to delete it.
1018                  */
1019                 if (enableFsync && !entry->canceled)
1020                 {
1021                         int                     failures;
1022
1023                         /*
1024                          * If in bgwriter, we want to absorb pending requests every so
1025                          * often to prevent overflow of the fsync request queue.  It is
1026                          * unspecified whether newly-added entries will be visited by
1027                          * hash_seq_search, but we don't care since we don't need to
1028                          * process them anyway.
1029                          */
1030                         if (--absorb_counter <= 0)
1031                         {
1032                                 AbsorbFsyncRequests();
1033                                 absorb_counter = FSYNCS_PER_ABSORB;
1034                         }
1035
1036                         /*
1037                          * The fsync table could contain requests to fsync segments that
1038                          * have been deleted (unlinked) by the time we get to them. Rather
1039                          * than just hoping an ENOENT (or EACCES on Windows) error can be
1040                          * ignored, what we do on error is absorb pending requests and
1041                          * then retry.  Since mdunlink() queues a "revoke" message before
1042                          * actually unlinking, the fsync request is guaranteed to be
1043                          * marked canceled after the absorb if it really was this case.
1044                          * DROP DATABASE likewise has to tell us to forget fsync requests
1045                          * before it starts deletions.
1046                          */
1047                         for (failures = 0;; failures++)         /* loop exits at "break" */
1048                         {
1049                                 SMgrRelation reln;
1050                                 MdfdVec    *seg;
1051
1052                                 /*
1053                                  * Find or create an smgr hash entry for this relation. This
1054                                  * may seem a bit unclean -- md calling smgr?  But it's really
1055                                  * the best solution.  It ensures that the open file reference
1056                                  * isn't permanently leaked if we get an error here. (You may
1057                                  * say "but an unreferenced SMgrRelation is still a leak!" Not
1058                                  * really, because the only case in which a checkpoint is done
1059                                  * by a process that isn't about to shut down is in the
1060                                  * bgwriter, and it will periodically do smgrcloseall(). This
1061                                  * fact justifies our not closing the reln in the success path
1062                                  * either, which is a good thing since in non-bgwriter cases
1063                                  * we couldn't safely do that.)  Furthermore, in many cases
1064                                  * the relation will have been dirtied through this same smgr
1065                                  * relation, and so we can save a file open/close cycle.
1066                                  */
1067                                 reln = smgropen(entry->tag.rnode);
1068
1069                                 /*
1070                                  * It is possible that the relation has been dropped or
1071                                  * truncated since the fsync request was entered.  Therefore,
1072                                  * allow ENOENT, but only if we didn't fail already on this
1073                                  * file.  This applies both during _mdfd_getseg() and during
1074                                  * FileSync, since fd.c might have closed the file behind our
1075                                  * back.
1076                                  */
1077                                 seg = _mdfd_getseg(reln,
1078                                                           entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
1079                                                                    false, EXTENSION_RETURN_NULL);
1080                                 if (seg != NULL &&
1081                                         FileSync(seg->mdfd_vfd) >= 0)
1082                                         break;          /* success; break out of retry loop */
1083
1084                                 /*
1085                                  * XXX is there any point in allowing more than one retry?
1086                                  * Don't see one at the moment, but easy to change the test
1087                                  * here if so.
1088                                  */
1089                                 if (!FILE_POSSIBLY_DELETED(errno) ||
1090                                         failures > 0)
1091                                         ereport(ERROR,
1092                                                         (errcode_for_file_access(),
1093                                                          errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
1094                                                                         entry->tag.segno,
1095                                                                         entry->tag.rnode.spcNode,
1096                                                                         entry->tag.rnode.dbNode,
1097                                                                         entry->tag.rnode.relNode)));
1098                                 else
1099                                         ereport(DEBUG1,
1100                                                         (errcode_for_file_access(),
1101                                                          errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
1102                                                                         entry->tag.segno,
1103                                                                         entry->tag.rnode.spcNode,
1104                                                                         entry->tag.rnode.dbNode,
1105                                                                         entry->tag.rnode.relNode)));
1106
1107                                 /*
1108                                  * Absorb incoming requests and check to see if canceled.
1109                                  */
1110                                 AbsorbFsyncRequests();
1111                                 absorb_counter = FSYNCS_PER_ABSORB;             /* might as well... */
1112
1113                                 if (entry->canceled)
1114                                         break;
1115                         }                                       /* end retry loop */
1116                 }
1117
1118                 /*
1119                  * If we get here, either we fsync'd successfully, or we don't have to
1120                  * because enableFsync is off, or the entry is (now) marked canceled.
1121                  * Okay to delete it.
1122                  */
1123                 if (hash_search(pendingOpsTable, &entry->tag,
1124                                                 HASH_REMOVE, NULL) == NULL)
1125                         elog(ERROR, "pendingOpsTable corrupted");
1126         }                                                       /* end loop over hashtable entries */
1127
1128         /* Flag successful completion of mdsync */
1129         mdsync_in_progress = false;
1130 }
1131
1132 /*
1133  * mdpreckpt() -- Do pre-checkpoint work
1134  *
1135  * To distinguish unlink requests that arrived before this checkpoint
1136  * started from those that arrived during the checkpoint, we use a cycle
1137  * counter similar to the one we use for fsync requests. That cycle
1138  * counter is incremented here.
1139  *
1140  * This must be called *before* the checkpoint REDO point is determined.
1141  * That ensures that we won't delete files too soon.
1142  *
1143  * Note that we can't do anything here that depends on the assumption
1144  * that the checkpoint will be completed.
1145  */
1146 void
1147 mdpreckpt(void)
1148 {
1149         ListCell   *cell;
1150
1151         /*
1152          * In case the prior checkpoint wasn't completed, stamp all entries in the
1153          * list with the current cycle counter.  Anything that's in the list at
1154          * the start of checkpoint can surely be deleted after the checkpoint is
1155          * finished, regardless of when the request was made.
1156          */
1157         foreach(cell, pendingUnlinks)
1158         {
1159                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
1160
1161                 entry->cycle_ctr = mdckpt_cycle_ctr;
1162         }
1163
1164         /*
1165          * Any unlink requests arriving after this point will be assigned the next
1166          * cycle counter, and won't be unlinked until next checkpoint.
1167          */
1168         mdckpt_cycle_ctr++;
1169 }
1170
1171 /*
1172  * mdpostckpt() -- Do post-checkpoint work
1173  *
1174  * Remove any lingering files that can now be safely removed.
1175  */
1176 void
1177 mdpostckpt(void)
1178 {
1179         while (pendingUnlinks != NIL)
1180         {
1181                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
1182                 char       *path;
1183
1184                 /*
1185                  * New entries are appended to the end, so if the entry is new we've
1186                  * reached the end of old entries.
1187                  */
1188                 if (entry->cycle_ctr == mdsync_cycle_ctr)
1189                         break;
1190
1191                 /* Else assert we haven't missed it */
1192                 Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr);
1193
1194                 /* Unlink the file */
1195                 path = relpath(entry->rnode);
1196                 if (unlink(path) < 0)
1197                 {
1198                         /*
1199                          * ENOENT shouldn't happen either, but it doesn't really matter
1200                          * because we would've deleted it now anyway.
1201                          */
1202                         if (errno != ENOENT)
1203                                 ereport(WARNING,
1204                                                 (errcode_for_file_access(),
1205                                                  errmsg("could not remove relation %u/%u/%u: %m",
1206                                                                 entry->rnode.spcNode,
1207                                                                 entry->rnode.dbNode,
1208                                                                 entry->rnode.relNode)));
1209                 }
1210                 pfree(path);
1211
1212                 pendingUnlinks = list_delete_first(pendingUnlinks);
1213                 pfree(entry);
1214         }
1215 }
1216
1217 /*
1218  * register_dirty_segment() -- Mark a relation segment as needing fsync
1219  *
1220  * If there is a local pending-ops table, just make an entry in it for
1221  * mdsync to process later.  Otherwise, try to pass off the fsync request
1222  * to the background writer process.  If that fails, just do the fsync
1223  * locally before returning (we expect this will not happen often enough
1224  * to be a performance problem).
1225  */
1226 static void
1227 register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
1228 {
1229         if (pendingOpsTable)
1230         {
1231                 /* push it into local pending-ops table */
1232                 RememberFsyncRequest(reln->smgr_rnode, seg->mdfd_segno);
1233         }
1234         else
1235         {
1236                 if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
1237                         return;                         /* passed it off successfully */
1238
1239                 if (FileSync(seg->mdfd_vfd) < 0)
1240                         ereport(ERROR,
1241                                         (errcode_for_file_access(),
1242                                 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
1243                                            seg->mdfd_segno,
1244                                            reln->smgr_rnode.spcNode,
1245                                            reln->smgr_rnode.dbNode,
1246                                            reln->smgr_rnode.relNode)));
1247         }
1248 }
1249
1250 /*
1251  * register_unlink() -- Schedule a file to be deleted after next checkpoint
1252  *
1253  * As with register_dirty_segment, this could involve either a local or
1254  * a remote pending-ops table.
1255  */
1256 static void
1257 register_unlink(RelFileNode rnode)
1258 {
1259         if (pendingOpsTable)
1260         {
1261                 /* push it into local pending-ops table */
1262                 RememberFsyncRequest(rnode, UNLINK_RELATION_REQUEST);
1263         }
1264         else
1265         {
1266                 /*
1267                  * Notify the bgwriter about it.  If we fail to queue the request
1268                  * message, we have to sleep and try again, because we can't simply
1269                  * delete the file now.  Ugly, but hopefully won't happen often.
1270                  *
1271                  * XXX should we just leave the file orphaned instead?
1272                  */
1273                 Assert(IsUnderPostmaster);
1274                 while (!ForwardFsyncRequest(rnode, UNLINK_RELATION_REQUEST))
1275                         pg_usleep(10000L);      /* 10 msec seems a good number */
1276         }
1277 }
1278
1279 /*
1280  * RememberFsyncRequest() -- callback from bgwriter side of fsync request
1281  *
1282  * We stuff most fsync requests into the local hash table for execution
1283  * during the bgwriter's next checkpoint.  UNLINK requests go into a
1284  * separate linked list, however, because they get processed separately.
1285  *
1286  * The range of possible segment numbers is way less than the range of
1287  * BlockNumber, so we can reserve high values of segno for special purposes.
1288  * We define three:
1289  * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation
1290  * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
1291  * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
1292  *       checkpoint.
1293  *
1294  * (Handling the FORGET_* requests is a tad slow because the hash table has
1295  * to be searched linearly, but it doesn't seem worth rethinking the table
1296  * structure for them.)
1297  */
1298 void
1299 RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
1300 {
1301         Assert(pendingOpsTable);
1302
1303         if (segno == FORGET_RELATION_FSYNC)
1304         {
1305                 /* Remove any pending requests for the entire relation */
1306                 HASH_SEQ_STATUS hstat;
1307                 PendingOperationEntry *entry;
1308
1309                 hash_seq_init(&hstat, pendingOpsTable);
1310                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1311                 {
1312                         if (RelFileNodeEquals(entry->tag.rnode, rnode))
1313                         {
1314                                 /* Okay, cancel this entry */
1315                                 entry->canceled = true;
1316                         }
1317                 }
1318         }
1319         else if (segno == FORGET_DATABASE_FSYNC)
1320         {
1321                 /* Remove any pending requests for the entire database */
1322                 HASH_SEQ_STATUS hstat;
1323                 PendingOperationEntry *entry;
1324
1325                 hash_seq_init(&hstat, pendingOpsTable);
1326                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1327                 {
1328                         if (entry->tag.rnode.dbNode == rnode.dbNode)
1329                         {
1330                                 /* Okay, cancel this entry */
1331                                 entry->canceled = true;
1332                         }
1333                 }
1334         }
1335         else if (segno == UNLINK_RELATION_REQUEST)
1336         {
1337                 /* Unlink request: put it in the linked list */
1338                 MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
1339                 PendingUnlinkEntry *entry;
1340
1341                 entry = palloc(sizeof(PendingUnlinkEntry));
1342                 entry->rnode = rnode;
1343                 entry->cycle_ctr = mdckpt_cycle_ctr;
1344
1345                 pendingUnlinks = lappend(pendingUnlinks, entry);
1346
1347                 MemoryContextSwitchTo(oldcxt);
1348         }
1349         else
1350         {
1351                 /* Normal case: enter a request to fsync this segment */
1352                 PendingOperationTag key;
1353                 PendingOperationEntry *entry;
1354                 bool            found;
1355
1356                 /* ensure any pad bytes in the hash key are zeroed */
1357                 MemSet(&key, 0, sizeof(key));
1358                 key.rnode = rnode;
1359                 key.segno = segno;
1360
1361                 entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
1362                                                                                                           &key,
1363                                                                                                           HASH_ENTER,
1364                                                                                                           &found);
1365                 /* if new or previously canceled entry, initialize it */
1366                 if (!found || entry->canceled)
1367                 {
1368                         entry->canceled = false;
1369                         entry->cycle_ctr = mdsync_cycle_ctr;
1370                 }
1371
1372                 /*
1373                  * NB: it's intentional that we don't change cycle_ctr if the entry
1374                  * already exists.      The fsync request must be treated as old, even
1375                  * though the new request will be satisfied too by any subsequent
1376                  * fsync.
1377                  *
1378                  * However, if the entry is present but is marked canceled, we should
1379                  * act just as though it wasn't there.  The only case where this could
1380                  * happen would be if a file had been deleted, we received but did not
1381                  * yet act on the cancel request, and the same relfilenode was then
1382                  * assigned to a new file.      We mustn't lose the new request, but it
1383                  * should be considered new not old.
1384                  */
1385         }
1386 }
1387
1388 /*
1389  * ForgetRelationFsyncRequests -- ensure any fsyncs for a rel are forgotten
1390  */
1391 void
1392 ForgetRelationFsyncRequests(RelFileNode rnode)
1393 {
1394         if (pendingOpsTable)
1395         {
1396                 /* standalone backend or startup process: fsync state is local */
1397                 RememberFsyncRequest(rnode, FORGET_RELATION_FSYNC);
1398         }
1399         else if (IsUnderPostmaster)
1400         {
1401                 /*
1402                  * Notify the bgwriter about it.  If we fail to queue the revoke
1403                  * message, we have to sleep and try again ... ugly, but hopefully
1404                  * won't happen often.
1405                  *
1406                  * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
1407                  * error would leave the no-longer-used file still present on disk,
1408                  * which would be bad, so I'm inclined to assume that the bgwriter
1409                  * will always empty the queue soon.
1410                  */
1411                 while (!ForwardFsyncRequest(rnode, FORGET_RELATION_FSYNC))
1412                         pg_usleep(10000L);      /* 10 msec seems a good number */
1413
1414                 /*
1415                  * Note we don't wait for the bgwriter to actually absorb the revoke
1416                  * message; see mdsync() for the implications.
1417                  */
1418         }
1419 }
1420
1421 /*
1422  * ForgetDatabaseFsyncRequests -- ensure any fsyncs for a DB are forgotten
1423  */
1424 void
1425 ForgetDatabaseFsyncRequests(Oid dbid)
1426 {
1427         RelFileNode rnode;
1428
1429         rnode.dbNode = dbid;
1430         rnode.spcNode = 0;
1431         rnode.relNode = 0;
1432
1433         if (pendingOpsTable)
1434         {
1435                 /* standalone backend or startup process: fsync state is local */
1436                 RememberFsyncRequest(rnode, FORGET_DATABASE_FSYNC);
1437         }
1438         else if (IsUnderPostmaster)
1439         {
1440                 /* see notes in ForgetRelationFsyncRequests */
1441                 while (!ForwardFsyncRequest(rnode, FORGET_DATABASE_FSYNC))
1442                         pg_usleep(10000L);      /* 10 msec seems a good number */
1443         }
1444 }
1445
1446
1447 /*
1448  *      _fdvec_alloc() -- Make a MdfdVec object.
1449  */
1450 static MdfdVec *
1451 _fdvec_alloc(void)
1452 {
1453         return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
1454 }
1455
1456 #ifndef LET_OS_MANAGE_FILESIZE
1457
1458 /*
1459  * Open the specified segment of the relation,
1460  * and make a MdfdVec object for it.  Returns NULL on failure.
1461  */
1462 static MdfdVec *
1463 _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
1464 {
1465         MdfdVec    *v;
1466         int                     fd;
1467         char       *path,
1468                            *fullpath;
1469
1470         path = relpath(reln->smgr_rnode);
1471
1472         if (segno > 0)
1473         {
1474                 /* be sure we have enough space for the '.segno' */
1475                 fullpath = (char *) palloc(strlen(path) + 12);
1476                 sprintf(fullpath, "%s.%u", path, segno);
1477                 pfree(path);
1478         }
1479         else
1480                 fullpath = path;
1481
1482         /* open the file */
1483         fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);
1484
1485         pfree(fullpath);
1486
1487         if (fd < 0)
1488                 return NULL;
1489
1490         /* allocate an mdfdvec entry for it */
1491         v = _fdvec_alloc();
1492
1493         /* fill the entry */
1494         v->mdfd_vfd = fd;
1495         v->mdfd_segno = segno;
1496         v->mdfd_chain = NULL;
1497         Assert(_mdnblocks(reln, v) <= ((BlockNumber) RELSEG_SIZE));
1498
1499         /* all done */
1500         return v;
1501 }
1502 #endif   /* LET_OS_MANAGE_FILESIZE */
1503
1504 /*
1505  *      _mdfd_getseg() -- Find the segment of the relation holding the
1506  *              specified block.
1507  *
1508  * If the segment doesn't exist, we ereport, return NULL, or create the
1509  * segment, according to "behavior".  Note: isTemp need only be correct
1510  * in the EXTENSION_CREATE case.
1511  */
1512 static MdfdVec *
1513 _mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool isTemp,
1514                          ExtensionBehavior behavior)
1515 {
1516         MdfdVec    *v = mdopen(reln, behavior);
1517
1518 #ifndef LET_OS_MANAGE_FILESIZE
1519         BlockNumber targetseg;
1520         BlockNumber nextsegno;
1521
1522         if (!v)
1523                 return NULL;                    /* only possible if EXTENSION_RETURN_NULL */
1524
1525         targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1526         for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
1527         {
1528                 Assert(nextsegno == v->mdfd_segno + 1);
1529
1530                 if (v->mdfd_chain == NULL)
1531                 {
1532                         /*
1533                          * Normally we will create new segments only if authorized by the
1534                          * caller (i.e., we are doing mdextend()).      But when doing WAL
1535                          * recovery, create segments anyway; this allows cases such as
1536                          * replaying WAL data that has a write into a high-numbered
1537                          * segment of a relation that was later deleted.  We want to go
1538                          * ahead and create the segments so we can finish out the replay.
1539                          *
1540                          * We have to maintain the invariant that segments before the last
1541                          * active segment are of size RELSEG_SIZE; therefore, pad them out
1542                          * with zeroes if needed.  (This only matters if caller is
1543                          * extending the relation discontiguously, but that can happen in
1544                          * hash indexes.)
1545                          */
1546                         if (behavior == EXTENSION_CREATE || InRecovery)
1547                         {
1548                                 if (_mdnblocks(reln, v) < RELSEG_SIZE)
1549                                 {
1550                                         char       *zerobuf = palloc0(BLCKSZ);
1551
1552                                         mdextend(reln, nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
1553                                                          zerobuf, isTemp);
1554                                         pfree(zerobuf);
1555                                 }
1556                                 v->mdfd_chain = _mdfd_openseg(reln, nextsegno, O_CREAT);
1557                         }
1558                         else
1559                         {
1560                                 /* We won't create segment if not existent */
1561                                 v->mdfd_chain = _mdfd_openseg(reln, nextsegno, 0);
1562                         }
1563                         if (v->mdfd_chain == NULL)
1564                         {
1565                                 if (behavior == EXTENSION_RETURN_NULL &&
1566                                         FILE_POSSIBLY_DELETED(errno))
1567                                         return NULL;
1568                                 ereport(ERROR,
1569                                                 (errcode_for_file_access(),
1570                                                  errmsg("could not open segment %u of relation %u/%u/%u (target block %u): %m",
1571                                                                 nextsegno,
1572                                                                 reln->smgr_rnode.spcNode,
1573                                                                 reln->smgr_rnode.dbNode,
1574                                                                 reln->smgr_rnode.relNode,
1575                                                                 blkno)));
1576                         }
1577                 }
1578                 v = v->mdfd_chain;
1579         }
1580 #endif
1581
1582         return v;
1583 }
1584
1585 /*
1586  * Get number of blocks present in a single disk file
1587  */
1588 static BlockNumber
1589 _mdnblocks(SMgrRelation reln, MdfdVec *seg)
1590 {
1591         long            len;
1592
1593         len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
1594         if (len < 0)
1595                 ereport(ERROR,
1596                                 (errcode_for_file_access(),
1597                 errmsg("could not seek to end of segment %u of relation %u/%u/%u: %m",
1598                            seg->mdfd_segno,
1599                            reln->smgr_rnode.spcNode,
1600                            reln->smgr_rnode.dbNode,
1601                            reln->smgr_rnode.relNode)));
1602         /* note that this calculation will ignore any partial block at EOF */
1603         return (BlockNumber) (len / BLCKSZ);
1604 }