1 /*-------------------------------------------------------------------------
2  *
3  * md.c
4  *        This code manages relations that reside on magnetic disk.
5  *
6  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *        $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.143 2009/01/01 17:23:48 momjian Exp $
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16
17 #include <unistd.h>
18 #include <fcntl.h>
19 #include <sys/file.h>
20
21 #include "catalog/catalog.h"
22 #include "miscadmin.h"
23 #include "postmaster/bgwriter.h"
24 #include "storage/fd.h"
25 #include "storage/bufmgr.h"
26 #include "storage/relfilenode.h"
27 #include "storage/smgr.h"
28 #include "utils/hsearch.h"
29 #include "utils/memutils.h"
30 #include "pg_trace.h"
31
32
33 /* interval for calling AbsorbFsyncRequests in mdsync */
34 #define FSYNCS_PER_ABSORB               10
35
36 /* special values for the segno arg to RememberFsyncRequest */
37 #define FORGET_RELATION_FSYNC   (InvalidBlockNumber)
38 #define FORGET_DATABASE_FSYNC   (InvalidBlockNumber-1)
39 #define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
40
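/*
 * Illustrative arithmetic: these reserved values cannot collide with a real
 * segment number.  A relation tops out at InvalidBlockNumber blocks, so with
 * the default 8K BLCKSZ and 1GB segments (RELSEG_SIZE = 131072 blocks) the
 * largest possible segno is 0xFFFFFFFF / 131072 = 32767, far below
 * InvalidBlockNumber-2.  (See RememberFsyncRequest, below.)
 */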
41 /*
42  * On Windows, we have to interpret EACCES as possibly meaning the same as
43  * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
44  * that's what you get.  Ugh.  This code is designed so that we don't
45  * actually believe these cases are okay without further evidence (namely,
46  * a pending fsync request getting revoked ... see mdsync).
47  */
48 #ifndef WIN32
49 #define FILE_POSSIBLY_DELETED(err)      ((err) == ENOENT)
50 #else
51 #define FILE_POSSIBLY_DELETED(err)      ((err) == ENOENT || (err) == EACCES)
52 #endif
53
54 /*
55  *      The magnetic disk storage manager keeps track of open file
56  *      descriptors in its own descriptor pool.  This is done to make it
57  *      easier to support relations that are larger than the operating
58  *      system's file size limit (often 2GBytes).  In order to do that,
59  *      we break relations up into "segment" files that are each shorter than
60  *      the OS file size limit.  The segment size is set by the RELSEG_SIZE
61  *      configuration constant in pg_config.h.
62  *
63  *      On disk, a relation must consist of consecutively numbered segment
64  *      files in the pattern
65  *              -- Zero or more full segments of exactly RELSEG_SIZE blocks each
66  *              -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
67  *              -- Optionally, any number of inactive segments of size 0 blocks.
68  *      The full and partial segments are collectively the "active" segments.
69  *      Inactive segments are those that once contained data but are currently
70  *      not needed because of an mdtruncate() operation.  The reason for leaving
71  *      them present at size zero, rather than unlinking them, is that other
72  *      backends and/or the bgwriter might be holding open file references to
73  *      such segments.  If the relation expands again after mdtruncate(), such
74  *      that a deactivated segment becomes active again, it is important that
75  *      such file references still be valid --- else data might get written
76  *      out to an unlinked old copy of a segment file that will eventually
77  *      disappear.
78  *
79  *      The file descriptor pointer (md_fd field) stored in the SMgrRelation
80  *      cache is, therefore, just the head of a list of MdfdVec objects, one
81  *      per segment.  But note the md_fd pointer can be NULL, indicating
82  *      relation not open.
83  *
84  *      Also note that mdfd_chain == NULL does not necessarily mean the relation
85  *      doesn't have another segment after this one; we may just not have
86  *      opened the next segment yet.  (We could not have "all segments are
87  *      in the chain" as an invariant anyway, since another backend could
88  *      extend the relation when we weren't looking.)  We do not make chain
89  *      entries for inactive segments, however; as soon as we find a partial
90  *      segment, we assume that any subsequent segments are inactive.
91  *
92  *      All MdfdVec objects are palloc'd in the MdCxt memory context.
93  */
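/*
 * Minimal sketch of the segment arithmetic described above (illustrative
 * only; this helper does not exist in md.c).  Segment 0 is the bare relation
 * file and segment N > 0 is named "<path>.N".  The computations mirror the
 * blocknum-to-offset math used by mdread/mdwrite/mdextend below.  E.g., with
 * 1GB segments (BLCKSZ 8192, RELSEG_SIZE 131072), block 200000 falls in
 * segment 1 at byte offset 68928 * BLCKSZ.
 */
#ifdef NOT_USED
static void
md_seg_math_sketch(BlockNumber blocknum, BlockNumber *segno, off_t *seekpos)
{
	/* which segment file the block lives in */
	*segno = blocknum / ((BlockNumber) RELSEG_SIZE);
	/* byte offset of the block within that segment file */
	*seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
}
#endif   /* NOT_USED */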
94
95 typedef struct _MdfdVec
96 {
97         File            mdfd_vfd;               /* fd number in fd.c's pool */
98         BlockNumber mdfd_segno;         /* segment number, from 0 */
99         struct _MdfdVec *mdfd_chain;    /* next segment, or NULL */
100 } MdfdVec;
101
102 static MemoryContext MdCxt;             /* context for all md.c allocations */
103
104
105 /*
106  * In some contexts (currently, standalone backends and the bgwriter process)
107  * we keep track of pending fsync operations: we need to remember all relation
108  * segments that have been written since the last checkpoint, so that we can
109  * fsync them down to disk before completing the next checkpoint.  This hash
110  * table remembers the pending operations.      We use a hash table mostly as
111  * a convenient way of eliminating duplicate requests.
112  *
113  * We use a similar mechanism to remember no-longer-needed files that can
114  * be deleted after the next checkpoint, but we use a linked list instead of
115  * a hash table, because we don't expect there to be any duplicate requests.
116  *
117  * (Regular backends do not track pending operations locally, but forward
118  * them to the bgwriter.)
119  */
120 typedef struct
121 {
122         RelFileNode rnode;                      /* the targeted relation */
123         ForkNumber forknum;
124         BlockNumber segno;                      /* which segment */
125 } PendingOperationTag;
126
127 typedef uint16 CycleCtr;                /* can be any convenient integer size */
128
129 typedef struct
130 {
131         PendingOperationTag tag;        /* hash table key (must be first!) */
132         bool            canceled;               /* T => request canceled, not yet removed */
133         CycleCtr        cycle_ctr;              /* mdsync_cycle_ctr when request was made */
134 } PendingOperationEntry;
135
136 typedef struct
137 {
138         RelFileNode rnode;                      /* the dead relation to delete */
139         CycleCtr        cycle_ctr;              /* mdckpt_cycle_ctr when request was made */
140 } PendingUnlinkEntry;
141
142 static HTAB *pendingOpsTable = NULL;
143 static List *pendingUnlinks = NIL;
144
145 static CycleCtr mdsync_cycle_ctr = 0;
146 static CycleCtr mdckpt_cycle_ctr = 0;
147
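/*
 * Minimal sketch of how an ordinary fsync request could be entered into
 * pendingOpsTable (illustrative only; the real logic lives in
 * RememberFsyncRequest below and may differ in details such as reviving
 * canceled entries).  The point is that the hash key is the full
 * (rnode, forknum, segno) triple, so duplicate requests for the same segment
 * collapse into one entry, and each entry is stamped with the current
 * mdsync_cycle_ctr so mdsync() can tell old requests from new ones.
 */
#ifdef NOT_USED
static void
remember_fsync_sketch(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
{
	PendingOperationTag key;
	PendingOperationEntry *entry;
	bool		found;

	/* zero the tag first so any padding bytes hash consistently */
	MemSet(&key, 0, sizeof(key));
	key.rnode = rnode;
	key.forknum = forknum;
	key.segno = segno;

	entry = (PendingOperationEntry *) hash_search(pendingOpsTable, &key,
												  HASH_ENTER, &found);
	if (!found)
	{
		/* new entry: remember which sync cycle it belongs to */
		entry->canceled = false;
		entry->cycle_ctr = mdsync_cycle_ctr;
	}
}
#endif   /* NOT_USED */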
148
149 typedef enum                                    /* behavior for mdopen & _mdfd_getseg */
150 {
151         EXTENSION_FAIL,                         /* ereport if segment not present */
152         EXTENSION_RETURN_NULL,          /* return NULL if not present */
153         EXTENSION_CREATE                        /* create new segments as needed */
154 } ExtensionBehavior;
155
156 /* local routines */
157 static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum, 
158                                            ExtensionBehavior behavior);
159 static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
160                                                                    MdfdVec *seg);
161 static void register_unlink(RelFileNode rnode);
162 static MdfdVec *_fdvec_alloc(void);
163 static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno,
164                                                           BlockNumber segno, int oflags);
165 static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno,
166                          BlockNumber blkno, bool isTemp, ExtensionBehavior behavior);
167 static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
168                                                           MdfdVec *seg);
169
170
171 /*
172  *      mdinit() -- Initialize private state for magnetic disk storage manager.
173  */
174 void
175 mdinit(void)
176 {
177         MdCxt = AllocSetContextCreate(TopMemoryContext,
178                                                                   "MdSmgr",
179                                                                   ALLOCSET_DEFAULT_MINSIZE,
180                                                                   ALLOCSET_DEFAULT_INITSIZE,
181                                                                   ALLOCSET_DEFAULT_MAXSIZE);
182
183         /*
184          * Create pending-operations hashtable if we need it.  Currently, we need
185          * it if we are standalone (not under a postmaster) OR if we are a
186          * bootstrap-mode subprocess of a postmaster (that is, a startup or
187          * bgwriter process).
188          */
189         if (!IsUnderPostmaster || IsBootstrapProcessingMode())
190         {
191                 HASHCTL         hash_ctl;
192
193                 MemSet(&hash_ctl, 0, sizeof(hash_ctl));
194                 hash_ctl.keysize = sizeof(PendingOperationTag);
195                 hash_ctl.entrysize = sizeof(PendingOperationEntry);
196                 hash_ctl.hash = tag_hash;
197                 hash_ctl.hcxt = MdCxt;
198                 pendingOpsTable = hash_create("Pending Ops Table",
199                                                                           100L,
200                                                                           &hash_ctl,
201                                                                    HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
202                 pendingUnlinks = NIL;
203         }
204 }
205
206 /*
207  *  mdexists() -- Does the physical file exist?
208  *
209  * Note: this will return true for lingering files, with pending deletions
210  */
211 bool
212 mdexists(SMgrRelation reln, ForkNumber forkNum)
213 {
214         /*
215          * Close it first, to ensure that we notice if the fork has been
216          * unlinked since we opened it.
217          */
218         mdclose(reln, forkNum);
219
220         return (mdopen(reln, forkNum, EXTENSION_RETURN_NULL) != NULL);
221 }
222
223 /*
224  *      mdcreate() -- Create a new relation on magnetic disk.
225  *
226  * If isRedo is true, it's okay for the relation to exist already.
227  */
228 void
229 mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
230 {
231         char       *path;
232         File            fd;
233
234         if (isRedo && reln->md_fd[forkNum] != NULL)
235                 return;                                 /* created and opened already... */
236
237         Assert(reln->md_fd[forkNum] == NULL);
238
239         path = relpath(reln->smgr_rnode, forkNum);
240
241         fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
242
243         if (fd < 0)
244         {
245                 int                     save_errno = errno;
246
247                 /*
248                  * During bootstrap, there are cases where a system relation will be
249                  * accessed (by internal backend processes) before the bootstrap
250                  * script nominally creates it.  Therefore, allow the file to exist
251                  * already, even if isRedo is not set.  (See also mdopen)
252                  */
253                 if (isRedo || IsBootstrapProcessingMode())
254                         fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
255                 if (fd < 0)
256                 {
257                         /* be sure to report the error reported by create, not open */
258                         errno = save_errno;
259                         ereport(ERROR,
260                                         (errcode_for_file_access(),
261                                          errmsg("could not create relation %s: %m", path)));
262                 }
263         }
264
265         pfree(path);
266
267         reln->md_fd[forkNum] = _fdvec_alloc();
268
269         reln->md_fd[forkNum]->mdfd_vfd = fd;
270         reln->md_fd[forkNum]->mdfd_segno = 0;
271         reln->md_fd[forkNum]->mdfd_chain = NULL;
272 }
273
274 /*
275  *      mdunlink() -- Unlink a relation.
276  *
277  * Note that we're passed a RelFileNode --- by the time this is called,
278  * there won't be an SMgrRelation hashtable entry anymore.
279  *
280  * Actually, we don't unlink the first segment file of the relation, but
281  * just truncate it to zero length, and record a request to unlink it after
282  * the next checkpoint.  Additional segments can be unlinked immediately,
283  * however.  Leaving the empty file in place prevents that relfilenode
284  * number from being reused.  The scenario this protects us from is:
285  * 1. We delete a relation (and commit, and actually remove its file).
286  * 2. We create a new relation, which by chance gets the same relfilenode as
287  *        the just-deleted one (OIDs must've wrapped around for that to happen).
288  * 3. We crash before another checkpoint occurs.
289  * During replay, we would delete the file and then recreate it, which is fine
290  * if the contents of the file were repopulated by subsequent WAL entries.
291  * But if we didn't WAL-log insertions, but instead relied on fsyncing the
292  * file after populating it (as for instance CLUSTER and CREATE INDEX do),
293  * the contents of the file would be lost forever.      By leaving the empty file
294  * until after the next checkpoint, we prevent reassignment of the relfilenode
295  * number until it's safe, because relfilenode assignment skips over any
296  * existing file.
297  *
298  * If isRedo is true, it's okay for the relation to be already gone.
299  * Also, we should remove the file immediately instead of queuing a request
300  * for later, since during redo there's no possibility of creating a
301  * conflicting relation.
302  *
303  * Note: any failure should be reported as WARNING not ERROR, because
304  * we are usually not in a transaction anymore when this is called.
305  */
306 void
307 mdunlink(RelFileNode rnode, ForkNumber forkNum, bool isRedo)
308 {
309         char       *path;
310         int                     ret;
311
312         /*
313          * We have to clean out any pending fsync requests for the doomed
314          * relation, else the next mdsync() will fail.
315          */
316         ForgetRelationFsyncRequests(rnode, forkNum);
317
318         path = relpath(rnode, forkNum);
319
320         /*
321          * Delete or truncate the first segment.
322          */
323         if (isRedo || forkNum != MAIN_FORKNUM)
324                 ret = unlink(path);
325         else
326         {
327                 /* truncate(2) would be easier here, but Windows hasn't got it */
328                 int                     fd;
329
330                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);
331                 if (fd >= 0)
332                 {
333                         int                     save_errno;
334
335                         ret = ftruncate(fd, 0);
336                         save_errno = errno;
337                         close(fd);
338                         errno = save_errno;
339                 }
340                 else
341                         ret = -1;
342         }
343         if (ret < 0)
344         {
345                 if (!isRedo || errno != ENOENT)
346                         ereport(WARNING,
347                                         (errcode_for_file_access(),
348                                          errmsg("could not remove relation %s: %m", path)));
349         }
350
351         /*
352          * Delete any additional segments.
353          */
354         else
355         {
356                 char       *segpath = (char *) palloc(strlen(path) + 12);
357                 BlockNumber segno;
358
359                 /*
360                  * Note that because we loop until getting ENOENT, we will correctly
361                  * remove all inactive segments as well as active ones.
362                  */
363                 for (segno = 1;; segno++)
364                 {
365                         sprintf(segpath, "%s.%u", path, segno);
366                         if (unlink(segpath) < 0)
367                         {
368                                 /* ENOENT is expected after the last segment... */
369                                 if (errno != ENOENT)
370                                         ereport(WARNING,
371                                                         (errcode_for_file_access(),
372                                                          errmsg("could not remove segment %u of relation %s: %m",
373                                                                         segno, path)));
374                                 break;
375                         }
376                 }
377                 pfree(segpath);
378         }
379
380         pfree(path);
381
382         /* Register request to unlink first segment later */
383         if (!isRedo && forkNum == MAIN_FORKNUM)
384                 register_unlink(rnode);
385 }
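/*
 * Concrete example of the scheme above (illustrative; file names assume the
 * usual relpath layout, e.g. "base/<dboid>/<relfilenode>"): dropping a
 * three-segment relation 16384 unlinks "16384.1" and "16384.2" right away,
 * truncates "16384" to zero length, and queues an unlink request so that
 * "16384" itself disappears only after the next checkpoint, keeping that
 * relfilenode number from being reassigned in the meantime.
 */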
386
387 /*
388  *      mdextend() -- Add a block to the specified relation.
389  *
390  *              The semantics are nearly the same as mdwrite(): write at the
391  *              specified position.  However, this is to be used for the case of
392  *              extending a relation (i.e., blocknum is at or beyond the current
393  *              EOF).  Note that we assume writing a block beyond current EOF
394  *              causes intervening file space to become filled with zeroes.
395  */
396 void
397 mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
398                  char *buffer, bool isTemp)
399 {
400         off_t           seekpos;
401         int                     nbytes;
402         MdfdVec    *v;
403
404         /* This assert is too expensive to have on normally ... */
405 #ifdef CHECK_WRITE_VS_EXTEND
406         Assert(blocknum >= mdnblocks(reln, forknum));
407 #endif
408
409         /*
410          * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
411          * more --- we mustn't create a block whose number actually is
412          * InvalidBlockNumber.
413          */
414         if (blocknum == InvalidBlockNumber)
415                 ereport(ERROR,
416                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
417                                  errmsg("cannot extend relation %s beyond %u blocks",
418                                                 relpath(reln->smgr_rnode, forknum),
419                                                 InvalidBlockNumber)));
420
421         v = _mdfd_getseg(reln, forknum, blocknum, isTemp, EXTENSION_CREATE);
422
423         seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
424         Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
425
426         /*
427          * Note: because caller usually obtained blocknum by calling mdnblocks,
428          * which did a seek(SEEK_END), this seek is often redundant and will be
429          * optimized away by fd.c.      It's not redundant, however, if there is a
430          * partial page at the end of the file. In that case we want to try to
431          * overwrite the partial page with a full page.  It's also not redundant
432          * if bufmgr.c had to dump another buffer of the same file to make room
433          * for the new page's buffer.
434          */
435         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
436                 ereport(ERROR,
437                                 (errcode_for_file_access(),
438                                  errmsg("could not seek to block %u of relation %s: %m",
439                                                 blocknum,
440                                                 relpath(reln->smgr_rnode, forknum))));
441
442         if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
443         {
444                 if (nbytes < 0)
445                         ereport(ERROR,
446                                         (errcode_for_file_access(),
447                                          errmsg("could not extend relation %s: %m",
448                                                         relpath(reln->smgr_rnode, forknum)),
449                                          errhint("Check free disk space.")));
450                 /* short write: complain appropriately */
451                 ereport(ERROR,
452                                 (errcode(ERRCODE_DISK_FULL),
453                                  errmsg("could not extend relation %s: wrote only %d of %d bytes at block %u",
454                                                 relpath(reln->smgr_rnode, forknum),
455                                                 nbytes, BLCKSZ, blocknum),
456                                  errhint("Check free disk space.")));
457         }
458
459         if (!isTemp)
460                 register_dirty_segment(reln, forknum, v);
461
462         Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
463 }
464
465 /*
466  *      mdopen() -- Open the specified relation.
467  *
468  * Note we only open the first segment, when there are multiple segments.
469  *
470  * If first segment is not present, either ereport or return NULL according
471  * to "behavior".  We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
472  * EXTENSION_CREATE means it's OK to extend an existing relation, not to
473  * invent one out of whole cloth.
474  */
475 static MdfdVec *
476 mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior)
477 {
478         MdfdVec    *mdfd;
479         char       *path;
480         File            fd;
481
482         /* No work if already open */
483         if (reln->md_fd[forknum])
484                 return reln->md_fd[forknum];
485
486         path = relpath(reln->smgr_rnode, forknum);
487
488         fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
489
490         if (fd < 0)
491         {
492                 /*
493                  * During bootstrap, there are cases where a system relation will be
494                  * accessed (by internal backend processes) before the bootstrap
495                  * script nominally creates it.  Therefore, accept mdopen() as a
496                  * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
497                  */
498                 if (IsBootstrapProcessingMode())
499                         fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
500                 if (fd < 0)
501                 {
502                         if (behavior == EXTENSION_RETURN_NULL &&
503                                 FILE_POSSIBLY_DELETED(errno))
504                         {
505                                 pfree(path);
506                                 return NULL;
507                         }
508                         ereport(ERROR,
509                                         (errcode_for_file_access(),
510                                          errmsg("could not open relation %s: %m", path)));
511                 }
512         }
513
514         pfree(path);
515
516         reln->md_fd[forknum] = mdfd = _fdvec_alloc();
517
518         mdfd->mdfd_vfd = fd;
519         mdfd->mdfd_segno = 0;
520         mdfd->mdfd_chain = NULL;
521         Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
522
523         return mdfd;
524 }
525
526 /*
527  *      mdclose() -- Close the specified relation, if it isn't closed already.
528  */
529 void
530 mdclose(SMgrRelation reln, ForkNumber forknum)
531 {
532         MdfdVec    *v = reln->md_fd[forknum];
533
534         /* No work if already closed */
535         if (v == NULL)
536                 return;
537
538         reln->md_fd[forknum] = NULL;                    /* prevent dangling pointer after error */
539
540         while (v != NULL)
541         {
542                 MdfdVec    *ov = v;
543
544                 /* if not closed already */
545                 if (v->mdfd_vfd >= 0)
546                         FileClose(v->mdfd_vfd);
547                 /* Now free vector */
548                 v = v->mdfd_chain;
549                 pfree(ov);
550         }
551 }
552
553 /*
554  *      mdread() -- Read the specified block from a relation.
555  */
556 void
557 mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
558            char *buffer)
559 {
560         off_t           seekpos;
561         int                     nbytes;
562         MdfdVec    *v;
563
564         TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum, reln->smgr_rnode.spcNode, reln->smgr_rnode.dbNode, reln->smgr_rnode.relNode);
565
566         v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);
567
568         seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
569         Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
570
571         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
572                 ereport(ERROR,
573                                 (errcode_for_file_access(),
574                                  errmsg("could not seek to block %u of relation %s: %m",
575                                                 blocknum, relpath(reln->smgr_rnode, forknum))));
576
577         nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ);
578
579         TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum, reln->smgr_rnode.spcNode, reln->smgr_rnode.dbNode, reln->smgr_rnode.relNode, relpath(reln->smgr_rnode, forknum), nbytes, BLCKSZ);
580
581         if (nbytes != BLCKSZ)
582         {
583                 if (nbytes < 0)
584                         ereport(ERROR,
585                                         (errcode_for_file_access(),
586                                    errmsg("could not read block %u of relation %s: %m",
587                                                   blocknum, relpath(reln->smgr_rnode, forknum))));
588
589                 /*
590                  * Short read: we are at or past EOF, or we read a partial block at
591                  * EOF.  Normally this is an error; upper levels should never try to
592                  * read a nonexistent block.  However, if zero_damaged_pages is ON or
593                  * we are InRecovery, we should instead return zeroes without
594                  * complaining.  This allows, for example, the case of trying to
595                  * update a block that was later truncated away.
596                  */
597                 if (zero_damaged_pages || InRecovery)
598                         MemSet(buffer, 0, BLCKSZ);
599                 else
600                         ereport(ERROR,
601                                         (errcode(ERRCODE_DATA_CORRUPTED),
602                                          errmsg("could not read block %u of relation %s: read only %d of %d bytes",
603                                                         blocknum, relpath(reln->smgr_rnode, forknum),
604                                                         nbytes, BLCKSZ)));
605         }
606 }
607
608 /*
609  *      mdwrite() -- Write the supplied block at the appropriate location.
610  *
611  *              This is to be used only for updating already-existing blocks of a
612  *              relation (ie, those before the current EOF).  To extend a relation,
613  *              use mdextend().
614  */
615 void
616 mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
617                 char *buffer, bool isTemp)
618 {
619         off_t           seekpos;
620         int                     nbytes;
621         MdfdVec    *v;
622
623         /* This assert is too expensive to have on normally ... */
624 #ifdef CHECK_WRITE_VS_EXTEND
625         Assert(blocknum < mdnblocks(reln, forknum));
626 #endif
627
628         TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum, reln->smgr_rnode.spcNode, reln->smgr_rnode.dbNode, reln->smgr_rnode.relNode);
629
630         v = _mdfd_getseg(reln, forknum, blocknum, isTemp, EXTENSION_FAIL);
631
632         seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
633         Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
634
635         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
636                 ereport(ERROR,
637                                 (errcode_for_file_access(),
638                                  errmsg("could not seek to block %u of relation %s: %m",
639                                                 blocknum, relpath(reln->smgr_rnode, forknum))));
640
641         nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ);
642
643         TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum, reln->smgr_rnode.spcNode, reln->smgr_rnode.dbNode, reln->smgr_rnode.relNode, relpath(reln->smgr_rnode, forknum), nbytes, BLCKSZ);
644
645         if (nbytes != BLCKSZ)
646         {
647                 if (nbytes < 0)
648                         ereport(ERROR,
649                                         (errcode_for_file_access(),
650                                   errmsg("could not write block %u of relation %s: %m",
651                                                  blocknum, relpath(reln->smgr_rnode, forknum))));
652                 /* short write: complain appropriately */
653                 ereport(ERROR,
654                                 (errcode(ERRCODE_DISK_FULL),
655                                  errmsg("could not write block %u of relation %s: wrote only %d of %d bytes",
656                                                 blocknum,
657                                                 relpath(reln->smgr_rnode, forknum),
658                                                 nbytes, BLCKSZ),
659                                  errhint("Check free disk space.")));
660         }
661
662         if (!isTemp)
663                 register_dirty_segment(reln, forknum, v);
664 }
665
666 /*
667  *      mdnblocks() -- Get the number of blocks stored in a relation.
668  *
669  *              Important side effect: all active segments of the relation are opened
670  *              and added to the mdfd_chain list.  If this routine has not been
671  *              called, then only segments up to the last one actually touched
672  *              are present in the chain.
673  */
674 BlockNumber
675 mdnblocks(SMgrRelation reln, ForkNumber forknum)
676 {
677         MdfdVec    *v = mdopen(reln, forknum, EXTENSION_FAIL);
678         BlockNumber nblocks;
679         BlockNumber segno = 0;
680
681         /*
682          * Skip through any segments that aren't the last one, to avoid redundant
683          * seeks on them.  We have previously verified that these segments are
684          * exactly RELSEG_SIZE long, and it's useless to recheck that each time.
685          *
686          * NOTE: this assumption could only be wrong if another backend has
687          * truncated the relation.      We rely on higher code levels to handle that
688          * scenario by closing and re-opening the md fd, which is handled via
689          * relcache flush.      (Since the bgwriter doesn't participate in relcache
690          * flush, it could have segment chain entries for inactive segments;
691          * that's OK because the bgwriter never needs to compute relation size.)
692          */
693         while (v->mdfd_chain != NULL)
694         {
695                 segno++;
696                 v = v->mdfd_chain;
697         }
698
699         for (;;)
700         {
701                 nblocks = _mdnblocks(reln, forknum, v);
702                 if (nblocks > ((BlockNumber) RELSEG_SIZE))
703                         elog(FATAL, "segment too big");
704                 if (nblocks < ((BlockNumber) RELSEG_SIZE))
705                         return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
706
707                 /*
708                  * If segment is exactly RELSEG_SIZE, advance to next one.
709                  */
710                 segno++;
711
712                 if (v->mdfd_chain == NULL)
713                 {
714                         /*
715                          * Because we pass O_CREAT, we will create the next segment (with
716                          * zero length) immediately, if the last segment is of length
717                          * RELSEG_SIZE.  While perhaps not strictly necessary, this keeps
718                          * the logic simple.
719                          */
720                         v->mdfd_chain = _mdfd_openseg(reln, forknum, segno, O_CREAT);
721                         if (v->mdfd_chain == NULL)
722                                 ereport(ERROR,
723                                                 (errcode_for_file_access(),
724                                  errmsg("could not open segment %u of relation %s: %m",
725                                                 segno,
726                                                 relpath(reln->smgr_rnode, forknum))));
727                 }
728
729                 v = v->mdfd_chain;
730         }
731 }
732
733 /*
734  *      mdtruncate() -- Truncate relation to specified number of blocks.
735  */
736 void
737 mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks,
738                    bool isTemp)
739 {
740         MdfdVec    *v;
741         BlockNumber curnblk;
742         BlockNumber priorblocks;
743
744         /*
745          * NOTE: mdnblocks makes sure we have opened all active segments, so that
746          * truncation loop will get them all!
747          */
748         curnblk = mdnblocks(reln, forknum);
749         if (nblocks > curnblk)
750         {
751                 /* Bogus request ... but no complaint if InRecovery */
752                 if (InRecovery)
753                         return;
754                 ereport(ERROR,
755                                 (errmsg("could not truncate relation %s to %u blocks: it's only %u blocks now",
756                                                 relpath(reln->smgr_rnode, forknum),
757                                                 nblocks, curnblk)));
758         }
759         if (nblocks == curnblk)
760                 return;                                 /* no work */
761
762         v = mdopen(reln, forknum, EXTENSION_FAIL);
763
764         priorblocks = 0;
765         while (v != NULL)
766         {
767                 MdfdVec    *ov = v;
768
769                 if (priorblocks > nblocks)
770                 {
771                         /*
772                          * This segment is no longer active (and has already been unlinked
773                          * from the mdfd_chain). We truncate the file, but do not delete
774                          * it, for reasons explained in the header comments.
775                          */
776                         if (FileTruncate(v->mdfd_vfd, 0) < 0)
777                                 ereport(ERROR,
778                                                 (errcode_for_file_access(),
779                                                  errmsg("could not truncate relation %s to %u blocks: %m",
780                                                                 relpath(reln->smgr_rnode, forknum),
781                                                                 nblocks)));
782                         if (!isTemp)
783                                 register_dirty_segment(reln, forknum, v);
784                         v = v->mdfd_chain;
785                         Assert(ov != reln->md_fd[forknum]);     /* we never drop the 1st segment */
786                         pfree(ov);
787                 }
788                 else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
789                 {
790                         /*
791                          * This is the last segment we want to keep. Truncate the file to
792                          * the right length, and clear chain link that points to any
793                          * remaining segments (which we shall zap). NOTE: if nblocks is
794                          * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
795                          * segment to 0 length but keep it. This adheres to the invariant
796                          * given in the header comments.
797                          */
798                         BlockNumber lastsegblocks = nblocks - priorblocks;
799
800                         if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ) < 0)
801                                 ereport(ERROR,
802                                                 (errcode_for_file_access(),
803                                                  errmsg("could not truncate relation %s to %u blocks: %m",
804                                                                 relpath(reln->smgr_rnode, forknum),
805                                                                 nblocks)));
806                         if (!isTemp)
807                                 register_dirty_segment(reln, forknum, v);
808                         v = v->mdfd_chain;
809                         ov->mdfd_chain = NULL;
810                 }
811                 else
812                 {
813                         /*
814                          * We still need this segment and 0 or more blocks beyond it, so
815                          * nothing to do here.
816                          */
817                         v = v->mdfd_chain;
818                 }
819                 priorblocks += RELSEG_SIZE;
820         }
821 }
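/*
 * Worked example (illustrative, assuming 1GB segments, RELSEG_SIZE = 131072):
 * truncating a three-segment relation to nblocks = 200000 leaves segment 0
 * alone (its 131072 blocks are all still wanted), truncates segment 1 to
 * 200000 - 131072 = 68928 blocks, and truncates segment 2 to zero length
 * without unlinking it, per the invariant in the header comments.
 */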
822
823 /*
824  *      mdimmedsync() -- Immediately sync a relation to stable storage.
825  *
826  * Note that only writes already issued are synced; this routine knows
827  * nothing of dirty buffers that may exist inside the buffer manager.
828  */
829 void
830 mdimmedsync(SMgrRelation reln, ForkNumber forknum)
831 {
832         MdfdVec    *v;
833         BlockNumber curnblk;
834
835         /*
836          * NOTE: mdnblocks makes sure we have opened all active segments, so that
837          * fsync loop will get them all!
838          */
839         curnblk = mdnblocks(reln, forknum);
840
841         v = mdopen(reln, forknum, EXTENSION_FAIL);
842
843         while (v != NULL)
844         {
845                 if (FileSync(v->mdfd_vfd) < 0)
846                         ereport(ERROR,
847                                         (errcode_for_file_access(),
848                                          errmsg("could not fsync segment %u of relation %s: %m",
849                                                         v->mdfd_segno,
850                                                         relpath(reln->smgr_rnode, forknum))));
851                 v = v->mdfd_chain;
852         }
853 }
854
855 /*
856  *      mdsync() -- Sync previous writes to stable storage.
857  */
858 void
859 mdsync(void)
860 {
861         static bool mdsync_in_progress = false;
862
863         HASH_SEQ_STATUS hstat;
864         PendingOperationEntry *entry;
865         int                     absorb_counter;
866
867         /*
868          * This is only called during checkpoints, and checkpoints should only
869          * occur in processes that have created a pendingOpsTable.
870          */
871         if (!pendingOpsTable)
872                 elog(ERROR, "cannot sync without a pendingOpsTable");
873
874         /*
875          * If we are in the bgwriter, the sync had better include all fsync
876          * requests that were queued by backends up to this point.      The tightest
877          * race condition that could occur is that a buffer that must be written
878          * and fsync'd for the checkpoint could have been dumped by a backend just
879          * before it was visited by BufferSync().  We know the backend will have
880          * queued an fsync request before clearing the buffer's dirtybit, so we
881          * are safe as long as we do an Absorb after completing BufferSync().
882          */
883         AbsorbFsyncRequests();
884
885         /*
886          * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
887          * checkpoint), we want to ignore fsync requests that are entered into the
888          * hashtable after this point --- they should be processed next time,
889          * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
890          * ones: new ones will have cycle_ctr equal to the incremented value of
891          * mdsync_cycle_ctr.
892          *
893          * In normal circumstances, all entries present in the table at this point
894          * will have cycle_ctr exactly equal to the current (about to be old)
895          * value of mdsync_cycle_ctr.  However, if we fail partway through the
896          * fsync'ing loop, then older values of cycle_ctr might remain when we
897          * come back here to try again.  Repeated checkpoint failures would
898          * eventually wrap the counter around to the point where an old entry
899          * might appear new, causing us to skip it, possibly allowing a checkpoint
900          * to succeed that should not have.  To forestall wraparound, any time the
901          * previous mdsync() failed to complete, run through the table and
902          * forcibly set cycle_ctr = mdsync_cycle_ctr.
903          *
904          * Think not to merge this loop with the main loop, as the problem is
905          * exactly that that loop may fail before having visited all the entries.
906          * From a performance point of view it doesn't matter anyway, as this path
907          * will never be taken in a system that's functioning normally.
908          */
909         if (mdsync_in_progress)
910         {
911                 /* prior try failed, so update any stale cycle_ctr values */
912                 hash_seq_init(&hstat, pendingOpsTable);
913                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
914                 {
915                         entry->cycle_ctr = mdsync_cycle_ctr;
916                 }
917         }
918
919         /* Advance counter so that new hashtable entries are distinguishable */
920         mdsync_cycle_ctr++;
921
922         /* Set flag to detect failure if we don't reach the end of the loop */
923         mdsync_in_progress = true;
924
925         /* Now scan the hashtable for fsync requests to process */
926         absorb_counter = FSYNCS_PER_ABSORB;
927         hash_seq_init(&hstat, pendingOpsTable);
928         while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
929         {
930                 /*
931                  * If the entry is new then don't process it this time.  Note that
932                  * "continue" bypasses the hash-remove call at the bottom of the loop.
933                  */
934                 if (entry->cycle_ctr == mdsync_cycle_ctr)
935                         continue;
936
937                 /* Else assert we haven't missed it */
938                 Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);
939
940                 /*
941                  * If fsync is off then we don't have to bother opening the file at
942                  * all.  (We delay checking until this point so that changing fsync on
943                  * the fly behaves sensibly.)  Also, if the entry is marked canceled,
944                  * fall through to delete it.
945                  */
946                 if (enableFsync && !entry->canceled)
947                 {
948                         int                     failures;
949
950                         /*
951                          * If in bgwriter, we want to absorb pending requests every so
952                          * often to prevent overflow of the fsync request queue.  It is
953                          * unspecified whether newly-added entries will be visited by
954                          * hash_seq_search, but we don't care since we don't need to
955                          * process them anyway.
956                          */
957                         if (--absorb_counter <= 0)
958                         {
959                                 AbsorbFsyncRequests();
960                                 absorb_counter = FSYNCS_PER_ABSORB;
961                         }
962
963                         /*
964                          * The fsync table could contain requests to fsync segments that
965                          * have been deleted (unlinked) by the time we get to them. Rather
966                          * than just hoping an ENOENT (or EACCES on Windows) error can be
967                          * ignored, what we do on error is absorb pending requests and
968                          * then retry.  Since mdunlink() queues a "revoke" message before
969                          * actually unlinking, the fsync request is guaranteed to be
970                          * marked canceled after the absorb if it really was this case.
971                          * DROP DATABASE likewise has to tell us to forget fsync requests
972                          * before it starts deletions.
973                          */
974                         for (failures = 0;; failures++)         /* loop exits at "break" */
975                         {
976                                 SMgrRelation reln;
977                                 MdfdVec    *seg;
978                                 char       *path;
979
980                                 /*
981                                  * Find or create an smgr hash entry for this relation. This
982                                  * may seem a bit unclean -- md calling smgr?  But it's really
983                                  * the best solution.  It ensures that the open file reference
984                                  * isn't permanently leaked if we get an error here. (You may
985                                  * say "but an unreferenced SMgrRelation is still a leak!" Not
986                                  * really, because the only case in which a checkpoint is done
987                                  * by a process that isn't about to shut down is in the
988                                  * bgwriter, and it will periodically do smgrcloseall(). This
989                                  * fact justifies our not closing the reln in the success path
990                                  * either, which is a good thing since in non-bgwriter cases
991                                  * we couldn't safely do that.)  Furthermore, in many cases
992                                  * the relation will have been dirtied through this same smgr
993                                  * relation, and so we can save a file open/close cycle.
994                                  */
995                                 reln = smgropen(entry->tag.rnode);
996
997                                 /*
998                                  * It is possible that the relation has been dropped or
999                                  * truncated since the fsync request was entered.  Therefore,
1000                                  * allow ENOENT, but only if we didn't fail already on this
1001                                  * file.  This applies both during _mdfd_getseg() and during
1002                                  * FileSync, since fd.c might have closed the file behind our
1003                                  * back.
1004                                  */
1005                                 seg = _mdfd_getseg(reln, entry->tag.forknum,
1006                                                           entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
1007                                                                    false, EXTENSION_RETURN_NULL);
1008                                 if (seg != NULL &&
1009                                         FileSync(seg->mdfd_vfd) >= 0)
1010                                         break;          /* success; break out of retry loop */
1011
1012                                 /*
1013                                  * XXX is there any point in allowing more than one retry?
1014                                  * Don't see one at the moment, but easy to change the test
1015                                  * here if so.
1016                                  */
1017                                 path = relpath(entry->tag.rnode, entry->tag.forknum);
1018                                 if (!FILE_POSSIBLY_DELETED(errno) ||
1019                                         failures > 0)
1020                                         ereport(ERROR,
1021                                                         (errcode_for_file_access(),
1022                                                          errmsg("could not fsync segment %u of relation %s: %m",
1023                                                                         entry->tag.segno, path)));
1024                                 else
1025                                         ereport(DEBUG1,
1026                                                         (errcode_for_file_access(),
1027                                                          errmsg("could not fsync segment %u of relation %s but retrying: %m",
1028                                                                         entry->tag.segno, path)));
1029                                 pfree(path);
1030
1031                                 /*
1032                                  * Absorb incoming requests and check to see if canceled.
1033                                  */
1034                                 AbsorbFsyncRequests();
1035                                 absorb_counter = FSYNCS_PER_ABSORB;             /* might as well... */
1036
1037                                 if (entry->canceled)
1038                                         break;
1039                         }                                       /* end retry loop */
1040                 }
1041
1042                 /*
1043                  * If we get here, either we fsync'd successfully, or we don't have to
1044                  * because enableFsync is off, or the entry is (now) marked canceled.
1045                  * Okay to delete it.
1046                  */
1047                 if (hash_search(pendingOpsTable, &entry->tag,
1048                                                 HASH_REMOVE, NULL) == NULL)
1049                         elog(ERROR, "pendingOpsTable corrupted");
1050         }                                                       /* end loop over hashtable entries */
1051
1052         /* Flag successful completion of mdsync */
1053         mdsync_in_progress = false;
1054 }
1055
1056 /*
1057  * mdpreckpt() -- Do pre-checkpoint work
1058  *
1059  * To distinguish unlink requests that arrived before this checkpoint
1060  * started from those that arrived during the checkpoint, we use a cycle
1061  * counter similar to the one we use for fsync requests. That cycle
1062  * counter is incremented here.
1063  *
1064  * This must be called *before* the checkpoint REDO point is determined.
1065  * That ensures that we won't delete files too soon.
1066  *
1067  * Note that we can't do anything here that depends on the assumption
1068  * that the checkpoint will be completed.
1069  */
1070 void
1071 mdpreckpt(void)
1072 {
1073         ListCell   *cell;
1074
1075         /*
1076          * In case the prior checkpoint wasn't completed, stamp all entries in the
1077          * list with the current cycle counter.  Anything that's in the list at
1078          * the start of checkpoint can surely be deleted after the checkpoint is
1079          * finished, regardless of when the request was made.
1080          */
1081         foreach(cell, pendingUnlinks)
1082         {
1083                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
1084
1085                 entry->cycle_ctr = mdckpt_cycle_ctr;
1086         }
1087
1088         /*
1089          * Any unlink requests arriving after this point will be assigned the next
1090          * cycle counter, and won't be unlinked until next checkpoint.
1091          */
1092         mdckpt_cycle_ctr++;
1093 }
1094
1095 /*
1096  * mdpostckpt() -- Do post-checkpoint work
1097  *
1098  * Remove any lingering files that can now be safely removed.
1099  */
1100 void
1101 mdpostckpt(void)
1102 {
1103         while (pendingUnlinks != NIL)
1104         {
1105                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
1106                 char       *path;
1107
1108                 /*
1109                  * New entries are appended to the end, so if the entry is new we've
1110                  * reached the end of old entries.
1111                  */
1112                 if (entry->cycle_ctr == mdckpt_cycle_ctr)
1113                         break;
1114
1115                 /* Else assert we haven't missed it */
1116                 Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr);
1117
1118                 /* Unlink the file */
1119                 path = relpath(entry->rnode, MAIN_FORKNUM);
1120                 if (unlink(path) < 0)
1121                 {
1122                         /*
1123                          * There's a race condition, when the database is dropped at the
1124                          * same time that we process the pending unlink requests. If the
1125                          * DROP DATABASE deletes the file before we do, we will get ENOENT
1126                          * here. rmtree() also has to ignore ENOENT errors, to deal with
1127                          * the possibility that we delete the file first.
1128                          */
1129                         if (errno != ENOENT)
1130                                 ereport(WARNING,
1131                                                 (errcode_for_file_access(),
1132                                                  errmsg("could not remove relation %s: %m", path)));
1133                 }
1134                 pfree(path);
1135
1136                 pendingUnlinks = list_delete_first(pendingUnlinks);
1137                 pfree(entry);
1138         }
1139 }
1140
1141 /*
1142  * register_dirty_segment() -- Mark a relation segment as needing fsync
1143  *
1144  * If there is a local pending-ops table, just make an entry in it for
1145  * mdsync to process later.  Otherwise, try to pass off the fsync request
1146  * to the background writer process.  If that fails, just do the fsync
1147  * locally before returning (we expect this will not happen often enough
1148  * to be a performance problem).
1149  */
1150 static void
1151 register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
1152 {
1153         if (pendingOpsTable)
1154         {
1155                 /* push it into local pending-ops table */
1156                 RememberFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno);
1157         }
1158         else
1159         {
1160                 if (ForwardFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno))
1161                         return;                         /* passed it off successfully */
1162
1163                 if (FileSync(seg->mdfd_vfd) < 0)
1164                         ereport(ERROR,
1165                                         (errcode_for_file_access(),
1166                                          errmsg("could not fsync segment %u of relation %s: %m",
1167                                                         seg->mdfd_segno,
1168                                                         relpath(reln->smgr_rnode, forknum))));
1169         }
1170 }
1171
1172 /*
1173  * register_unlink() -- Schedule a file to be deleted after next checkpoint
1174  *
1175  * As with register_dirty_segment, this could involve either a local or
1176  * a remote pending-ops table.
1177  */
1178 static void
1179 register_unlink(RelFileNode rnode)
1180 {
1181         if (pendingOpsTable)
1182         {
1183                 /* push it into local pending-ops table */
1184                 RememberFsyncRequest(rnode, MAIN_FORKNUM, UNLINK_RELATION_REQUEST);
1185         }
1186         else
1187         {
1188                 /*
1189                  * Notify the bgwriter about it.  If we fail to queue the request
1190                  * message, we have to sleep and try again, because we can't simply
1191                  * delete the file now.  Ugly, but hopefully won't happen often.
1192                  *
1193                  * XXX should we just leave the file orphaned instead?
1194                  */
1195                 Assert(IsUnderPostmaster);
1196                 while (!ForwardFsyncRequest(rnode, MAIN_FORKNUM,
1197                                                                         UNLINK_RELATION_REQUEST))
1198                         pg_usleep(10000L);      /* 10 msec seems a good number */
1199         }
1200 }
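
/*
 * Illustrative sketch, not verbatim from this file: the expected caller is
 * the main-fork unlink path, which (roughly) truncates the first segment
 * file to zero length right away and defers the actual removal:
 *
 *      truncate the first segment file to zero length;
 *      register_unlink(rnode);
 *
 * Keeping the empty file around until after the next checkpoint prevents the
 * relfilenode from being reused too soon, while the truncation still returns
 * the disk space immediately.
 */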
1201
1202 /*
1203  * RememberFsyncRequest() -- callback from bgwriter side of fsync request
1204  *
1205  * We stuff most fsync requests into the local hash table for execution
1206  * during the bgwriter's next checkpoint.  UNLINK requests go into a
1207  * separate linked list, however, because they get processed separately.
1208  *
1209  * The range of possible segment numbers is way less than the range of
1210  * BlockNumber, so we can reserve high values of segno for special purposes.
1211  * We define three:
1212  * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation
1213  * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
1214  * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
1215  *       checkpoint.
1216  *
1217  * (Handling the FORGET_* requests is a tad slow because the hash table has
1218  * to be searched linearly, but it doesn't seem worth rethinking the table
1219  * structure for them.)
1220  */
1221 void
1222 RememberFsyncRequest(RelFileNode rnode, ForkNumber forknum, BlockNumber segno)
1223 {
1224         Assert(pendingOpsTable);
1225
1226         if (segno == FORGET_RELATION_FSYNC)
1227         {
1228                 /* Remove any pending requests for this fork of the relation */
1229                 HASH_SEQ_STATUS hstat;
1230                 PendingOperationEntry *entry;
1231
1232                 hash_seq_init(&hstat, pendingOpsTable);
1233                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1234                 {
1235                         if (RelFileNodeEquals(entry->tag.rnode, rnode) && 
1236                                 entry->tag.forknum == forknum)
1237                         {
1238                                 /* Okay, cancel this entry */
1239                                 entry->canceled = true;
1240                         }
1241                 }
1242         }
1243         else if (segno == FORGET_DATABASE_FSYNC)
1244         {
1245                 /* Remove any pending requests for the entire database */
1246                 HASH_SEQ_STATUS hstat;
1247                 PendingOperationEntry *entry;
1248                 ListCell   *cell, 
1249                                    *prev,
1250                                    *next;
1251
1252                 /* Remove fsync requests */
1253                 hash_seq_init(&hstat, pendingOpsTable);
1254                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1255                 {
1256                         if (entry->tag.rnode.dbNode == rnode.dbNode)
1257                         {
1258                                 /* Okay, cancel this entry */
1259                                 entry->canceled = true;
1260                         }
1261                 }
1262         
1263                 /* Remove unlink requests */
1264                 prev = NULL;
1265                 for (cell = list_head(pendingUnlinks); cell; cell = next)
1266                 {
1267                         PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
1268
1269                         next = lnext(cell);
1270                         if (entry->rnode.dbNode == rnode.dbNode) 
1271                         {
1272                                 pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
1273                                 pfree(entry);
1274                         }
1275                         else
1276                                 prev = cell;
1277                 }
1278         }
1279         else if (segno == UNLINK_RELATION_REQUEST)
1280         {
1281                 /* Unlink request: put it in the linked list */
1282                 MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
1283                 PendingUnlinkEntry *entry;
1284
1285                 entry = palloc(sizeof(PendingUnlinkEntry));
1286                 entry->rnode = rnode;
1287                 entry->cycle_ctr = mdckpt_cycle_ctr;
1288
1289                 pendingUnlinks = lappend(pendingUnlinks, entry);
1290
1291                 MemoryContextSwitchTo(oldcxt);
1292         }
1293         else
1294         {
1295                 /* Normal case: enter a request to fsync this segment */
1296                 PendingOperationTag key;
1297                 PendingOperationEntry *entry;
1298                 bool            found;
1299
1300                 /* ensure any pad bytes in the hash key are zeroed */
1301                 MemSet(&key, 0, sizeof(key));
1302                 key.rnode = rnode;
1303                 key.forknum = forknum;
1304                 key.segno = segno;
1305
1306                 entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
1307                                                                                                           &key,
1308                                                                                                           HASH_ENTER,
1309                                                                                                           &found);
1310                 /* if new or previously canceled entry, initialize it */
1311                 if (!found || entry->canceled)
1312                 {
1313                         entry->canceled = false;
1314                         entry->cycle_ctr = mdsync_cycle_ctr;
1315                 }
1316
1317                 /*
1318                  * NB: it's intentional that we don't change cycle_ctr if the entry
1319                  * already exists.      The fsync request must be treated as old, even
1320                  * though the new request will be satisfied too by any subsequent
1321                  * fsync.
1322                  *
1323                  * However, if the entry is present but is marked canceled, we should
1324                  * act just as though it wasn't there.  The only case where this could
1325                  * happen would be if a file had been deleted, we received but did not
1326                  * yet act on the cancel request, and the same relfilenode was then
1327                  * assigned to a new file.      We mustn't lose the new request, but it
1328                  * should be considered new not old.
1329                  */
1330         }
1331 }
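
/*
 * For orientation only, a rough sketch of the remote side (the real code is
 * AbsorbFsyncRequests() in bgwriter.c and may differ in detail): the bgwriter
 * periodically drains its shared-memory request queue and replays each queued
 * request into the local structures by calling
 *
 *      RememberFsyncRequest(req.rnode, req.forknum, req.segno);
 *
 * so the FORGET_* and UNLINK_RELATION_REQUEST values arrive here through
 * exactly the same path as ordinary segment numbers.
 */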
1332
1333 /*
1334  * ForgetRelationFsyncRequests -- forget any fsyncs for a rel
1335  */
1336 void
1337 ForgetRelationFsyncRequests(RelFileNode rnode, ForkNumber forknum)
1338 {
1339         if (pendingOpsTable)
1340         {
1341                 /* standalone backend or startup process: fsync state is local */
1342                 RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC);
1343         }
1344         else if (IsUnderPostmaster)
1345         {
1346                 /*
1347                  * Notify the bgwriter about it.  If we fail to queue the revoke
1348                  * message, we have to sleep and try again ... ugly, but hopefully
1349                  * won't happen often.
1350                  *
1351                  * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
1352                  * error would leave the no-longer-used file still present on disk,
1353                  * which would be bad, so I'm inclined to assume that the bgwriter
1354                  * will always empty the queue soon.
1355                  */
1356                 while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
1357                         pg_usleep(10000L);      /* 10 msec seems a good number */
1358
1359                 /*
1360                  * Note we don't wait for the bgwriter to actually absorb the revoke
1361                  * message; see mdsync() for the implications.
1362                  */
1363         }
1364 }
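
/*
 * Illustrative caller-side sketch (the real call site is the md-level unlink
 * code earlier in this file; details may differ): before a relation's files
 * are removed, any fsyncs already queued for them must be cancelled, roughly
 *
 *      ForgetRelationFsyncRequests(rnode, forknum);
 *      ... remove the segment files ...
 *
 * otherwise a later mdsync() could be left trying to fsync files that no
 * longer exist.
 */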
1365
1366 /*
1367  * ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB
1368  */
1369 void
1370 ForgetDatabaseFsyncRequests(Oid dbid)
1371 {
1372         RelFileNode rnode;
1373
1374         rnode.dbNode = dbid;
1375         rnode.spcNode = 0;
1376         rnode.relNode = 0;
1377
1378         if (pendingOpsTable)
1379         {
1380                 /* standalone backend or startup process: fsync state is local */
1381                 RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC);
1382         }
1383         else if (IsUnderPostmaster)
1384         {
1385                 /* see notes in ForgetRelationFsyncRequests */
1386                 while (!ForwardFsyncRequest(rnode, InvalidForkNumber,
1387                                                                         FORGET_DATABASE_FSYNC))
1388                         pg_usleep(10000L);      /* 10 msec seems a good number */
1389         }
1390 }
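
/*
 * Orientation sketch only (the real caller is DROP DATABASE processing in
 * dbcommands.c; details may differ): dropping a database does roughly
 *
 *      DropDatabaseBuffers(db_id);
 *      ForgetDatabaseFsyncRequests(db_id);
 *      ... remove the database's directory tree ...
 *
 * Because we do not wait for the bgwriter to absorb the cancel message, there
 * remains a small window in which mdpostckpt() can still see unlink requests
 * for the dropped database; that is the race its ENOENT tolerance (see above)
 * exists to cover.
 */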
1391
1392
1393 /*
1394  *      _fdvec_alloc() -- Make an MdfdVec object.
1395  */
1396 static MdfdVec *
1397 _fdvec_alloc(void)
1398 {
1399         return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
1400 }
1401
1402 /*
1403  * Open the specified segment of the relation,
1404  * and make an MdfdVec object for it.  Returns NULL on failure.
1405  */
1406 static MdfdVec *
1407 _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
1408                           int oflags)
1409 {
1410         MdfdVec    *v;
1411         int                     fd;
1412         char       *path,
1413                            *fullpath;
1414
1415         path = relpath(reln->smgr_rnode, forknum);
1416
1417         if (segno > 0)
1418         {
1419                 /* be sure we have enough space for the '.segno' */
1420                 fullpath = (char *) palloc(strlen(path) + 12);
1421                 sprintf(fullpath, "%s.%u", path, segno);
1422                 pfree(path);
1423         }
1424         else
1425                 fullpath = path;
1426
1427         /* open the file */
1428         fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);
1429
1430         pfree(fullpath);
1431
1432         if (fd < 0)
1433                 return NULL;
1434
1435         /* allocate an mdfdvec entry for it */
1436         v = _fdvec_alloc();
1437
1438         /* fill the entry */
1439         v->mdfd_vfd = fd;
1440         v->mdfd_segno = segno;
1441         v->mdfd_chain = NULL;
1442         Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
1443
1444         /* all done */
1445         return v;
1446 }
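
/*
 * Worked example of the name construction above.  The prefix comes from
 * relpath() and depends on tablespace and fork; the OIDs below are made up:
 *
 *      segno 0:    base/16384/16385        (relpath() result, unmodified)
 *      segno 1:    base/16384/16385.1
 *      segno 2:    base/16384/16385.2
 *
 * The palloc of strlen(path) + 12 bytes leaves room for the '.', up to ten
 * digits of an unsigned 32-bit segment number, and the terminating NUL.
 */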
1447
1448 /*
1449  *      _mdfd_getseg() -- Find the segment of the relation holding the
1450  *              specified block.
1451  *
1452  * If the segment doesn't exist, we ereport, return NULL, or create the
1453  * segment, according to "behavior".  Note: isTemp need only be correct
1454  * in the EXTENSION_CREATE case.
1455  */
1456 static MdfdVec *
1457 _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
1458                          bool isTemp, ExtensionBehavior behavior)
1459 {
1460         MdfdVec    *v = mdopen(reln, forknum, behavior);
1461         BlockNumber targetseg;
1462         BlockNumber nextsegno;
1463
1464         if (!v)
1465                 return NULL;                    /* only possible if EXTENSION_RETURN_NULL */
1466
1467         targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1468         for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
1469         {
1470                 Assert(nextsegno == v->mdfd_segno + 1);
1471
1472                 if (v->mdfd_chain == NULL)
1473                 {
1474                         /*
1475                          * Normally we will create new segments only if authorized by the
1476                          * caller (i.e., we are doing mdextend()).      But when doing WAL
1477                          * recovery, create segments anyway; this allows cases such as
1478                          * replaying WAL data that has a write into a high-numbered
1479                          * segment of a relation that was later deleted.  We want to go
1480                          * ahead and create the segments so we can finish out the replay.
1481                          *
1482                          * We have to maintain the invariant that segments before the last
1483                          * active segment are of size RELSEG_SIZE; therefore, pad them out
1484                          * with zeroes if needed.  (This only matters if caller is
1485                          * extending the relation discontiguously, but that can happen in
1486                          * hash indexes.)
1487                          */
1488                         if (behavior == EXTENSION_CREATE || InRecovery)
1489                         {
1490                                 if (_mdnblocks(reln, forknum, v) < RELSEG_SIZE)
1491                                 {
1492                                         char       *zerobuf = palloc0(BLCKSZ);
1493
1494                                         mdextend(reln, forknum,
1495                                                          nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
1496                                                          zerobuf, isTemp);
1497                                         pfree(zerobuf);
1498                                 }
1499                                 v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, O_CREAT);
1500                         }
1501                         else
1502                         {
1503                                 /* We won't create the segment if it doesn't already exist */
1504                                 v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, 0);
1505                         }
1506                         if (v->mdfd_chain == NULL)
1507                         {
1508                                 if (behavior == EXTENSION_RETURN_NULL &&
1509                                         FILE_POSSIBLY_DELETED(errno))
1510                                         return NULL;
1511                                 ereport(ERROR,
1512                                                 (errcode_for_file_access(),
1513                                                  errmsg("could not open segment %u of relation %s (target block %u): %m",
1514                                                                 nextsegno,
1515                                                                 relpath(reln->smgr_rnode, forknum),
1516                                                                 blkno)));
1517                         }
1518                 }
1519                 v = v->mdfd_chain;
1520         }
1521         return v;
1522 }
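
/*
 * Worked example of the block-to-segment arithmetic, assuming the default
 * build settings of RELSEG_SIZE = 131072 blocks and BLCKSZ = 8192 bytes
 * (1GB segment files); both are configure-time choices, so other builds may
 * differ:
 *
 *      blkno = 300000
 *      targetseg = 300000 / 131072 = 2         (segment file "<relpath>.2")
 *      block within that segment = 300000 % 131072 = 37856
 *      byte offset used by read/write = 37856 * 8192 = 310116352
 *
 * The loop above walks the mdfd_chain from segment 0 up to targetseg,
 * opening (or, when permitted, creating) each segment along the way.
 */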
1523
1524 /*
1525  * Get number of blocks present in a single disk file
1526  */
1527 static BlockNumber
1528 _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
1529 {
1530         off_t           len;
1531
1532         len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
1533         if (len < 0)
1534                 ereport(ERROR,
1535                                 (errcode_for_file_access(),
1536                                  errmsg("could not seek to end of segment %u of relation %s: %m",
1537                                                 seg->mdfd_segno, relpath(reln->smgr_rnode, forknum))));
1538         /* note that this calculation will ignore any partial block at EOF */
1539         return (BlockNumber) (len / BLCKSZ);
1540 }
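
/*
 * Worked example, assuming the default BLCKSZ of 8192: a segment file of
 * 86016 bytes yields 86016 / 8192 = 10 with a 4096-byte remainder, so
 * _mdnblocks() reports 10 blocks; the trailing partial block (typically
 * debris from an interrupted write) is simply not counted.
 */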